You are viewing a plain text version of this content. The canonical link for it is here.
Posted to by on 2017/01/24 08:52:24 UTC

[1/2] incubator-singa git commit: SINGA-295 - Add an example of image classification using GoogleNet

Repository: incubator-singa
Updated Branches:
  refs/heads/master 8101f0066 -> d190fa89a
diff --git a/examples/imagenet/ilsvrc12.h b/examples/imagenet/ilsvrc12.h
deleted file mode 100644
index 74fffbb..0000000
--- a/examples/imagenet/ilsvrc12.h
+++ /dev/null
@@ -1,376 +0,0 @@
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-#include "singa/singa_config.h"
-#ifdef USE_OPENCV
-#include <omp.h>
-#include <cstdint>
-#include <fstream>
-#include <iostream>
-#include <opencv2/highgui/highgui.hpp>
-#include <opencv2/imgproc/imgproc.hpp>
-#include <string>
-#include <thread>
-#include <vector>
-#include "singa/core/tensor.h"
-#include "singa/io/decoder.h"
-#include "singa/io/encoder.h"
-#include "singa/io/reader.h"
-#include "singa/io/transformer.h"
-#include "singa/io/writer.h"
-#include "singa/proto/io.pb.h"
-#include "singa/utils/timer.h"
-using std::string;
-using namespace singa::io;
-namespace singa {
-/// For reading ILSVRC2012 image data as tensors.
-class ILSVRC {
- public:
-  /// Setup encoder, decoder
-  ILSVRC();
-  ~ILSVRC() {
-    if (encoder != nullptr) delete encoder;
-    if (decoder != nullptr) delete decoder;
-    if (transformer != nullptr) delete transformer;
-    if (reader != nullptr) {
-      reader->Close();
-      delete reader;
-    }
-    if (writer != nullptr) {
-      writer->Close();
-      delete writer;
-    }
-  }
-  /// Create binary files for training data
-  /// train_image_list: list file of training images
-  /// train_image_folder: folder where stores original training images
-  /// train_bin_folder: folder to store binary files
-  /// train_file_size: number of images that are contain in one binary file
-  void CreateTrainData(string train_image_list, string train_image_folder,
-                       string train_bin_folder, size_t train_file_size);
-  /// Create binary files for test data
-  /// train_image_list: list file of test images
-  /// train_image_folder: folder where saves original test images
-  /// train_bin_folder: folder to save binary files
-  void CreateTestData(string test_image_list, string test_image_folder,
-                      string test_bin_folder);
-  /// Load data from a binary file,  return <images, labels> pair
-  /// suppose the data will be loaded file by file.
-  /// flag: kTrain or kTest
-  /// file: binary file which stores the images
-  /// read_size: number of images to be loaded
-  /// offset: offset in the file
-  /// n_read: number of images which are read
-  size_t LoadData(int flag, string file, size_t read_size, Tensor *x, Tensor *y,
-                  size_t *n_read, int nthreads);
-  /// A wrapper method to spawn a thread to execute LoadData() method.
-  std::thread AsyncLoadData(int flag, string file, size_t read_size, Tensor *x,
-                            Tensor *y, size_t *n_read, int nthreads);
-  void DecodeTransform(int flag, int thid, int nthreads,
-                       vector<string *> images, Tensor *x, Tensor *y);
-  /// A wrapper method to spawn a thread to execute Decodetransform() method.
-  std::thread AsyncDecodeTransform(int flag, int thid, int nthreads,
-                                   vector<string *> images, Tensor *x,
-                                   Tensor *y);
-  /// Read mean from path
-  void ReadMean(string path);
- protected:
-  /// Read one image at path, resize the image
-  Tensor ReadImage(string path);
-  /// Write buff to the file in kCreate/kAppend mode
-  void Write(string outfile, singa::io::Mode mode);
-  void WriteMean(Tensor &mean, string path);
- private:
-  /// size for resizing
-  const size_t kImageSize = 256;
-  const size_t kImageNBytes = 3 * kImageSize * kImageSize;
-  /// size for cropping
-  const size_t kCropSize = 227;
-  Tensor mean;
-  string last_read_file = "";
-  JPGEncoder *encoder = nullptr;
-  JPGDecoder *decoder = nullptr;
-  ImageTransformer *transformer = nullptr;
-  BinFileReader *reader = nullptr;
-  BinFileWriter *writer = nullptr;
-  EncoderConf en_conf;
-  en_conf.set_image_dim_order("CHW");
-  encoder = new JPGEncoder();
-  encoder->Setup(en_conf);
-  DecoderConf de_conf;
-  de_conf.set_image_dim_order("CHW");
-  decoder = new JPGDecoder();
-  decoder->Setup(de_conf);
-  TransformerConf trans_conf;
-  trans_conf.add_crop_shape(kCropSize);
-  trans_conf.add_crop_shape(kCropSize);
-  trans_conf.set_image_dim_order("CHW");
-  trans_conf.set_horizontal_mirror(true);
-  transformer = new ImageTransformer();
-  transformer->Setup(trans_conf);
-Tensor ILSVRC::ReadImage(string path) {
-  cv::Mat mat = cv::imread(path, CV_LOAD_IMAGE_COLOR);
-  CHECK( != NULL) << "OpenCV load image fail: " << path;
-  cv::Size size(kImageSize, kImageSize);
-  cv::Mat resized;
-  cv::resize(mat, resized, size);
-  CHECK_EQ((size_t)resized.size().height, kImageSize);
-  CHECK_EQ((size_t)resized.size().width, kImageSize);
-  // dimension_order: CHW
-  Shape shape{(size_t)resized.channels(), (size_t)resized.rows,
-              (size_t)resized.cols};
-  Tensor image(shape, singa::kUChar);
-  unsigned char *data = new unsigned char[kImageNBytes];
-  for (int i = 0; i < resized.rows; i++)
-    for (int j = 0; j < resized.cols; j++)
-      for (int k = 0; k < resized.channels(); k++)
-        data[k * kImageSize * kImageSize + i * kImageSize + j] =
-  <cv::Vec3b>(i, j)[k];
-  image.CopyDataFromHostPtr<unsigned char>(data, kImageNBytes);
-  delete[] data;
-  return image;
-void ILSVRC::WriteMean(Tensor &mean, string path) {
-  Tensor mean_lb(Shape{1}, kInt);
-  std::vector<Tensor> input;
-  input.push_back(mean);
-  input.push_back(mean_lb);
-  BinFileWriter bfwriter;
-  bfwriter.Open(path, kCreate);
-  bfwriter.Write(path, encoder->Encode(input));
-  bfwriter.Flush();
-  bfwriter.Close();
-void ILSVRC::CreateTrainData(string image_list, string input_folder,
-                             string output_folder, size_t file_size = 12800) {
-  std::vector<std::pair<string, int>> file_list;
-  size_t *sum = new size_t[kImageNBytes];
-  for (size_t i = 0; i < kImageNBytes; i++) sum[i] = 0u;
-  string image_file_name;
-  int label;
-  string outfile;
-  std::ifstream image_list_file(image_list.c_str(), std::ios::in);
-  while (image_list_file >> image_file_name >> label)
-    file_list.push_back(std::make_pair(image_file_name, label));
-  LOG(INFO) << "Data Shuffling";
-  std::shuffle(file_list.begin(), file_list.end(),
-               std::default_random_engine());
-  LOG(INFO) << "Total number of training images is " << file_list.size();
-  size_t num_train_images = file_list.size();
-  if (file_size == 0) file_size = num_train_images;
-  for (size_t imageid = 0; imageid < num_train_images; imageid++) {
-    string path = input_folder + "/" + file_list[imageid].first;
-    Tensor image = ReadImage(path);
-    auto image_data =<unsigned char>();
-    for (size_t i = 0; i < kImageNBytes; i++)
-      sum[i] += static_cast<size_t>(image_data[i]);
-    label = file_list[imageid].second;
-    Tensor lb(Shape{1}, kInt);
-    lb.CopyDataFromHostPtr<int>(&label, 1);
-    std::vector<Tensor> input;
-    input.push_back(image);
-    input.push_back(lb);
-    string encoded_str = encoder->Encode(input);
-    if (writer == nullptr) {
-      writer = new BinFileWriter();
-      outfile = output_folder + "/train" +
-                std::to_string(imageid / file_size + 1) + ".bin";
-      writer->Open(outfile, kCreate);
-    }
-    writer->Write(path, encoded_str);
-    if ((imageid + 1) % file_size == 0) {
-      writer->Flush();
-      writer->Close();
-      LOG(INFO) << "Write " << file_size << " images into " << outfile;
-      delete writer;
-      writer = nullptr;
-    }
-  }
-  if (writer != nullptr) {
-    writer->Flush();
-    writer->Close();
-    LOG(INFO) << "Write " << num_train_images % file_size << " images into "
-              << outfile;
-    delete writer;
-    writer = nullptr;
-  }
-  size_t num_file =
-      num_train_images / file_size + ((num_train_images % file_size) ? 1 : 0);
-  LOG(INFO) << "Write " << num_train_images << " images into " << num_file
-            << " binary files";
-  Tensor mean = Tensor(Shape{3, kImageSize, kImageSize}, kUChar);
-  unsigned char *mean_data = new unsigned char[kImageNBytes];
-  for (size_t i = 0; i < kImageNBytes; i++)
-    mean_data[i] = static_cast<unsigned char>(sum[i] / num_train_images);
-  mean.CopyDataFromHostPtr<unsigned char>(mean_data, kImageNBytes);
-  string mean_path = output_folder + "/mean.bin";
-  WriteMean(mean, mean_path);
-  delete[] mean_data;
-  delete[] sum;
-void ILSVRC::CreateTestData(string image_list, string input_folder,
-                            string output_folder) {
-  std::vector<std::pair<string, int>> file_list;
-  string image_file_name;
-  string outfile = output_folder + "/test.bin";
-  int label;
-  std::ifstream image_list_file(image_list.c_str(), std::ios::in);
-  while (image_list_file >> image_file_name >> label)
-    file_list.push_back(std::make_pair(image_file_name, label));
-  LOG(INFO) << "Total number of test images is " << file_list.size();
-  size_t num_test_images = file_list.size();
-  for (size_t imageid = 0; imageid < num_test_images; imageid++) {
-    string path = input_folder + "/" + file_list[imageid].first;
-    Tensor image = ReadImage(path);
-    label = file_list[imageid].second;
-    Tensor lb(Shape{1}, singa::kInt);
-    lb.CopyDataFromHostPtr<int>(&label, 1);
-    std::vector<Tensor> input;
-    input.push_back(image);
-    input.push_back(lb);
-    string encoded_str = encoder->Encode(input);
-    if (writer == nullptr) {
-      writer = new BinFileWriter();
-      writer->Open(outfile, kCreate);
-    }
-    writer->Write(path, encoded_str);
-  }
-  if (writer != nullptr) {
-    writer->Flush();
-    writer->Close();
-    delete writer;
-    writer = nullptr;
-  }
-  LOG(INFO) << "Write " << num_test_images << " images into " << outfile;
-void ILSVRC::ReadMean(string path) {
-  BinFileReader bfreader;
-  string key, value;
-  bfreader.Open(path);
-  bfreader.Read(&key, &value);
-  auto ret = decoder->Decode(value);
-  bfreader.Close();
-  mean = ret[0];
-std::thread ILSVRC::AsyncLoadData(int flag, string file, size_t read_size,
-                                  Tensor *x, Tensor *y, size_t *n_read,
-                                  int nthreads) {
-  return std::thread(
-      [=]() { LoadData(flag, file, read_size, x, y, n_read, nthreads); });
-size_t ILSVRC::LoadData(int flag, string file, size_t read_size, Tensor *x,
-                        Tensor *y, size_t *n_read, int nthreads) {
-  x->Reshape(Shape{read_size, 3, kCropSize, kCropSize});
-  y->AsType(kInt);
-  y->Reshape(Shape{read_size});
-  if (file != last_read_file) {
-    if (reader != nullptr) {
-      reader->Close();
-      delete reader;
-      reader = nullptr;
-    }
-    reader = new BinFileReader();
-    reader->Open(file, 100 << 20);
-    last_read_file = file;
-  } else if (reader == nullptr) {
-    reader = new BinFileReader();
-    reader->Open(file, 100 << 20);
-  }
-  vector<string *> images;
-  for (size_t i = 0; i < read_size; i++) {
-    string image_path;
-    string *image = new string();
-    bool ret = reader->Read(&image_path, image);
-    if (ret == false) {
-      reader->Close();
-      delete reader;
-      reader = nullptr;
-      break;
-    }
-    images.push_back(image);
-  }
-  int nimg = images.size();
-  *n_read = nimg;
-  vector<std::thread> threads;
-  for (int i = 1; i < nthreads; i++) {
-    threads.push_back(AsyncDecodeTransform(flag, i, nthreads, images, x, y));
-  }
-  DecodeTransform(flag, 0, nthreads, images, x, y);
-  for (size_t i = 0; i < threads.size(); i++) threads[i].join();
-  for (int k = 0; k < nimg; k++) delete;
-  return nimg;
-std::thread ILSVRC::AsyncDecodeTransform(int flag, int thid, int nthreads,
-                                         vector<string *> images, Tensor *x,
-                                         Tensor *y) {
-  return std::thread(
-      [=]() { DecodeTransform(flag, thid, nthreads, images, x, y); });
-void ILSVRC::DecodeTransform(int flag, int thid, int nthreads,
-                             vector<string *> images, Tensor *x, Tensor *y) {
-  int nimg = images.size();
-  int start = nimg / nthreads * thid;
-  int end = start + nimg / nthreads;
-  for (int k = start; k < end; k++) {
-    std::vector<Tensor> pair = decoder->Decode(*;
-    auto tmp_image = pair[0] - mean;
-    Tensor aug_image = transformer->Apply(flag, tmp_image);
-    CopyDataToFrom(x, aug_image, aug_image.Size(), k * aug_image.Size());
-    CopyDataToFrom(y, pair[1], 1, k);
-  }
-  if (thid == 0) {
-    for (int k = nimg / nthreads * nthreads; k < nimg; k++) {
-      std::vector<Tensor> pair = decoder->Decode(*;
-      auto tmp_image = pair[0] - mean;
-      Tensor aug_image = transformer->Apply(flag, tmp_image);
-      CopyDataToFrom(x, aug_image, aug_image.Size(), k * aug_image.Size());
-      CopyDataToFrom(y, pair[1], 1, k);
-    }
-  }
-}  // namespace singa
-#endif  // USE_OPENCV
diff --git a/examples/imagenet/ b/examples/imagenet/
deleted file mode 100755
index 6277d23..0000000
--- a/examples/imagenet/
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env sh
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# See the License for the specific language governing permissions and
-# limitations under the License.
-../../build/bin/imagenet -epoch 90 -lr 0.01 -batchsize 256 -filesize 1280 -ntrain 1281167 -ntest 50000 \
-  -data "imagenet_data" -pfreq 100 -nthreads 12
diff --git a/include/singa/core/tensor.h b/include/singa/core/tensor.h
index 28d1619..c89fa83 100644
--- a/include/singa/core/tensor.h
+++ b/include/singa/core/tensor.h
@@ -458,6 +458,8 @@ void SoftmaxCrossEntropyBwd(const Tensor &t, Tensor *p);
 Tensor CopyRows(const Tensor &in, const size_t start, const size_t end);
 /// Alias of CopyRows
 Tensor SliceRows(const Tensor &in, const size_t start, const size_t end);
+/// Slice the input tensor along the given axis to generate a new tensor
+Tensor SliceOn(const Tensor &in, const size_t start, const size_t end, int axis);
 /// Return a tensor consisting of columns ([start, end)) from 'in'. It copies
 /// the values from 'in'. 'in' is a  2D Tensor.
 Tensor CopyColumns(const Tensor &in, const size_t start, const size_t end);
@@ -466,6 +468,8 @@ Tensor SliceColumns(const Tensor &in, const size_t start, const size_t end);
 /// Return a tensor which is vertically stacked from tensors in 'in'. Each
 /// tensor in 'in' is a 2D tensor. Values are copied, no memory sharing.
 Tensor ConcatenateRows(const vector<Tensor> &in);
+/// Return the concatenation of the input tensors along the given axis.
+Tensor ConcatOn(const vector<Tensor> &in, int axis);
 /// Alias name for function ConcatenateRows
 Tensor ConcatRows(const vector<Tensor> &in);
 /// Return a tensor which is horizontally stacked from tensors in 'in'. Each
diff --git a/python/ b/python/
index 503ac8a..30be063 100644
--- a/python/
+++ b/python/
@@ -61,7 +61,10 @@ setup(
-        'unittest-xml-reporting'
+        'unittest-xml-reporting',
+        'flask>=0.10.1',
+        'flask_cors>=3.0.2',
+        'pillow>=2.3.0'
     #List additional groups of dependencies here (e.g. development
diff --git a/python/singa/ b/python/singa/
index d6e313e..027e78c 100644
--- a/python/singa/
+++ b/python/singa/
@@ -409,7 +409,12 @@ class FeedForwardNet(object):
                 params = pickle.load(fd)
                 for (specs, val) in zip(self.param_specs(),
-                    val.copy_from_numpy(params[])
+                    try:
+                        val.copy_from_numpy(params[])
+                    except AssertionError as err:
+                        print 'Error from copying values for param: %s' %
+                        print 'shape of param vs checkpoint', val.shape, params[].shape
+                        raise err
             print 'NOTE: If your model was saved using pickle, '\
                     'then set use_pickle=True for loading it'
diff --git a/python/singa/ b/python/singa/
index 12d7c53..7dee9f5 100644
--- a/python/singa/
+++ b/python/singa/
@@ -578,9 +578,9 @@ def to_numpy(t):
     th = to_host(t)
     if th.dtype == core_pb2.kFloat32:
-        np_array = ret.singa_tensor.GetFloatValue(int(th.size()))
+        np_array = th.singa_tensor.GetFloatValue(int(th.size()))
     elif th.dtype == core_pb2.kInt:
-        np_array = ret.singa_tensor.GetIntValue(int(th.size()))
+        np_array = th.singa_tensor.GetIntValue(int(th.size()))
         print 'Not implemented yet for ', th.dtype
     return np_array.reshape(th.shape)
diff --git a/src/core/tensor/ b/src/core/tensor/
index d40fd88..ed4da96 100644
--- a/src/core/tensor/
+++ b/src/core/tensor/
@@ -742,6 +742,42 @@ void DivColumn(const Tensor &v, Tensor *M) {
   MultColumn(inv, M);
+Tensor ConcatOn(const vector<Tensor> &in, int axis) {
+  vector<Tensor> tmp;
+  Shape out_shape = in[0].shape();
+  size_t dim = in[0].shape().size();
+  CHECK_GE(dim, 2u) << " Only work for tensor of dim >=2 ";
+  size_t size = in[0].Size() / in[0].shape(axis);
+  size_t new_size = 0u;
+  for (const auto& t: in) {
+    CHECK_EQ(dim, t.shape().size()) << "All tensors should have the same dim";
+    CHECK_EQ(size, t.Size() / t.shape(axis)) << "The size of all axis should "
+      <<" be the same except the concatenated axis";
+    new_size += t.shape(axis);
+  }
+  out_shape[axis] = new_size;
+  if (axis == 0) {
+    size_t nrow = 0;
+    for (const auto& t: in) {
+      nrow += t.shape(0);
+      tmp.push_back(Reshape(t, {t.shape(0), t.Size() / t.shape(0)}));
+    }
+    auto ret = ConcatenateRows(tmp);
+    ret.Reshape(out_shape);
+    return ret;
+  } else {
+    for (const auto& t: in) {
+      size_t nrow = 1;
+      for (int i = 0; i < axis; i++)
+        nrow *= t.shape(i);
+      tmp.push_back(Reshape(t, {nrow, t.Size() / nrow}));
+    }
+    auto ret = ConcatenateColumns(tmp);
+    ret.Reshape(out_shape);
+    return ret;
+  }
 Tensor ConcatenateRows(const vector<Tensor> &in) {
   size_t nrow = 0, ncol = 0;
@@ -805,6 +841,27 @@ Tensor CopyRows(const Tensor &in, const size_t start, const size_t end) {
   return out;
+Tensor SliceOn(const Tensor&in, const size_t start, const size_t end, int axis) {
+  Shape out_shape = in.shape();
+  out_shape[axis] = end - start;
+  if (axis == 0) {
+    auto ret = SliceRows(Reshape(in, {in.shape(0), in.Size() / in.shape(0)}),
+        start, end);
+    ret.Reshape(out_shape);
+    return ret;
+  } else {
+    size_t nrow = 1;
+    for (int i = 0; i < axis; i++)
+      nrow *= in.shape(i);
+    auto suffix = in.Size() / nrow / in.shape(axis);
+    auto ret = SliceColumns(Reshape(in, {nrow, in.Size() / nrow}),
+        start * suffix, end * suffix);
+    ret.Reshape(out_shape);
+    return ret;
+  }
 Tensor SliceRows(const Tensor &in, const size_t start, const size_t end) {
   return CopyRows(in, start, end);
diff --git a/src/io/network/ b/src/io/network/
index e61acdb..f88edbf 100644
--- a/src/io/network/
+++ b/src/io/network/
@@ -227,6 +227,9 @@ void NetworkThread::doWork() {
     LOG(FATAL) << "Bind Error: " << strerror(errno);
+  // TODO(wangwei) remove the hard-coded setting, which would result in errors if
+  // there are more than 10 connections
+  // reported by yaochang
   if (listen(socket_fd_, 10)) {
     LOG(FATAL) << "Listen Error: " << strerror(errno);
diff --git a/src/model/layer/ b/src/model/layer/
index 88c2409..a94b68e 100644
--- a/src/model/layer/
+++ b/src/model/layer/
@@ -30,14 +30,26 @@ void Concat::Setup(const vector<Shape>& in_shapes, const LayerConf& conf) {
   axis_ = conf.concat_conf().axis();
-  if (axis_ == 0)
-    out_sample_shape_.push_back(in_shapes[0][0]);
-  else {
+  CHECK_GE(axis_, 0);
+  if (axis_ == 0) {
+    out_sample_shape_ = in_shapes[0];
+    size_t fea_size = Product(in_shapes[0]);
+    for (auto& s: in_shapes) {
+      CHECK_EQ(Product(s), fea_size) << "Feature length of all source samples "
+        << "must be the same";
+    }
+  } else {
+    out_sample_shape_ = in_shapes[0];
+    size_t fea_size = Product(in_shapes[0]) / in_shapes[0][axis_ - 1];
     size_t l = 0;
     for (auto& s: in_shapes) {
-       l += s[0];
+       CHECK_GE(s.size(), axis_);
+       l += s[axis_ - 1];
+       CHECK_EQ(fea_size, Product(s) / s[axis_ - 1])
+         << "Feature length for all axis except axis_ must be the same";
-    out_sample_shape_.push_back(l);
+    out_sample_shape_[axis_ - 1] = l;
@@ -52,10 +64,7 @@ const vector<Tensor> Concat::Forward(int flag, const vector<Tensor>& inputs) {
   if (inputs.size() == 1u) {
     outputs = inputs;
   } else {
-    if(axis_ == 0)
-      outputs.push_back(ConcatRows(inputs));
-    else
-      outputs.push_back(ConcatColumns(inputs));
+    outputs.push_back(ConcatOn(inputs, axis_));
   return outputs;
@@ -66,10 +75,7 @@ const std::pair<vector<Tensor>, vector<Tensor>> Concat::Backward(
   CHECK_EQ(grads.size(), 1u) << "Concat layer only have one output tensor.";
   size_t last_offset = 0u;
   for (auto p : slice_point_) {
-    if (axis_ == 0)
-      input_grad.push_back(SliceRows(, last_offset, p));
-    else
-      input_grad.push_back(SliceColumns(, last_offset, p));
+    input_grad.push_back(SliceOn(, last_offset, p, axis_));
     last_offset = p;
   return std::make_pair(input_grad, param_grad);
diff --git a/src/model/layer/concat.h b/src/model/layer/concat.h
index 4e9a967..a759167 100644
--- a/src/model/layer/concat.h
+++ b/src/model/layer/concat.h
@@ -23,6 +23,9 @@
 #include "singa/model/layer.h"
 namespace singa {
+ * Concatenate tensors from the source layers along the given axis.
+ */
 class Concat : public Layer {
   /// \copydoc Layer::layer_type()
@@ -43,7 +46,7 @@ class Concat : public Layer {
       const vector<Tensor>& grad) override;
-  /// 0 for concat rows; 1 for concat cols
+  /// >= 0
   int axis_ = 0;
   /// slice_point_[i] the end offset of the i-th source tensor on the concat
   /// axis of the result tensor
diff --git a/src/model/layer/ b/src/model/layer/
index 75dc133..a8f3d3d 100644
--- a/src/model/layer/
+++ b/src/model/layer/
@@ -65,6 +65,8 @@ void Pooling::Setup(const Shape& in_sample, const LayerConf& conf) {
   width_ =;
   pooled_height_ = 1;
   if (pool_conf.ceil()) {
+    // TODO(wangwei): caffe also ensures the last pooling window starts strictly
+    // within the original area
     if (stride_h_ > 0)
       pooled_height_ = static_cast<int>(ceil(static_cast<float>(
               height_ + 2 * pad_h_ - kernel_h_) / stride_h_)) + 1;
diff --git a/src/model/layer/ b/src/model/layer/
index 66c05ee..8a3e4bf 100644
--- a/src/model/layer/
+++ b/src/model/layer/
@@ -30,22 +30,26 @@ void Slice::Setup(const Shape& in_sample, const LayerConf& conf) {
   axis_ = conf.slice_conf().axis();
+  CHECK_GE(axis_, 0u);
   int offset = 0;
   // #slice point = # out tensors - 1
   for (size_t p : conf.slice_conf().slice_point()) {
-    if (axis_ == 1) {
-      out_sample_shapes_.push_back({p - offset});
-      offset = p;
-    } else {
+    if (axis_ == 0) {
+    } else {
+      auto s = in_sample;
+      s[axis_ - 1] = p - offset;
+      out_sample_shapes_.push_back(s);
+      offset = p;
-  slice_point_.push_back(in_sample[0]);
-  if (axis_ == 1) {
-    out_sample_shapes_.push_back({in_sample[0] - offset});
-  } else {
+  if (axis_ == 0) {
+  } else {
+    auto s = in_sample;
+    s[axis_ - 1] = in_sample[axis_ - 1] - offset;
+    out_sample_shapes_.push_back(s);
@@ -54,12 +58,11 @@ const vector<Tensor> Slice::Forward(int flag, const vector<Tensor>& inputs) {
   CHECK_EQ(inputs.size(), 1u) << "Split layer only have one input tensor.";
   size_t offset = 0;
   for (auto& s : slice_point_) {
-    if (axis_ == 0)
-      outputs.push_back(SliceRows(, offset, s));
-    else
-      outputs.push_back(SliceColumns(, offset, s));
+      outputs.push_back(SliceOn(, offset, s, axis_));
     offset = s;
+  outputs.push_back(SliceOn(, offset,,
+        axis_));
   return outputs;
@@ -67,10 +70,7 @@ const std::pair<vector<Tensor>, vector<Tensor>> Slice::Backward(
     int flag, const vector<Tensor>& grads) {
   vector<Tensor> input_grad, param_grad;
   CHECK_EQ(grads.size(), out_sample_shapes_.size());
-  if (axis_ == 0)
-    input_grad.push_back(ConcatRows(grads));
-  else
-    input_grad.push_back(ConcatColumns(grads));
+  input_grad.push_back(ConcatOn(grads, axis_));
   return std::make_pair(input_grad, param_grad);
diff --git a/src/model/layer/slice.h b/src/model/layer/slice.h
index 7ed61fc..536fe64 100644
--- a/src/model/layer/slice.h
+++ b/src/model/layer/slice.h
@@ -23,6 +23,10 @@
 #include "singa/model/layer.h"
 namespace singa {
+ * Slice the tensor from the source layer along the given axis and according to
+ * the given slice points.
+ */
 class Slice : public Layer {
   /// \copydoc Layer::layer_type()
@@ -44,7 +48,7 @@ class Slice : public Layer {
       const vector<Tensor>& grad) override;
-  /// 0 for slice rows; 1 for slice cols
+  /// >= 0
   int axis_ = 0;
   /// out_sample_shapes_[i] is the shape of the i-th output tensor
   vector<Shape> out_sample_shapes_;
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 79e481c..a056631 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -21,8 +21,8 @@ INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/include)
   ADD_EXECUTABLE(test_ep "singa/")
-  ADD_DEPENDENCIES(test_ep singa_io)
-  TARGET_LINK_LIBRARIES(test_ep singa_utils singa_io protobuf ${SINGA_LINKER_LIBS})
+  ADD_DEPENDENCIES(test_ep singa)
+  TARGET_LINK_LIBRARIES(test_ep singa protobuf ${SINGA_LINKER_LIBS})
 ADD_LIBRARY(gtest STATIC EXCLUDE_FROM_ALL "gtest/gtest.h" "gtest/")

[2/2] incubator-singa git commit: SINGA-295 - Add an example of image classification using GoogleNet

Posted by
SINGA-295 - Add an example of image classification using GoogleNet

Add googlenet; update concat and slice.

Fix the bug from padding layers due to difference of rounding strategies between caffe and cudnn

Move the alexnet files into examples/imagenet/alexnet

Update the prediction file and the readme file according to rafiki's format.
Users can submit multiple queries via curl.

fixed a bug in to_numpy()

test on cpu
update setup config file to install dependent libs (flask, pillow) when
installing pysinga


Branch: refs/heads/master
Commit: d190fa89aabedb97e1e7af7a7222cadfa5187452
Parents: 8101f00
Author: Wei Wang <>
Authored: Mon Jan 16 20:37:31 2017 +0800
Committer: Wei Wang <>
Committed: Mon Jan 23 19:47:11 2017 +0800

 examples/CMakeLists.txt                  |   8 +-
 examples/imagenet/CMakeLists.txt         |  34 ---
 examples/imagenet/              |  58 ----
 examples/imagenet/             | 402 --------------------------
 examples/imagenet/alexnet/CMakeLists.txt |  34 +++
 examples/imagenet/alexnet/      |  58 ++++
 examples/imagenet/alexnet/     | 402 ++++++++++++++++++++++++++
 examples/imagenet/alexnet/ |  21 ++
 examples/imagenet/alexnet/    |  70 +++++
 examples/imagenet/alexnet/ilsvrc12.h     | 376 ++++++++++++++++++++++++
 examples/imagenet/alexnet/         |  21 ++
 examples/imagenet/         |  21 --
 examples/imagenet/googlenet/    |  66 +++++
 examples/imagenet/googlenet/     | 240 +++++++++++++++
 examples/imagenet/            |  70 -----
 examples/imagenet/ilsvrc12.h             | 376 ------------------------
 examples/imagenet/                 |  21 --
 include/singa/core/tensor.h              |   4 +
 python/                       |   5 +-
 python/singa/                      |   7 +-
 python/singa/                   |   4 +-
 src/core/tensor/                |  57 ++++
 src/io/network/               |   3 +
 src/model/layer/                |  32 +-
 src/model/layer/concat.h                 |   5 +-
 src/model/layer/               |   2 +
 src/model/layer/                 |  32 +-
 src/model/layer/slice.h                  |   6 +-
 test/CMakeLists.txt                      |   4 +-
 29 files changed, 1416 insertions(+), 1023 deletions(-)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 0bb6c2f..f372692 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -6,15 +6,15 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # See the License for the specific language governing permissions and
 # limitations under the License.
diff --git a/examples/imagenet/CMakeLists.txt b/examples/imagenet/CMakeLists.txt
deleted file mode 100644
index fbb7235..0000000
--- a/examples/imagenet/CMakeLists.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# See the License for the specific language governing permissions and
-# limitations under the License.
-    ADD_EXECUTABLE(imagenet
-    ADD_DEPENDENCIES(imagenet singa)
-    TARGET_LINK_LIBRARIES(imagenet singa protobuf ${SINGA_LIBKER_LIBS})
-    ADD_EXECUTABLE(createdata
-    ADD_DEPENDENCIES(createdata singa)
-    TARGET_LINK_LIBRARIES(createdata singa protobuf ${SINGA_LIBKER_LIBS})
diff --git a/examples/imagenet/ b/examples/imagenet/
deleted file mode 100644
index be6797c..0000000
--- a/examples/imagenet/
+++ /dev/null
@@ -1,58 +0,0 @@
-# Train AlexNet over ImageNet
-Convolutional neural network (CNN) is a type of feed-forward neural
-network widely used for image and video classification. In this example, we will
-use a [deep CNN model](
-to do image classification against the ImageNet dataset.
-## Instructions
-### Compile SINGA
-Please compile SINGA with CUDA, CUDNN and OpenCV. You can manually turn on the
-options in CMakeLists.txt or run `ccmake ..` in build/ folder.
-We have tested CUDNN V4 and V5 (V5 requires CUDA 7.5)
-### Data download
-* Please refer to step1-3 on [Instructions to create ImageNet 2012 data](
-  to download and decompress the data.
-* You can download the training and validation list by
-  [](
-  or from [Imagenet](
-### Data preprocessing
-* Assuming you have downloaded the data and the list.
-  Now we should transform the data into binary files. You can run:
-          sh
-  The script will generate a test file(`test.bin`), a mean file(`mean.bin`) and
-  several training files(`trainX.bin`) in the specified output folder.
-* You can also change the parameters in ``.
-  + `-trainlist <file>`: the file of training list;
-  + `-trainfolder <folder>`: the folder of training images;
-  + `-testlist <file>`: the file of test list;
-  + `-testfolder <folder>`: the folder of test images;
-  + `-outdata <folder>`: the folder to save output files, including mean, training and test files.
-    The script will generate these files in the specified folder;
-  + `-filesize <int>`: number of training images that stores in each binary file.
-### Training
-* After preparing data, you can run the following command to train the Alexnet model.
-          sh
-* You may change the parameters in ``.
-  + `-epoch <int>`: number of epoch to be trained, default is 90;
-  + `-lr <float>`: base learning rate; the learning rate is multiplied by 0.1 every 20 epochs,
-    more specifically, `lr = lr * pow(0.1, epoch / 20)` (integer division);
-  + `-batchsize <int>`: batchsize, it should be changed regarding to your memory;
-  + `-filesize <int>`: number of training images that stores in each binary file, it is the
-    same as the `filesize` in data preprocessing;
-  + `-ntrain <int>`: number of training images;
-  + `-ntest <int>`: number of test images;
-  + `-data <folder>`: the folder which stores the binary files, it is exactly the output
-    folder in data preprocessing step;
-  + `-pfreq <int>`: the frequency(in batch) of printing current model status(loss and accuracy);
-  + `-nthreads <int>`: the number of threads to load data which feed to the model.
diff --git a/examples/imagenet/ b/examples/imagenet/
deleted file mode 100644
index 4ac1130..0000000
--- a/examples/imagenet/
+++ /dev/null
@@ -1,402 +0,0 @@
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-#include "singa/singa_config.h"
-#ifdef USE_OPENCV
-#include <cmath>
-#include "./ilsvrc12.h"
-#include "singa/io/snapshot.h"
-#include "singa/model/feed_forward_net.h"
-#include "singa/model/initializer.h"
-#include "singa/model/metric.h"
-#include "singa/model/optimizer.h"
-#include "singa/utils/channel.h"
-#include "singa/utils/string.h"
-#include "singa/utils/timer.h"
-namespace singa {
-// currently supports 'cudnn' and 'singacpp'
-const std::string engine = "cudnn";
-LayerConf GenConvConf(string name, int nb_filter, int kernel, int stride,
-                      int pad, float std, float bias = .0f) {
-  LayerConf conf;
-  conf.set_name(name);
-  conf.set_type(engine + "_convolution");
-  ConvolutionConf *conv = conf.mutable_convolution_conf();
-  conv->set_num_output(nb_filter);
-  conv->add_kernel_size(kernel);
-  conv->add_stride(stride);
-  conv->add_pad(pad);
-  conv->set_bias_term(true);
-  ParamSpec *wspec = conf.add_param();
-  wspec->set_name(name + "_weight");
-  auto wfill = wspec->mutable_filler();
-  wfill->set_type("Gaussian");
-  wfill->set_std(std);
-  ParamSpec *bspec = conf.add_param();
-  bspec->set_name(name + "_bias");
-  bspec->set_lr_mult(2);
-  bspec->set_decay_mult(0);
-  auto bfill = bspec->mutable_filler();
-  bfill->set_value(bias);
-  return conf;
-LayerConf GenPoolingConf(string name, bool max_pool, int kernel, int stride,
-                         int pad) {
-  LayerConf conf;
-  conf.set_name(name);
-  conf.set_type(engine + "_pooling");
-  PoolingConf *pool = conf.mutable_pooling_conf();
-  pool->set_kernel_size(kernel);
-  pool->set_stride(stride);
-  pool->set_pad(pad);
-  if (!max_pool) pool->set_pool(PoolingConf_PoolMethod_AVE);
-  return conf;
-LayerConf GenReLUConf(string name) {
-  LayerConf conf;
-  conf.set_name(name);
-  conf.set_type(engine + "_relu");
-  return conf;
-LayerConf GenDenseConf(string name, int num_output, float std, float wd,
-                       float bias = .0f) {
-  LayerConf conf;
-  conf.set_name(name);
-  conf.set_type("singa_dense");
-  DenseConf *dense = conf.mutable_dense_conf();
-  dense->set_num_output(num_output);
-  ParamSpec *wspec = conf.add_param();
-  wspec->set_name(name + "_weight");
-  wspec->set_decay_mult(wd);
-  auto wfill = wspec->mutable_filler();
-  wfill->set_type("Gaussian");
-  wfill->set_std(std);
-  ParamSpec *bspec = conf.add_param();
-  bspec->set_name(name + "_bias");
-  bspec->set_lr_mult(2);
-  bspec->set_decay_mult(0);
-  auto bfill = bspec->mutable_filler();
-  bfill->set_value(bias);
-  return conf;
-LayerConf GenLRNConf(string name) {
-  LayerConf conf;
-  conf.set_name(name);
-  conf.set_type(engine + "_lrn");
-  LRNConf *lrn = conf.mutable_lrn_conf();
-  lrn->set_local_size(5);
-  lrn->set_alpha(1e-04);
-  lrn->set_beta(0.75);
-  return conf;
-LayerConf GenFlattenConf(string name) {
-  LayerConf conf;
-  conf.set_name(name);
-  conf.set_type("singa_flatten");
-  return conf;
-LayerConf GenDropoutConf(string name, float dropout_ratio) {
-  LayerConf conf;
-  conf.set_name(name);
-  conf.set_type(engine + "_dropout");
-  DropoutConf *dropout = conf.mutable_dropout_conf();
-  dropout->set_dropout_ratio(dropout_ratio);
-  return conf;
-FeedForwardNet CreateNet() {
-  FeedForwardNet net;
-  Shape s{3, 227, 227};
-  net.Add(GenConvConf("conv1", 96, 11, 4, 0, 0.01), &s);
-  net.Add(GenReLUConf("relu1"));
-  net.Add(GenPoolingConf("pool1", true, 3, 2, 0));
-  net.Add(GenLRNConf("lrn1"));
-  net.Add(GenConvConf("conv2", 256, 5, 1, 2, 0.01, 1.0));
-  net.Add(GenReLUConf("relu2"));
-  net.Add(GenPoolingConf("pool2", true, 3, 2, 0));
-  net.Add(GenLRNConf("lrn2"));
-  net.Add(GenConvConf("conv3", 384, 3, 1, 1, 0.01));
-  net.Add(GenReLUConf("relu3"));
-  net.Add(GenConvConf("conv4", 384, 3, 1, 1, 0.01, 1.0));
-  net.Add(GenReLUConf("relu4"));
-  net.Add(GenConvConf("conv5", 256, 3, 1, 1, 0.01, 1.0));
-  net.Add(GenReLUConf("relu5"));
-  net.Add(GenPoolingConf("pool5", true, 3, 2, 0));
-  net.Add(GenFlattenConf("flat"));
-  net.Add(GenDenseConf("ip6", 4096, 0.005, 1, 1.0));
-  net.Add(GenReLUConf("relu6"));
-  net.Add(GenDropoutConf("drop6", 0.5));
-  net.Add(GenDenseConf("ip7", 4096, 0.005, 1, 1.0));
-  net.Add(GenReLUConf("relu7"));
-  net.Add(GenDropoutConf("drop7", 0.5));
-  net.Add(GenDenseConf("ip8", 1000, 0.01, 1));
-  return net;
-void TrainOneEpoch(FeedForwardNet &net, ILSVRC &data,
-                   std::shared_ptr<Device> device, int epoch, string bin_folder,
-                   size_t num_train_files, size_t batchsize, float lr,
-                   Channel *train_ch, size_t pfreq, int nthreads) {
-  float loss = 0.0f, metric = 0.0f;
-  float load_time = 0.0f, train_time = 0.0f;
-  size_t b = 0;
-  size_t n_read;
-  Timer timer, ttr;
-  Tensor prefetch_x, prefetch_y;
-  string binfile = bin_folder + "/train1.bin";
-  timer.Tick();
-  data.LoadData(kTrain, binfile, batchsize, &prefetch_x, &prefetch_y, &n_read,
-                nthreads);
-  load_time += timer.Elapsed();
-  CHECK_EQ(n_read, batchsize);
-  Tensor train_x(prefetch_x.shape(), device);
-  Tensor train_y(prefetch_y.shape(), device, kInt);
-  std::thread th;
-  for (size_t fno = 1; fno <= num_train_files; fno++) {
-    binfile = bin_folder + "/train" + std::to_string(fno) + ".bin";
-    while (true) {
-      if (th.joinable()) {
-        th.join();
-        load_time += timer.Elapsed();
-        // LOG(INFO) << "num of samples: " << n_read;
-        if (n_read < batchsize) {
-          if (n_read > 0) {
-            LOG(WARNING) << "Pls set batchsize to make num_total_samples "
-                         << "% batchsize == 0. Otherwise, the last " << n_read
-                         << " samples would not be used";
-          }
-          break;
-        }
-      }
-      if (n_read == batchsize) {
-        train_x.CopyData(prefetch_x);
-        train_y.CopyData(prefetch_y);
-      }
-      timer.Tick();
-      th = data.AsyncLoadData(kTrain, binfile, batchsize, &prefetch_x,
-                              &prefetch_y, &n_read, nthreads);
-      if (n_read < batchsize) continue;
-      CHECK_EQ(train_x.shape(0), train_y.shape(0));
-      ttr.Tick();
-      auto ret = net.TrainOnBatch(epoch, train_x, train_y);
-      train_time += ttr.Elapsed();
-      loss += ret.first;
-      metric += ret.second;
-      b++;
-    }
-    if (b % pfreq == 0) {
-      train_ch->Send(
-          "Epoch " + std::to_string(epoch) + ", training loss = " +
-          std::to_string(loss / b) + ", accuracy = " +
-          std::to_string(metric / b) + ", lr = " + std::to_string(lr) +
-          ", time of loading " + std::to_string(batchsize) + " images = " +
-          std::to_string(load_time / b) +
-          " ms, time of training (batchsize = " + std::to_string(batchsize) +
-          ") = " + std::to_string(train_time / b) + " ms.");
-      loss = 0.0f;
-      metric = 0.0f;
-      load_time = 0.0f;
-      train_time = 0.0f;
-      b = 0;
-    }
-  }
-void TestOneEpoch(FeedForwardNet &net, ILSVRC &data,
-                  std::shared_ptr<Device> device, int epoch, string bin_folder,
-                  size_t num_test_images, size_t batchsize, Channel *val_ch,
-                  int nthreads) {
-  float loss = 0.0f, metric = 0.0f;
-  float load_time = 0.0f, eval_time = 0.0f;
-  size_t n_read;
-  string binfile = bin_folder + "/test.bin";
-  Timer timer, tte;
-  Tensor prefetch_x, prefetch_y;
-  timer.Tick();
-  data.LoadData(kEval, binfile, batchsize, &prefetch_x, &prefetch_y, &n_read,
-                nthreads);
-  load_time += timer.Elapsed();
-  Tensor test_x(prefetch_x.shape(), device);
-  Tensor test_y(prefetch_y.shape(), device, kInt);
-  int remain = (int)num_test_images - n_read;
-  CHECK_EQ(n_read, batchsize);
-  std::thread th;
-  while (true) {
-    if (th.joinable()) {
-      th.join();
-      load_time += timer.Elapsed();
-      remain -= n_read;
-      if (remain < 0) break;
-      if (n_read < batchsize) break;
-    }
-    test_x.CopyData(prefetch_x);
-    test_y.CopyData(prefetch_y);
-    timer.Tick();
-    th = data.AsyncLoadData(kEval, binfile, batchsize, &prefetch_x, &prefetch_y,
-                            &n_read, nthreads);
-    CHECK_EQ(test_x.shape(0), test_y.shape(0));
-    tte.Tick();
-    auto ret = net.EvaluateOnBatch(test_x, test_y);
-    eval_time += tte.Elapsed();
-    ret.first.ToHost();
-    ret.second.ToHost();
-    loss += Sum(ret.first);
-    metric += Sum(ret.second);
-  }
-  loss /= num_test_images;
-  metric /= num_test_images;
-  val_ch->Send("Epoch " + std::to_string(epoch) + ", val loss = " +
-               std::to_string(loss) + ", accuracy = " + std::to_string(metric) +
-               ", time of loading " + std::to_string(num_test_images) +
-               " images = " + std::to_string(load_time) +
-               " ms, time of evaluating " + std::to_string(num_test_images) +
-               " images = " + std::to_string(eval_time) + " ms.");
-void Checkpoint(FeedForwardNet &net, string prefix) {
-  Snapshot snapshot(prefix, Snapshot::kWrite, 200);
-  auto names = net.GetParamNames();
-  auto values = net.GetParamValues();
-  for (size_t k = 0; k < names.size(); k++) {
-    snapshot.Write(,;
-  }
-  LOG(INFO) << "Write snapshot into " << prefix;
-void Train(int num_epoch, float lr, size_t batchsize, size_t train_file_size,
-           string bin_folder, size_t num_train_images, size_t num_test_images,
-           size_t pfreq, int nthreads) {
-  ILSVRC data;
-  data.ReadMean(bin_folder + "/mean.bin");
-  auto net = CreateNet();
-  auto cuda = std::make_shared<CudaGPU>(0);
-  net.ToDevice(cuda);
-  SGD sgd;
-  OptimizerConf opt_conf;
-  opt_conf.set_momentum(0.9);
-  auto reg = opt_conf.mutable_regularizer();
-  reg->set_coefficient(0.0005);
-  sgd.Setup(opt_conf);
-  sgd.SetLearningRateGenerator(
-      [lr](int epoch) { return lr * std::pow(0.1, epoch / 20); });
-  SoftmaxCrossEntropy loss;
-  Accuracy acc;
-  net.Compile(true, &sgd, &loss, &acc);
-  Channel *train_ch = GetChannel("train_perf");
-  train_ch->EnableDestStderr(true);
-  Channel *val_ch = GetChannel("val_perf");
-  val_ch->EnableDestStderr(true);
-  size_t num_train_files = num_train_images / train_file_size +
-                           (num_train_images % train_file_size ? 1 : 0);
-  for (int epoch = 0; epoch < num_epoch; epoch++) {
-    float epoch_lr = sgd.GetLearningRate(epoch);
-    TrainOneEpoch(net, data, cuda, epoch, bin_folder, num_train_files,
-                  batchsize, epoch_lr, train_ch, pfreq, nthreads);
-    if (epoch % 10 == 0 && epoch > 0) {
-      string prefix = "snapshot_epoch" + std::to_string(epoch);
-      Checkpoint(net, prefix);
-    }
-    TestOneEpoch(net, data, cuda, epoch, bin_folder, num_test_images, batchsize,
-                 val_ch, nthreads);
-  }
-int main(int argc, char **argv) {
-  singa::InitChannel(nullptr);
-  int pos = singa::ArgPos(argc, argv, "-h");
-  if (pos != -1) {
-    std::cout << "Usage:\n"
-              << "\t-epoch <int>: number of epoch to be trained, default is 90;\n"
-              << "\t-lr <float>: base learning rate;\n"
-              << "\t-batchsize <int>: batchsize, it should be changed regarding "
-                 "to your memory;\n"
-              << "\t-filesize <int>: number of training images that stores in "
-                 "each binary file;\n"
-              << "\t-ntrain <int>: number of training images;\n"
-              << "\t-ntest <int>: number of test images;\n"
-              << "\t-data <folder>: the folder which stores the binary files;\n"
-              << "\t-pfreq <int>: the frequency(in batch) of printing current "
-                 "model status(loss and accuracy);\n"
-              << "\t-nthreads <int>`: the number of threads to load data which "
-                 "feed to the model.\n";
-    return 0;
-  }
-  pos = singa::ArgPos(argc, argv, "-epoch");
-  int nEpoch = 90;
-  if (pos != -1) nEpoch = atoi(argv[pos + 1]);
-  pos = singa::ArgPos(argc, argv, "-lr");
-  float lr = 0.01;
-  if (pos != -1) lr = atof(argv[pos + 1]);
-  pos = singa::ArgPos(argc, argv, "-batchsize");
-  int batchsize = 256;
-  if (pos != -1) batchsize = atof(argv[pos + 1]);
-  pos = singa::ArgPos(argc, argv, "-filesize");
-  size_t train_file_size = 1280;
-  if (pos != -1) train_file_size = atoi(argv[pos + 1]);
-  pos = singa::ArgPos(argc, argv, "-ntrain");
-  size_t num_train_images = 1281167;
-  if (pos != -1) num_train_images = atoi(argv[pos + 1]);
-  pos = singa::ArgPos(argc, argv, "-ntest");
-  size_t num_test_images = 50000;
-  if (pos != -1) num_test_images = atoi(argv[pos + 1]);
-  pos = singa::ArgPos(argc, argv, "-data");
-  string bin_folder = "imagenet_data";
-  if (pos != -1) bin_folder = argv[pos + 1];
-  pos = singa::ArgPos(argc, argv, "-pfreq");
-  size_t pfreq = 100;
-  if (pos != -1) pfreq = atoi(argv[pos + 1]);
-  pos = singa::ArgPos(argc, argv, "-nthreads");
-  int nthreads = 12;
-  if (pos != -1) nthreads = atoi(argv[pos + 1]);
-  LOG(INFO) << "Start training";
-  singa::Train(nEpoch, lr, batchsize, train_file_size, bin_folder,
-               num_train_images, num_test_images, pfreq, nthreads);
-  LOG(INFO) << "End training";
diff --git a/examples/imagenet/alexnet/CMakeLists.txt b/examples/imagenet/alexnet/CMakeLists.txt
new file mode 100644
index 0000000..fbb7235
--- /dev/null
+++ b/examples/imagenet/alexnet/CMakeLists.txt
@@ -0,0 +1,34 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# See the License for the specific language governing permissions and
+# limitations under the License.
+    ADD_EXECUTABLE(imagenet
+    ADD_DEPENDENCIES(imagenet singa)
+    TARGET_LINK_LIBRARIES(imagenet singa protobuf ${SINGA_LIBKER_LIBS})
+    ADD_EXECUTABLE(createdata
+    ADD_DEPENDENCIES(createdata singa)
+    TARGET_LINK_LIBRARIES(createdata singa protobuf ${SINGA_LIBKER_LIBS})
diff --git a/examples/imagenet/alexnet/ b/examples/imagenet/alexnet/
new file mode 100644
index 0000000..be6797c
--- /dev/null
+++ b/examples/imagenet/alexnet/
@@ -0,0 +1,58 @@
+# Train AlexNet over ImageNet
+Convolutional neural network (CNN) is a type of feed-forward neural
+network widely used for image and video classification. In this example, we will
+use a [deep CNN model](
+to do image classification against the ImageNet dataset.
+## Instructions
+### Compile SINGA
+Please compile SINGA with CUDA, CUDNN and OpenCV. You can manually turn on the
+options in CMakeLists.txt or run `ccmake ..` in build/ folder.
+We have tested CUDNN V4 and V5 (V5 requires CUDA 7.5)
+### Data download
+* Please refer to step1-3 on [Instructions to create ImageNet 2012 data](
+  to download and decompress the data.
+* You can download the training and validation list by
+  [](
+  or from [Imagenet](
+### Data preprocessing
+* Assuming you have downloaded the data and the list.
+  Now we should transform the data into binary files. You can run:
+          sh
+  The script will generate a test file(`test.bin`), a mean file(`mean.bin`) and
+  several training files(`trainX.bin`) in the specified output folder.
+* You can also change the parameters in ``.
+  + `-trainlist <file>`: the file of training list;
+  + `-trainfolder <folder>`: the folder of training images;
+  + `-testlist <file>`: the file of test list;
+  + `-testfolder <folder>`: the folder of test images;
+  + `-outdata <folder>`: the folder to save output files, including mean, training and test files.
+    The script will generate these files in the specified folder;
+  + `-filesize <int>`: number of training images that stores in each binary file.
+### Training
+* After preparing data, you can run the following command to train the Alexnet model.
+          sh
+* You may change the parameters in ``.
+  + `-epoch <int>`: number of epoch to be trained, default is 90;
+  + `-lr <float>`: base learning rate; the learning rate is multiplied by 0.1 every 20 epochs,
+    more specifically, `lr = lr * pow(0.1, epoch / 20)` (integer division);
+  + `-batchsize <int>`: batchsize, it should be changed regarding to your memory;
+  + `-filesize <int>`: number of training images that stores in each binary file, it is the
+    same as the `filesize` in data preprocessing;
+  + `-ntrain <int>`: number of training images;
+  + `-ntest <int>`: number of test images;
+  + `-data <folder>`: the folder which stores the binary files, it is exactly the output
+    folder in data preprocessing step;
+  + `-pfreq <int>`: the frequency(in batch) of printing current model status(loss and accuracy);
+  + `-nthreads <int>`: the number of threads to load data which feed to the model.
diff --git a/examples/imagenet/alexnet/ b/examples/imagenet/alexnet/
new file mode 100644
index 0000000..4ac1130
--- /dev/null
+++ b/examples/imagenet/alexnet/
@@ -0,0 +1,402 @@
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+#include "singa/singa_config.h"
+#ifdef USE_OPENCV
+#include <cmath>
+#include "./ilsvrc12.h"
+#include "singa/io/snapshot.h"
+#include "singa/model/feed_forward_net.h"
+#include "singa/model/initializer.h"
+#include "singa/model/metric.h"
+#include "singa/model/optimizer.h"
+#include "singa/utils/channel.h"
+#include "singa/utils/string.h"
+#include "singa/utils/timer.h"
+namespace singa {
+// currently supports 'cudnn' and 'singacpp'
+const std::string engine = "cudnn";
+LayerConf GenConvConf(string name, int nb_filter, int kernel, int stride,
+                      int pad, float std, float bias = .0f) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_convolution");
+  ConvolutionConf *conv = conf.mutable_convolution_conf();
+  conv->set_num_output(nb_filter);
+  conv->add_kernel_size(kernel);
+  conv->add_stride(stride);
+  conv->add_pad(pad);
+  conv->set_bias_term(true);
+  ParamSpec *wspec = conf.add_param();
+  wspec->set_name(name + "_weight");
+  auto wfill = wspec->mutable_filler();
+  wfill->set_type("Gaussian");
+  wfill->set_std(std);
+  ParamSpec *bspec = conf.add_param();
+  bspec->set_name(name + "_bias");
+  bspec->set_lr_mult(2);
+  bspec->set_decay_mult(0);
+  auto bfill = bspec->mutable_filler();
+  bfill->set_value(bias);
+  return conf;
+LayerConf GenPoolingConf(string name, bool max_pool, int kernel, int stride,
+                         int pad) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_pooling");
+  PoolingConf *pool = conf.mutable_pooling_conf();
+  pool->set_kernel_size(kernel);
+  pool->set_stride(stride);
+  pool->set_pad(pad);
+  if (!max_pool) pool->set_pool(PoolingConf_PoolMethod_AVE);
+  return conf;
+LayerConf GenReLUConf(string name) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_relu");
+  return conf;
+LayerConf GenDenseConf(string name, int num_output, float std, float wd,
+                       float bias = .0f) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type("singa_dense");
+  DenseConf *dense = conf.mutable_dense_conf();
+  dense->set_num_output(num_output);
+  ParamSpec *wspec = conf.add_param();
+  wspec->set_name(name + "_weight");
+  wspec->set_decay_mult(wd);
+  auto wfill = wspec->mutable_filler();
+  wfill->set_type("Gaussian");
+  wfill->set_std(std);
+  ParamSpec *bspec = conf.add_param();
+  bspec->set_name(name + "_bias");
+  bspec->set_lr_mult(2);
+  bspec->set_decay_mult(0);
+  auto bfill = bspec->mutable_filler();
+  bfill->set_value(bias);
+  return conf;
+LayerConf GenLRNConf(string name) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_lrn");
+  LRNConf *lrn = conf.mutable_lrn_conf();
+  lrn->set_local_size(5);
+  lrn->set_alpha(1e-04);
+  lrn->set_beta(0.75);
+  return conf;
+LayerConf GenFlattenConf(string name) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type("singa_flatten");
+  return conf;
+LayerConf GenDropoutConf(string name, float dropout_ratio) {
+  LayerConf conf;
+  conf.set_name(name);
+  conf.set_type(engine + "_dropout");
+  DropoutConf *dropout = conf.mutable_dropout_conf();
+  dropout->set_dropout_ratio(dropout_ratio);
+  return conf;
+FeedForwardNet CreateNet() {
+  FeedForwardNet net;
+  Shape s{3, 227, 227};
+  net.Add(GenConvConf("conv1", 96, 11, 4, 0, 0.01), &s);
+  net.Add(GenReLUConf("relu1"));
+  net.Add(GenPoolingConf("pool1", true, 3, 2, 0));
+  net.Add(GenLRNConf("lrn1"));
+  net.Add(GenConvConf("conv2", 256, 5, 1, 2, 0.01, 1.0));
+  net.Add(GenReLUConf("relu2"));
+  net.Add(GenPoolingConf("pool2", true, 3, 2, 0));
+  net.Add(GenLRNConf("lrn2"));
+  net.Add(GenConvConf("conv3", 384, 3, 1, 1, 0.01));
+  net.Add(GenReLUConf("relu3"));
+  net.Add(GenConvConf("conv4", 384, 3, 1, 1, 0.01, 1.0));
+  net.Add(GenReLUConf("relu4"));
+  net.Add(GenConvConf("conv5", 256, 3, 1, 1, 0.01, 1.0));
+  net.Add(GenReLUConf("relu5"));
+  net.Add(GenPoolingConf("pool5", true, 3, 2, 0));
+  net.Add(GenFlattenConf("flat"));
+  net.Add(GenDenseConf("ip6", 4096, 0.005, 1, 1.0));
+  net.Add(GenReLUConf("relu6"));
+  net.Add(GenDropoutConf("drop6", 0.5));
+  net.Add(GenDenseConf("ip7", 4096, 0.005, 1, 1.0));
+  net.Add(GenReLUConf("relu7"));
+  net.Add(GenDropoutConf("drop7", 0.5));
+  net.Add(GenDenseConf("ip8", 1000, 0.01, 1));
+  return net;
+void TrainOneEpoch(FeedForwardNet &net, ILSVRC &data,
+                   std::shared_ptr<Device> device, int epoch, string bin_folder,
+                   size_t num_train_files, size_t batchsize, float lr,
+                   Channel *train_ch, size_t pfreq, int nthreads) {
+  float loss = 0.0f, metric = 0.0f;
+  float load_time = 0.0f, train_time = 0.0f;
+  size_t b = 0;
+  size_t n_read;
+  Timer timer, ttr;
+  Tensor prefetch_x, prefetch_y;
+  string binfile = bin_folder + "/train1.bin";
+  timer.Tick();
+  data.LoadData(kTrain, binfile, batchsize, &prefetch_x, &prefetch_y, &n_read,
+                nthreads);
+  load_time += timer.Elapsed();
+  CHECK_EQ(n_read, batchsize);
+  Tensor train_x(prefetch_x.shape(), device);
+  Tensor train_y(prefetch_y.shape(), device, kInt);
+  std::thread th;
+  for (size_t fno = 1; fno <= num_train_files; fno++) {
+    binfile = bin_folder + "/train" + std::to_string(fno) + ".bin";
+    while (true) {
+      if (th.joinable()) {
+        th.join();
+        load_time += timer.Elapsed();
+        // LOG(INFO) << "num of samples: " << n_read;
+        if (n_read < batchsize) {
+          if (n_read > 0) {
+            LOG(WARNING) << "Pls set batchsize to make num_total_samples "
+                         << "% batchsize == 0. Otherwise, the last " << n_read
+                         << " samples would not be used";
+          }
+          break;
+        }
+      }
+      if (n_read == batchsize) {
+        train_x.CopyData(prefetch_x);
+        train_y.CopyData(prefetch_y);
+      }
+      timer.Tick();
+      th = data.AsyncLoadData(kTrain, binfile, batchsize, &prefetch_x,
+                              &prefetch_y, &n_read, nthreads);
+      if (n_read < batchsize) continue;
+      CHECK_EQ(train_x.shape(0), train_y.shape(0));
+      ttr.Tick();
+      auto ret = net.TrainOnBatch(epoch, train_x, train_y);
+      train_time += ttr.Elapsed();
+      loss += ret.first;
+      metric += ret.second;
+      b++;
+    }
+    if (b % pfreq == 0) {
+      train_ch->Send(
+          "Epoch " + std::to_string(epoch) + ", training loss = " +
+          std::to_string(loss / b) + ", accuracy = " +
+          std::to_string(metric / b) + ", lr = " + std::to_string(lr) +
+          ", time of loading " + std::to_string(batchsize) + " images = " +
+          std::to_string(load_time / b) +
+          " ms, time of training (batchsize = " + std::to_string(batchsize) +
+          ") = " + std::to_string(train_time / b) + " ms.");
+      loss = 0.0f;
+      metric = 0.0f;
+      load_time = 0.0f;
+      train_time = 0.0f;
+      b = 0;
+    }
+  }
+void TestOneEpoch(FeedForwardNet &net, ILSVRC &data,
+                  std::shared_ptr<Device> device, int epoch, string bin_folder,
+                  size_t num_test_images, size_t batchsize, Channel *val_ch,
+                  int nthreads) {
+  float loss = 0.0f, metric = 0.0f;
+  float load_time = 0.0f, eval_time = 0.0f;
+  size_t n_read;
+  string binfile = bin_folder + "/test.bin";
+  Timer timer, tte;
+  Tensor prefetch_x, prefetch_y;
+  timer.Tick();
+  data.LoadData(kEval, binfile, batchsize, &prefetch_x, &prefetch_y, &n_read,
+                nthreads);
+  load_time += timer.Elapsed();
+  Tensor test_x(prefetch_x.shape(), device);
+  Tensor test_y(prefetch_y.shape(), device, kInt);
+  int remain = (int)num_test_images - n_read;
+  CHECK_EQ(n_read, batchsize);
+  std::thread th;
+  while (true) {
+    if (th.joinable()) {
+      th.join();
+      load_time += timer.Elapsed();
+      remain -= n_read;
+      if (remain < 0) break;
+      if (n_read < batchsize) break;
+    }
+    test_x.CopyData(prefetch_x);
+    test_y.CopyData(prefetch_y);
+    timer.Tick();
+    th = data.AsyncLoadData(kEval, binfile, batchsize, &prefetch_x, &prefetch_y,
+                            &n_read, nthreads);
+    CHECK_EQ(test_x.shape(0), test_y.shape(0));
+    tte.Tick();
+    auto ret = net.EvaluateOnBatch(test_x, test_y);
+    eval_time += tte.Elapsed();
+    ret.first.ToHost();
+    ret.second.ToHost();
+    loss += Sum(ret.first);
+    metric += Sum(ret.second);
+  }
+  loss /= num_test_images;
+  metric /= num_test_images;
+  val_ch->Send("Epoch " + std::to_string(epoch) + ", val loss = " +
+               std::to_string(loss) + ", accuracy = " + std::to_string(metric) +
+               ", time of loading " + std::to_string(num_test_images) +
+               " images = " + std::to_string(load_time) +
+               " ms, time of evaluating " + std::to_string(num_test_images) +
+               " images = " + std::to_string(eval_time) + " ms.");
+void Checkpoint(FeedForwardNet &net, string prefix) {
+  Snapshot snapshot(prefix, Snapshot::kWrite, 200);
+  auto names = net.GetParamNames();
+  auto values = net.GetParamValues();
+  for (size_t k = 0; k < names.size(); k++) {
+    snapshot.Write(,;
+  }
+  LOG(INFO) << "Write snapshot into " << prefix;
+void Train(int num_epoch, float lr, size_t batchsize, size_t train_file_size,
+           string bin_folder, size_t num_train_images, size_t num_test_images,
+           size_t pfreq, int nthreads) {
+  ILSVRC data;
+  data.ReadMean(bin_folder + "/mean.bin");
+  auto net = CreateNet();
+  auto cuda = std::make_shared<CudaGPU>(0);
+  net.ToDevice(cuda);
+  SGD sgd;
+  OptimizerConf opt_conf;
+  opt_conf.set_momentum(0.9);
+  auto reg = opt_conf.mutable_regularizer();
+  reg->set_coefficient(0.0005);
+  sgd.Setup(opt_conf);
+  sgd.SetLearningRateGenerator(
+      [lr](int epoch) { return lr * std::pow(0.1, epoch / 20); });
+  SoftmaxCrossEntropy loss;
+  Accuracy acc;
+  net.Compile(true, &sgd, &loss, &acc);
+  Channel *train_ch = GetChannel("train_perf");
+  train_ch->EnableDestStderr(true);
+  Channel *val_ch = GetChannel("val_perf");
+  val_ch->EnableDestStderr(true);
+  size_t num_train_files = num_train_images / train_file_size +
+                           (num_train_images % train_file_size ? 1 : 0);
+  for (int epoch = 0; epoch < num_epoch; epoch++) {
+    float epoch_lr = sgd.GetLearningRate(epoch);
+    TrainOneEpoch(net, data, cuda, epoch, bin_folder, num_train_files,
+                  batchsize, epoch_lr, train_ch, pfreq, nthreads);
+    if (epoch % 10 == 0 && epoch > 0) {
+      string prefix = "snapshot_epoch" + std::to_string(epoch);
+      Checkpoint(net, prefix);
+    }
+    TestOneEpoch(net, data, cuda, epoch, bin_folder, num_test_images, batchsize,
+                 val_ch, nthreads);
+  }
+int main(int argc, char **argv) {
+  singa::InitChannel(nullptr);
+  int pos = singa::ArgPos(argc, argv, "-h");
+  if (pos != -1) {
+    std::cout << "Usage:\n"
+              << "\t-epoch <int>: number of epoch to be trained, default is 90;\n"
+              << "\t-lr <float>: base learning rate;\n"
+              << "\t-batchsize <int>: batchsize, it should be changed regarding "
+                 "to your memory;\n"
+              << "\t-filesize <int>: number of training images that stores in "
+                 "each binary file;\n"
+              << "\t-ntrain <int>: number of training images;\n"
+              << "\t-ntest <int>: number of test images;\n"
+              << "\t-data <folder>: the folder which stores the binary files;\n"
+              << "\t-pfreq <int>: the frequency(in batch) of printing current "
+                 "model status(loss and accuracy);\n"
+              << "\t-nthreads <int>`: the number of threads to load data which "
+                 "feed to the model.\n";
+    return 0;
+  }
+  pos = singa::ArgPos(argc, argv, "-epoch");
+  int nEpoch = 90;
+  if (pos != -1) nEpoch = atoi(argv[pos + 1]);
+  pos = singa::ArgPos(argc, argv, "-lr");
+  float lr = 0.01;
+  if (pos != -1) lr = atof(argv[pos + 1]);
+  pos = singa::ArgPos(argc, argv, "-batchsize");
+  int batchsize = 256;
+  if (pos != -1) batchsize = atof(argv[pos + 1]);
+  pos = singa::ArgPos(argc, argv, "-filesize");
+  size_t train_file_size = 1280;
+  if (pos != -1) train_file_size = atoi(argv[pos + 1]);
+  pos = singa::ArgPos(argc, argv, "-ntrain");
+  size_t num_train_images = 1281167;
+  if (pos != -1) num_train_images = atoi(argv[pos + 1]);
+  pos = singa::ArgPos(argc, argv, "-ntest");
+  size_t num_test_images = 50000;
+  if (pos != -1) num_test_images = atoi(argv[pos + 1]);
+  pos = singa::ArgPos(argc, argv, "-data");
+  string bin_folder = "imagenet_data";
+  if (pos != -1) bin_folder = argv[pos + 1];
+  pos = singa::ArgPos(argc, argv, "-pfreq");
+  size_t pfreq = 100;
+  if (pos != -1) pfreq = atoi(argv[pos + 1]);
+  pos = singa::ArgPos(argc, argv, "-nthreads");
+  int nthreads = 12;
+  if (pos != -1) nthreads = atoi(argv[pos + 1]);
+  LOG(INFO) << "Start training";
+  singa::Train(nEpoch, lr, batchsize, train_file_size, bin_folder,
+               num_train_images, num_test_images, pfreq, nthreads);
+  LOG(INFO) << "End training";
diff --git a/examples/imagenet/alexnet/ b/examples/imagenet/alexnet/
new file mode 100755
index 0000000..4c2c034
--- /dev/null
+++ b/examples/imagenet/alexnet/
@@ -0,0 +1,21 @@
+#!/usr/bin/env sh
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# See the License for the specific language governing permissions and
+# limitations under the License.
+../../build/bin/createdata -trainlist "imagenet/label/train.txt" -trainfolder "imagenet/ILSVRC2012_img_train" \
+  -testlist "imagenet/label/val.txt" -testfolder "imagenet/ILSVRC2012_img_val" -outdata "imagenet_data" -filesize 1280
diff --git a/examples/imagenet/alexnet/ b/examples/imagenet/alexnet/
new file mode 100644
index 0000000..c9e6d2f
--- /dev/null
+++ b/examples/imagenet/alexnet/
@@ -0,0 +1,70 @@
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+#include "singa/singa_config.h"
+#ifdef USE_OPENCV
+#include "ilsvrc12.h"
+#include "singa/utils/channel.h"
+#include "singa/utils/string.h"
+/// Command-line tool that converts the ILSVRC2012 training and test images
+/// into binary files (see the "-h" text for the options).
+/// An option that is absent, or that is the last argument and therefore has no
+/// value, keeps its default.
+int main(int argc, char **argv) {
+  if (singa::ArgPos(argc, argv, "-h") != -1) {
+    std::cout << "Usage:\n"
+              << "\t-trainlist <file>: the file of training list;\n"
+              << "\t-trainfolder <folder>: the folder of training images;\n"
+              << "\t-testlist <file>: the file of test list;\n"
+              << "\t-testfolder <floder>: the folder of test images;\n"
+              << "\t-outdata <folder>: the folder to save output files;\n"
+              << "\t-filesize <int>: number of training images that stores in "
+                 "each binary file.\n";
+    return 0;
+  }
+  // Value that follows `flag`, or `fallback` when there is none. argv[argc] is
+  // a null pointer, so the slot after a trailing flag must not be used:
+  // building a std::string from it or passing it to atoi() is undefined.
+  auto option = [argc, argv](const char *flag, const char *fallback) {
+    const int pos = singa::ArgPos(argc, argv, flag);
+    return (pos != -1 && pos + 1 < argc) ? argv[pos + 1] : fallback;
+  };
+  const string train_image_list =
+      option("-trainlist", "imagenet/label/train.txt");
+  const string train_image_folder =
+      option("-trainfolder", "imagenet/ILSVRC2012_img_train");
+  const string test_image_list = option("-testlist", "imagenet/label/val.txt");
+  const string test_image_folder =
+      option("-testfolder", "imagenet/ILSVRC2012_img_val");
+  const string bin_folder = option("-outdata", "imagenet_data");
+  const size_t train_file_size = atoi(option("-filesize", "1280"));
+  singa::ILSVRC data;
+  LOG(INFO) << "Creating training and test data...";
+  data.CreateTrainData(train_image_list, train_image_folder, bin_folder,
+                       train_file_size);
+  data.CreateTestData(test_image_list, test_image_folder, bin_folder);
+  LOG(INFO) << "Data created!";
+  return 0;
+}
+#endif  // USE_OPENCV
diff --git a/examples/imagenet/alexnet/ilsvrc12.h b/examples/imagenet/alexnet/ilsvrc12.h
new file mode 100644
index 0000000..74fffbb
--- /dev/null
+++ b/examples/imagenet/alexnet/ilsvrc12.h
@@ -0,0 +1,376 @@
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+#include "singa/singa_config.h"
+#ifdef USE_OPENCV
+#include <omp.h>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <string>
+#include <thread>
+#include <vector>
+#include "singa/core/tensor.h"
+#include "singa/io/decoder.h"
+#include "singa/io/encoder.h"
+#include "singa/io/reader.h"
+#include "singa/io/transformer.h"
+#include "singa/io/writer.h"
+#include "singa/proto/io.pb.h"
+#include "singa/utils/timer.h"
+using std::string;
+using namespace singa::io;
+namespace singa {
+/// Converts ILSVRC2012 images into binary record files and loads those files
+/// back as tensors.
+/// The encoder, decoder, transformer, reader and writer held by an instance
+/// are released by its destructor.
+class ILSVRC {
+ public:
+  /// Setup encoder, decoder and transformer
+  ILSVRC();
+  ~ILSVRC() {
+    // `delete` on a null pointer does nothing; only Close() needs the guard.
+    delete encoder;
+    delete decoder;
+    delete transformer;
+    if (reader != nullptr) {
+      reader->Close();
+      delete reader;
+    }
+    if (writer != nullptr) {
+      writer->Close();
+      delete writer;
+    }
+  }
+  /// Create binary files for training data
+  /// train_image_list: list file of training images
+  /// train_image_folder: folder where the original training images are stored
+  /// train_bin_folder: folder to store binary files
+  /// train_file_size: number of images that are contained in one binary file
+  void CreateTrainData(string train_image_list, string train_image_folder,
+                       string train_bin_folder, size_t train_file_size);
+  /// Create the binary file for test data
+  /// test_image_list: list file of test images
+  /// test_image_folder: folder where the original test images are stored
+  /// test_bin_folder: folder to save the binary file
+  void CreateTestData(string test_image_list, string test_image_folder,
+                      string test_bin_folder);
+  /// Load data from a binary file into the <images, labels> pair (x, y);
+  /// the data is expected to be loaded file by file.
+  /// flag: kTrain or kTest
+  /// file: binary file which stores the images
+  /// read_size: number of images to be loaded
+  /// x: output images, reshaped to {read_size, 3, kCropSize, kCropSize}
+  /// y: output labels, reshaped to {read_size}
+  /// n_read: receives the number of images which are actually read
+  /// nthreads: number of threads used to decode and transform the images
+  /// Returns the number of images which are actually read.
+  size_t LoadData(int flag, string file, size_t read_size, Tensor *x, Tensor *y,
+                  size_t *n_read, int nthreads);
+  /// A wrapper method to spawn a thread to execute LoadData() method.
+  std::thread AsyncLoadData(int flag, string file, size_t read_size, Tensor *x,
+                            Tensor *y, size_t *n_read, int nthreads);
+  /// Decode and transform the share of `images` that belongs to thread `thid`
+  /// out of `nthreads`, writing the results into x (images) and y (labels).
+  void DecodeTransform(int flag, int thid, int nthreads,
+                       vector<string *> images, Tensor *x, Tensor *y);
+  /// A wrapper method to spawn a thread to execute DecodeTransform() method.
+  std::thread AsyncDecodeTransform(int flag, int thid, int nthreads,
+                                   vector<string *> images, Tensor *x,
+                                   Tensor *y);
+  /// Read mean from path
+  void ReadMean(string path);
+ protected:
+  /// Read one image at path, resize the image
+  Tensor ReadImage(string path);
+  /// Write buff to the file in kCreate/kAppend mode
+  void Write(string outfile, singa::io::Mode mode);
+  /// Write the mean image to path
+  void WriteMean(Tensor &mean, string path);
+ private:
+  /// size for resizing
+  const size_t kImageSize = 256;
+  /// number of bytes of one resized 3-channel image
+  const size_t kImageNBytes = 3 * kImageSize * kImageSize;
+  /// size for cropping
+  const size_t kCropSize = 227;
+  /// mean image subtracted from every decoded image
+  Tensor mean;
+  /// file the reader was last opened on
+  string last_read_file = "";
+  JPGEncoder *encoder = nullptr;
+  JPGDecoder *decoder = nullptr;
+  ImageTransformer *transformer = nullptr;
+  BinFileReader *reader = nullptr;
+  BinFileWriter *writer = nullptr;
+};
+/// Configure the JPG encoder and decoder for the "CHW" image dimension order,
+/// and the transformer for kCropSize x kCropSize crops with horizontal
+/// mirroring enabled.
+ILSVRC::ILSVRC() {
+  EncoderConf en_conf;
+  en_conf.set_image_dim_order("CHW");
+  encoder = new JPGEncoder();
+  encoder->Setup(en_conf);
+  DecoderConf de_conf;
+  de_conf.set_image_dim_order("CHW");
+  decoder = new JPGDecoder();
+  decoder->Setup(de_conf);
+  TransformerConf trans_conf;
+  // The crop shape is given as two entries, both kCropSize.
+  trans_conf.add_crop_shape(kCropSize);
+  trans_conf.add_crop_shape(kCropSize);
+  trans_conf.set_image_dim_order("CHW");
+  trans_conf.set_horizontal_mirror(true);
+  transformer = new ImageTransformer();
+  transformer->Setup(trans_conf);
+}
+/// Read the image at `path` as a colour image, resize it to
+/// kImageSize x kImageSize and return it as a kUChar tensor in CHW order.
+/// The CHECK fails when OpenCV cannot load the file.
+Tensor ILSVRC::ReadImage(string path) {
+  cv::Mat mat = cv::imread(path, CV_LOAD_IMAGE_COLOR);
+  CHECK(mat.data != NULL) << "OpenCV load image fail: " << path;
+  cv::Size size(kImageSize, kImageSize);
+  cv::Mat resized;
+  cv::resize(mat, resized, size);
+  CHECK_EQ((size_t)resized.size().height, kImageSize);
+  CHECK_EQ((size_t)resized.size().width, kImageSize);
+  // dimension_order: CHW
+  Shape shape{(size_t)resized.channels(), (size_t)resized.rows,
+              (size_t)resized.cols};
+  Tensor image(shape, singa::kUChar);
+  // The buffer is released automatically, also on an early exit.
+  std::vector<unsigned char> data(kImageNBytes);
+  for (int i = 0; i < resized.rows; i++) {
+    for (int j = 0; j < resized.cols; j++) {
+      // Each pixel holds its channel values together; spread them over the
+      // per-channel planes of the output buffer.
+      const cv::Vec3b &pixel = resized.at<cv::Vec3b>(i, j);
+      for (int k = 0; k < resized.channels(); k++)
+        data[k * kImageSize * kImageSize + i * kImageSize + j] = pixel[k];
+    }
+  }
+  image.CopyDataFromHostPtr<unsigned char>(data.data(), kImageNBytes);
+  return image;
+}
+/// Encode `mean` together with a one-element kInt tensor and write the result
+/// as a single record to the file at `path`, opened in kCreate mode.
+void ILSVRC::WriteMean(Tensor &mean, string path) {
+  // NOTE(review): mean_lb is never given a value before it is encoded; it
+  // presumably only exists so the record has the <image, label> layout used
+  // for the images - confirm.
+  Tensor mean_lb(Shape{1}, kInt);
+  std::vector<Tensor> input;
+  input.push_back(mean);
+  input.push_back(mean_lb);
+  BinFileWriter bfwriter;
+  bfwriter.Open(path, kCreate);
+  // The path doubles as the key of the record.
+  bfwriter.Write(path, encoder->Encode(input));
+  bfwriter.Flush();
+  bfwriter.Close();
+}
+/// Build the training binary files and the mean image.
+/// Reads "<image file> <label>" pairs from `image_list`, shuffles them, and
+/// writes every image under `input_folder` together with its label into
+/// `output_folder`/train<N>.bin, `file_size` images per file (0 puts all
+/// images into one file). The per-byte mean over all images is written to
+/// `output_folder`/mean.bin.
+void ILSVRC::CreateTrainData(string image_list, string input_folder,
+                             string output_folder, size_t file_size = 12800) {
+  std::vector<std::pair<string, int>> file_list;
+  string image_file_name;
+  int label;
+  std::ifstream image_list_file(image_list.c_str(), std::ios::in);
+  while (image_list_file >> image_file_name >> label)
+    file_list.push_back(std::make_pair(image_file_name, label));
+  // With no images, the file count and the mean below would divide by zero.
+  CHECK(!file_list.empty()) << "No training images listed in " << image_list;
+  LOG(INFO) << "Data Shuffling";
+  // The engine is default-constructed, so the order is the same on every run.
+  std::shuffle(file_list.begin(), file_list.end(),
+               std::default_random_engine());
+  LOG(INFO) << "Total number of training images is " << file_list.size();
+  const size_t num_train_images = file_list.size();
+  if (file_size == 0) file_size = num_train_images;
+  // Per-byte totals over all images, kept in size_t to leave room for the sum.
+  std::vector<size_t> sum(kImageNBytes, 0u);
+  string outfile;
+  for (size_t imageid = 0; imageid < num_train_images; imageid++) {
+    string path = input_folder + "/" + file_list[imageid].first;
+    Tensor image = ReadImage(path);
+    auto image_data = image.data<unsigned char>();
+    for (size_t i = 0; i < kImageNBytes; i++)
+      sum[i] += static_cast<size_t>(image_data[i]);
+    label = file_list[imageid].second;
+    Tensor lb(Shape{1}, kInt);
+    lb.CopyDataFromHostPtr<int>(&label, 1);
+    std::vector<Tensor> input;
+    input.push_back(image);
+    input.push_back(lb);
+    string encoded_str = encoder->Encode(input);
+    // A null writer means the previous file is complete: start the next one.
+    if (writer == nullptr) {
+      writer = new BinFileWriter();
+      outfile = output_folder + "/train" +
+                std::to_string(imageid / file_size + 1) + ".bin";
+      writer->Open(outfile, kCreate);
+    }
+    writer->Write(path, encoded_str);
+    if ((imageid + 1) % file_size == 0) {
+      writer->Flush();
+      writer->Close();
+      LOG(INFO) << "Write " << file_size << " images into " << outfile;
+      delete writer;
+      writer = nullptr;
+    }
+  }
+  // Close the last file when it holds fewer than file_size images.
+  if (writer != nullptr) {
+    writer->Flush();
+    writer->Close();
+    LOG(INFO) << "Write " << num_train_images % file_size << " images into "
+              << outfile;
+    delete writer;
+    writer = nullptr;
+  }
+  size_t num_file =
+      num_train_images / file_size + ((num_train_images % file_size) ? 1 : 0);
+  LOG(INFO) << "Write " << num_train_images << " images into " << num_file
+            << " binary files";
+  Tensor mean = Tensor(Shape{3, kImageSize, kImageSize}, kUChar);
+  std::vector<unsigned char> mean_data(kImageNBytes);
+  for (size_t i = 0; i < kImageNBytes; i++)
+    mean_data[i] = static_cast<unsigned char>(sum[i] / num_train_images);
+  mean.CopyDataFromHostPtr<unsigned char>(mean_data.data(), kImageNBytes);
+  string mean_path = output_folder + "/mean.bin";
+  WriteMean(mean, mean_path);
+}
+/// Build the test binary file.
+/// Reads "<image file> <label>" pairs from `image_list` and writes every image
+/// under `input_folder` together with its label, in list order, into
+/// `output_folder`/test.bin.
+void ILSVRC::CreateTestData(string image_list, string input_folder,
+                            string output_folder) {
+  std::vector<std::pair<string, int>> file_list;
+  string image_file_name;
+  string outfile = output_folder + "/test.bin";
+  int label;
+  std::ifstream image_list_file(image_list.c_str(), std::ios::in);
+  while (image_list_file >> image_file_name >> label)
+    file_list.push_back(std::make_pair(image_file_name, label));
+  LOG(INFO) << "Total number of test images is " << file_list.size();
+  size_t num_test_images = file_list.size();
+  for (size_t imageid = 0; imageid < num_test_images; imageid++) {
+    string path = input_folder + "/" + file_list[imageid].first;
+    Tensor image = ReadImage(path);
+    label = file_list[imageid].second;
+    Tensor lb(Shape{1}, singa::kInt);
+    lb.CopyDataFromHostPtr<int>(&label, 1);
+    std::vector<Tensor> input;
+    input.push_back(image);
+    input.push_back(lb);
+    string encoded_str = encoder->Encode(input);
+    // The output file is opened when the first image is about to be written.
+    if (writer == nullptr) {
+      writer = new BinFileWriter();
+      writer->Open(outfile, kCreate);
+    }
+    writer->Write(path, encoded_str);
+  }
+  if (writer != nullptr) {
+    writer->Flush();
+    writer->Close();
+    delete writer;
+    writer = nullptr;
+  }
+  LOG(INFO) << "Write " << num_test_images << " images into " << outfile;
+}
+/// Read one record from the binary file at `path`, decode it and keep the
+/// first decoded tensor as the mean image.
+void ILSVRC::ReadMean(string path) {
+  BinFileReader bfreader;
+  string key, value;
+  bfreader.Open(path);
+  // NOTE(review): the result of Read() is not checked - confirm that a
+  // missing or empty mean file cannot reach this point.
+  bfreader.Read(&key, &value);
+  auto ret = decoder->Decode(value);
+  bfreader.Close();
+  mean = ret[0];
+}
+/// Start a thread that runs LoadData() with the given arguments and return
+/// it. The value returned by LoadData() is discarded; the number of images
+/// read is available through `n_read`.
+std::thread ILSVRC::AsyncLoadData(int flag, string file, size_t read_size,
+                                  Tensor *x, Tensor *y, size_t *n_read,
+                                  int nthreads) {
+  return std::thread(
+      [=]() { LoadData(flag, file, read_size, x, y, n_read, nthreads); });
+}
+/// Load up to `read_size` images and labels from the binary file `file`.
+/// `x` is reshaped to {read_size, 3, kCropSize, kCropSize} and `y` to
+/// {read_size}. Consecutive calls with the same `file` continue where the
+/// previous call stopped. The images are decoded and transformed by
+/// `nthreads` threads, including the calling one.
+/// Returns the number of images actually read, which is also stored in
+/// `*n_read`.
+size_t ILSVRC::LoadData(int flag, string file, size_t read_size, Tensor *x,
+                        Tensor *y, size_t *n_read, int nthreads) {
+  x->Reshape(Shape{read_size, 3, kCropSize, kCropSize});
+  y->AsType(kInt);
+  y->Reshape(Shape{read_size});
+  // A reader that is open on another file is dropped first.
+  if (file != last_read_file && reader != nullptr) {
+    reader->Close();
+    delete reader;
+    reader = nullptr;
+  }
+  if (reader == nullptr) {
+    reader = new BinFileReader();
+    reader->Open(file, 100 << 20);
+    last_read_file = file;
+  }
+  vector<string *> images;
+  for (size_t i = 0; i < read_size; i++) {
+    string image_path;
+    string *image = new string();
+    if (!reader->Read(&image_path, image)) {
+      // Nothing left to read: free the buffer allocated for this record and
+      // drop the reader, so that the next call opens the file again.
+      delete image;
+      reader->Close();
+      delete reader;
+      reader = nullptr;
+      break;
+    }
+    images.push_back(image);
+  }
+  int nimg = images.size();
+  *n_read = nimg;
+  // Threads 1..nthreads-1 are spawned; the calling thread acts as thread 0.
+  vector<std::thread> threads;
+  for (int i = 1; i < nthreads; i++) {
+    threads.push_back(AsyncDecodeTransform(flag, i, nthreads, images, x, y));
+  }
+  DecodeTransform(flag, 0, nthreads, images, x, y);
+  for (size_t i = 0; i < threads.size(); i++) threads[i].join();
+  // The encoded strings are no longer needed once every thread has finished.
+  for (int k = 0; k < nimg; k++) delete images[k];
+  return nimg;
+}
+/// Start a thread that runs DecodeTransform() with the given arguments and
+/// return it. The lambda captures by copy, so the thread works on its own
+/// copy of the `images` pointer vector.
+std::thread ILSVRC::AsyncDecodeTransform(int flag, int thid, int nthreads,
+                                         vector<string *> images, Tensor *x,
+                                         Tensor *y) {
+  return std::thread(
+      [=]() { DecodeTransform(flag, thid, nthreads, images, x, y); });
+}
+/// Decode and transform the images assigned to thread `thid`.
+/// Each thread handles nimg / nthreads consecutive images starting at
+/// nimg / nthreads * thid; thread 0 also handles the images left over by the
+/// integer division. For image k, the decoded image minus the mean is
+/// transformed and copied into `x` at offset k * (image size), and its label
+/// into `y` at offset k.
+void ILSVRC::DecodeTransform(int flag, int thid, int nthreads,
+                             vector<string *> images, Tensor *x, Tensor *y) {
+  // Processes the single image with index k.
+  auto process = [&](int k) {
+    std::vector<Tensor> pair = decoder->Decode(*images[k]);
+    auto tmp_image = pair[0] - mean;
+    Tensor aug_image = transformer->Apply(flag, tmp_image);
+    CopyDataToFrom(x, aug_image, aug_image.Size(), k * aug_image.Size());
+    CopyDataToFrom(y, pair[1], 1, k);
+  };
+  int nimg = images.size();
+  int start = nimg / nthreads * thid;
+  int end = start + nimg / nthreads;
+  for (int k = start; k < end; k++) process(k);
+  if (thid == 0) {
+    for (int k = nimg / nthreads * nthreads; k < nimg; k++) process(k);
+  }
+}
+}  // namespace singa
+#endif  // USE_OPENCV
diff --git a/examples/imagenet/alexnet/ b/examples/imagenet/alexnet/
new file mode 100755
index 0000000..6277d23
--- /dev/null
+++ b/examples/imagenet/alexnet/
@@ -0,0 +1,21 @@
+#!/usr/bin/env sh
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# See the License for the specific language governing permissions and
+# limitations under the License.
+../../build/bin/imagenet -epoch 90 -lr 0.01 -batchsize 256 -filesize 1280 -ntrain 1281167 -ntest 50000 \
+  -data "imagenet_data" -pfreq 100 -nthreads 12
diff --git a/examples/imagenet/ b/examples/imagenet/
deleted file mode 100755
index 4c2c034..0000000
--- a/examples/imagenet/
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env sh
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# See the License for the specific language governing permissions and
-# limitations under the License.
-../../build/bin/createdata -trainlist "imagenet/label/train.txt" -trainfolder "imagenet/ILSVRC2012_img_train" \
-  -testlist "imagenet/label/val.txt" -testfolder "imagenet/ILSVRC2012_img_val" -outdata "imagenet_data" -filesize 1280
diff --git a/examples/imagenet/googlenet/ b/examples/imagenet/googlenet/
new file mode 100644
index 0000000..e597fc6
--- /dev/null
+++ b/examples/imagenet/googlenet/
@@ -0,0 +1,66 @@
+name: GoogleNet on ImageNet
+SINGA version: 1.0.1
+SINGA commit: 8c990f7da2de220e8a012c6a8ecc897dc7532744
+parameter_sha1: 0a88e8948b1abca3badfd8d090d6be03f8d7655d
+license: unrestricted
+# Image Classification using GoogleNet
+In this example, we convert GoogleNet trained on Caffe to SINGA for image classification.
+## Instructions
+* Download the parameter checkpoint file into this folder
+        $ wget
+        $ tar xvf bvlc_googlenet.tar.gz
+* Run the program
+        # use cpu
+        $ python -C &
+        # use gpu
+        $ python &
+* Submit images for classification
+        $ curl -i -F image=@image1.jpg http://localhost:9999/api
+        $ curl -i -F image=@image2.jpg http://localhost:9999/api
+        $ curl -i -F image=@image3.jpg http://localhost:9999/api
+image1.jpg, image2.jpg and image3.jpg should be downloaded before executing the above commands.
+## Details
+We first extract the parameter values from Caffe's checkpoint file (`bvlc_googlenet.caffemodel`) into a pickle version.
+After downloading the checkpoint file into the `caffe_root/python` folder, run the following script:
+    # to be executed within caffe_root/python folder
+    import caffe
+    import numpy as np
+    import cPickle as pickle
+    model_def = '../models/bvlc_googlenet/deploy.prototxt'
+    weight = 'bvlc_googlenet.caffemodel'  # must be downloaded at first
+    net = caffe.Net(model_def, weight, caffe.TEST)
+    params = {}
+    for layer_name in net.params.keys():
+        weights=np.copy(net.params[layer_name][0].data)
+        bias=np.copy(net.params[layer_name][1].data)
+        params[layer_name+'_weight']=weights
+        params[layer_name+'_bias']=bias
+        print layer_name, weights.shape, bias.shape
+    with open('bvlc_googlenet.pickle', 'wb') as fd:
+        pickle.dump(params, fd)
+Then we construct the GoogleNet using SINGA's FeedForwardNet structure.
+Note that we added an EndPadding layer to resolve the discrepancy between
+the rounding strategies of the pooling layer in Caffe (ceil) and cuDNN (floor).
+Only the MaxPooling layers outside inception blocks have this problem.
+Refer to [this]( for more details.
diff --git a/examples/imagenet/googlenet/ b/examples/imagenet/googlenet/
new file mode 100644
index 0000000..57e005d
--- /dev/null
+++ b/examples/imagenet/googlenet/
@@ -0,0 +1,240 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+'''This model is created following the Caffe implementation of GoogleNet.'''
+import os
+import sys
+import time
+import numpy as np
+import threading
+import traceback
+from argparse import ArgumentParser
+from scipy.misc import imread, imresize
+import numpy as np
+from singa.layer import Layer, Conv2D, Activation, MaxPooling2D, AvgPooling2D,\
+        Split, Concat, LRN, Dropout, Flatten, Dense
+from singa import layer
+from singa import net as ffnet
+from singa import device
+from singa import tensor
+from rafiki.agent import Agent, MsgType
+def add_to_tuple(x):
+    '''return a tuple with the last two values incremented by 1'''
+    if len(x) == 3:
+        return (x[0], x[1] + 1, x[2] + 1)
+    else:
+        return (x[0], x[1], x[2] + 1, x[3] + 1)
+class EndPadding(Layer):
+    '''Pad the end of the spatial axis with 1 row and 1 column of zeros.
+    This layer is inserted before the pooling layers outside the inception
+    block. We need such a layer because Caffe (ceil) and cuDNN (floor) have
+    different rounding strategies for the pooling layer.
+    '''
+    def __init__(self, name, input_sample_shape=None):
+        super(EndPadding, self).__init__(name)
+        if input_sample_shape is not None:
+            assert len(input_sample_shape) == 3, 'input must has 4 dims'
+            self.output_sample_shape = add_to_tuple(input_sample_shape)
+    def get_output_sample_shape(self):
+        return self.output_sample_shape
+    def setup(self, input_sample_shape):
+        assert len(input_sample_shape) == 3, 'input must has 4 dims'
+        self.output_sample_shape = add_to_tuple(input_sample_shape)
+        self.has_setup = True
+    def forward(self, flag, x):
+        '''pad zeros'''
+        tmp = tensor.to_numpy(x)
+        shape = add_to_tuple(x.shape)
+        ret = np.zeros(shape)
+        ret[:,:,:-1, :-1] = tmp
+        y = tensor.from_numpy(ret)
+        y.to_device(x.device)
+        return y
+    def backward(self, falg, dy):
+        '''remove paddings'''
+        tmp = tensor.to_numpy(dy)
+        dx = tensor.from_numpy(tmp[:,:,:-1,:-1])
+        dx.to_device(dy.device)
+        return dx, []
+# b_specs = {'init': 'constant', 'value': 0, 'lr_mult': 2, 'decay_mult': 0}
+def conv(net, src, name, num, kernel, stride=1, pad=0, suffix=''):
+    net.add(Conv2D('%s/%s' % (name, suffix), num, kernel, stride, pad=pad), src)
+    return net.add(Activation('%s/relue_%s' % (name, suffix)))
+def pool(net, src, name, kernel, stride):
+    net.add(EndPadding('%s/pad' % name), src)
+    ret = net.add(MaxPooling2D('%s' % name, 3, 2, pad=0))
+    return ret
+def inception(net, src, name, nb1x1, nb3x3r, nb3x3, nb5x5r, nb5x5, nbproj):
+    split = net.add(Split('%s/split' % name, 4), src)
+    c1x1 = conv(net, split, name, nb1x1, 1, suffix='1x1')
+    c3x3r = conv(net, split, name, nb3x3r, 1, suffix='3x3_reduce')
+    c3x3 = conv(net, c3x3r, name, nb3x3, 3, pad=1, suffix='3x3')
+    c5x5r = conv(net, split, name, nb5x5r, 1, suffix='5x5_reduce')
+    c5x5 = conv(net, c5x5r, name, nb5x5, 5, pad=2, suffix='5x5')
+    pool = net.add(MaxPooling2D('%s/pool' % name, 3, 1, pad=1), split)
+    cproj = conv(net, pool, name, nbproj, 1, suffix='pool_proj')
+    return net.add(Concat('%s/output' % name, 1), [c1x1, c3x3, c5x5, cproj])
+def create_net(shape, weight_path='bvlc_googlenet.pickle'):
+    '''Build GoogleNet as a FeedForwardNet for input samples of `shape`, load
+    the parameters from the pickle file at `weight_path`, and return the net.
+    '''
+    net = ffnet.FeedForwardNet()
+    # Layers added without a source presumably attach to the layer added just
+    # before them - confirm against FeedForwardNet.add.
+    net.add(Conv2D('conv1/7x7_s2', 64, 7, 2, pad=3, input_sample_shape=shape))
+    c1 = net.add(Activation('conv1/relu_7x7'))
+    pool1 = pool(net, c1, 'pool1/3x3_s2', 3, 2)
+    norm1 = net.add(LRN('pool1/norm1', 5, 0.0001, 0.75))
+    c3x3r = conv(net, norm1 , 'conv2', 64, 1, suffix='3x3_reduce')
+    c3x3 = conv(net, c3x3r, 'conv2', 192, 3, pad=1, suffix='3x3')
+    norm2 = net.add(LRN('conv2/norm2', 5, 0.0001, 0.75))
+    pool2 = pool(net, norm2, 'pool2/3x3_s2', 3, 2)
+    # Inception blocks 3a-3b, 4a-4e and 5a-5b, with a padded max pooling
+    # between the groups.
+    i3a=inception(net, pool2, 'inception_3a', 64, 96, 128, 16, 32, 32)
+    i3b=inception(net, i3a, 'inception_3b', 128, 128, 192, 32, 96, 64)
+    pool3=pool(net, i3b, 'pool3/3x3_s2', 3, 2)
+    i4a=inception(net, pool3, 'inception_4a', 192, 96, 208, 16, 48, 64)
+    i4b=inception(net, i4a, 'inception_4b', 160, 112, 224, 24, 64, 64)
+    i4c=inception(net, i4b, 'inception_4c', 128, 128, 256, 24, 64, 64)
+    i4d=inception(net, i4c, 'inception_4d', 112, 144, 288, 32, 64, 64)
+    i4e=inception(net, i4d, 'inception_4e', 256, 160, 320, 32, 128, 128)
+    pool4=pool(net, i4e,'pool4/3x3_s2', 3, 2)
+    i5a=inception(net, pool4, 'inception_5a', 256, 160, 320, 32, 128, 128)
+    i5b=inception(net, i5a, 'inception_5b', 384, 192, 384, 48, 128, 128)
+    # Classifier head; the values returned by these net.add calls are not used.
+    pool5=net.add(AvgPooling2D('pool5/7x7_s1', 7, 1, pad=0))
+    drop5=net.add(Dropout('drop', 0.4))
+    flat=net.add(Flatten('flat'))
+    dense=net.add(Dense('loss3/classifier', 1000))
+    # prob=net.add(Softmax('softmax'))
+    net.load(weight_path, use_pickle=True)
+    print 'total num of params %d' % (len(net.param_names()))
+    # SINGA and Caffe have different layout for the weight matrix of the dense
+    # layer
+    for key, val in zip(net.param_names(), net.param_values()):
+        # print key
+        if key == 'loss3/classifier_weight':
+            tmp = tensor.to_numpy(val)
+            # Reinterpret the loaded values with the reversed shape, then
+            # transpose, which gives an array of the original shape again.
+            tmp = tmp.reshape(tmp.shape[::-1])
+            val.copy_from_numpy(np.transpose(tmp))
+    return net
+def serve(agent, use_cpu, parameter_file, topk=5):
+    if use_cpu:
+        print 'running with cpu'
+        dev = device.get_default_device()
+        layer.engine = 'singacpp'
+    else:
+        print "runing with gpu"
+        dev = device.create_cuda_gpu()
+    agent = agent
+    print 'Start intialization............'
+    net = create_net((3, 224, 224), parameter_file)
+    net.to_device(dev)
+    print 'End intialization............'
+    labels = np.loadtxt('synset_words.txt', str, delimiter='\t ')
+    while True:
+        key, val = agent.pull()
+        if key is None:
+            time.sleep(0.1)
+            continue
+        msg_type = MsgType.parse(key)
+        if msg_type.is_request():
+            try:
+                response = ""
+                img = imread(val['image'], mode='RGB').astype(np.float32)
+                height,width = img.shape[:2]
+                img[:, :, 0] -= 123.68
+                img[:, :, 1] -= 116.779
+                img[:, :, 2] -= 103.939
+                img[:,:,[0,1,2]] = img[:,:,[2,1,0]]
+                img = img.transpose((2, 0, 1))
+                img = img[:,(height-224)//2:(height+224)//2,(width-224)//2:(width+224)//2]
+                images = np.expand_dims(img, axis=0)
+                x = tensor.from_numpy(images.astype(np.float32))
+                x.to_device(dev)
+                y = net.predict(x)
+                prob = np.average(tensor.to_numpy(y), 0)
+                # sort and reverse
+                idx = np.argsort(-prob)[0:topk]
+                for i in idx:
+                    response += "%s:%s<br/>" % (labels[i], prob[i])
+            except:
+                traceback.print_exc()
+                response = "Sorry, system error during prediction."
+            agent.push(MsgType.kResponse, response)
+        elif MsgType.kCommandStop.equal(msg_type):
+                print 'get stop command'
+                agent.push(MsgType.kStatus, "success")
+                break
+        else:
+            print 'get unsupported message %s' % str(msg_type)
+            agent.push(MsgType.kStatus, "Unknown command")
+            break
+        # while loop
+    print "server stop"
+def main():
+    try:
+        # Setup argument parser
+        parser = ArgumentParser(description="GooleNet for image classification")
+        parser.add_argument("-p", "--port", default=9999, help="listen port")
+        parser.add_argument("-C", "--use_cpu", action="store_true")
+        parser.add_argument("--parameter_file", default="bvlc_googlenet.pickle",
+                help="relative path")
+        # Process arguments
+        args = parser.parse_args()
+        port = args.port
+        # start to train
+        agent = Agent(port)
+        serve(agent, args.use_cpu, args.parameter_file)
+        agent.stop()
+    except SystemExit:
+        return
+    except:
+        traceback.print_exc()
+        sys.stderr.write("  for help use --help \n\n")
+        return 2
+if __name__ == '__main__':
+    main()
diff --git a/examples/imagenet/ b/examples/imagenet/
deleted file mode 100644
index c9e6d2f..0000000
--- a/examples/imagenet/
+++ /dev/null
@@ -1,70 +0,0 @@
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements.  See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership.  The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License.  You may obtain a copy of the License at
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* KIND, either express or implied.  See the License for the
-* specific language governing permissions and limitations
-* under the License.
-#include "singa/singa_config.h"
-#ifdef USE_OPENCV
-#include "ilsvrc12.h"
-#include "singa/utils/channel.h"
-#include "singa/utils/string.h"
-int main(int argc, char **argv) {
-  int pos = singa::ArgPos(argc, argv, "-h");
-  if (pos != -1) {
-    std::cout << "Usage:\n"
-              << "\t-trainlist <file>: the file of training list;\n"
-              << "\t-trainfolder <folder>: the folder of training images;\n"
-              << "\t-testlist <file>: the file of test list;\n"
-              << "\t-testfolder <floder>: the folder of test images;\n"
-              << "\t-outdata <folder>: the folder to save output files;\n"
-              << "\t-filesize <int>: number of training images that stores in "
-                 "each binary file.\n";
-    return 0;
-  }
-  pos = singa::ArgPos(argc, argv, "-trainlist");
-  string train_image_list = "imagenet/label/train.txt";
-  if (pos != -1) train_image_list = argv[pos + 1];
-  pos = singa::ArgPos(argc, argv, "-trainfolder");
-  string train_image_folder = "imagenet/ILSVRC2012_img_train";
-  if (pos != -1) train_image_folder = argv[pos + 1];
-  pos = singa::ArgPos(argc, argv, "-testlist");
-  string test_image_list = "imagenet/label/val.txt";
-  if (pos != -1) test_image_list = argv[pos + 1];
-  pos = singa::ArgPos(argc, argv, "-testfolder");
-  string test_image_folder = "imagenet/ILSVRC2012_img_val";
-  if (pos != -1) test_image_folder = argv[pos + 1];
-  pos = singa::ArgPos(argc, argv, "-outdata");
-  string bin_folder = "imagenet_data";
-  if (pos != -1) bin_folder = argv[pos + 1];
-  pos = singa::ArgPos(argc, argv, "-filesize");
-  size_t train_file_size = 1280;
-  if (pos != -1) train_file_size = atoi(argv[pos + 1]);
-  singa::ILSVRC data;
-  LOG(INFO) << "Creating training and test data...";
-  data.CreateTrainData(train_image_list, train_image_folder, bin_folder,
-                       train_file_size);
-  data.CreateTestData(test_image_list, test_image_folder, bin_folder);
-  LOG(INFO) << "Data created!";
-  return 0;
-#endif  // USE_OPENCV