You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@singa.apache.org by ka...@apache.org on 2017/01/24 08:52:25 UTC
[2/2] incubator-singa git commit: SINGA-295 - Add an example of image
classification using GoogleNet
SINGA-295 - Add an example of image classification using GoogleNet
Add googlenet; update concat and slice.
Fix the bug from padding layers due to difference of rounding strategies between caffe and cudnn
Move the alexnet files into examples/imagenet/alexnet
Update the prediction file and the readme file according to rafiki's format.
Users can submit multiple queries via curl.
fixed a bug in tensor.py to_numpy()
test on cpu
update setup config file to install dependent libs (flask, pillow) when
installing pysinga
Project: http://git-wip-us.apache.org/repos/asf/incubator-singa/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-singa/commit/d190fa89
Tree: http://git-wip-us.apache.org/repos/asf/incubator-singa/tree/d190fa89
Diff: http://git-wip-us.apache.org/repos/asf/incubator-singa/diff/d190fa89
Branch: refs/heads/master
Commit: d190fa89aabedb97e1e7af7a7222cadfa5187452
Parents: 8101f00
Author: Wei Wang <wa...@gmail.com>
Authored: Mon Jan 16 20:37:31 2017 +0800
Committer: Wei Wang <wa...@gmail.com>
Committed: Mon Jan 23 19:47:11 2017 +0800
----------------------------------------------------------------------
examples/CMakeLists.txt | 8 +-
examples/imagenet/CMakeLists.txt | 34 ---
examples/imagenet/README.md | 58 ----
examples/imagenet/alexnet.cc | 402 --------------------------
examples/imagenet/alexnet/CMakeLists.txt | 34 +++
examples/imagenet/alexnet/README.md | 58 ++++
examples/imagenet/alexnet/alexnet.cc | 402 ++++++++++++++++++++++++++
examples/imagenet/alexnet/create_data.sh | 21 ++
examples/imagenet/alexnet/ilsvrc12.cc | 70 +++++
examples/imagenet/alexnet/ilsvrc12.h | 376 ++++++++++++++++++++++++
examples/imagenet/alexnet/run.sh | 21 ++
examples/imagenet/create_data.sh | 21 --
examples/imagenet/googlenet/README.md | 66 +++++
examples/imagenet/googlenet/serve.py | 240 +++++++++++++++
examples/imagenet/ilsvrc12.cc | 70 -----
examples/imagenet/ilsvrc12.h | 376 ------------------------
examples/imagenet/run.sh | 21 --
include/singa/core/tensor.h | 4 +
python/setup.py.in | 5 +-
python/singa/net.py | 7 +-
python/singa/tensor.py | 4 +-
src/core/tensor/tensor.cc | 57 ++++
src/io/network/endpoint.cc | 3 +
src/model/layer/concat.cc | 32 +-
src/model/layer/concat.h | 5 +-
src/model/layer/pooling.cc | 2 +
src/model/layer/slice.cc | 32 +-
src/model/layer/slice.h | 6 +-
test/CMakeLists.txt | 4 +-
29 files changed, 1416 insertions(+), 1023 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d190fa89/examples/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 0bb6c2f..f372692 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -6,15 +6,15 @@
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
-#
+#
# http://www.apache.org/licenses/LICENSE-2.0
-#
+#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-#
+#
ADD_SUBDIRECTORY(cifar10)
-ADD_SUBDIRECTORY(imagenet)
+ADD_SUBDIRECTORY(imagenet/alexnet)
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d190fa89/examples/imagenet/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/examples/imagenet/CMakeLists.txt b/examples/imagenet/CMakeLists.txt
deleted file mode 100644
index fbb7235..0000000
--- a/examples/imagenet/CMakeLists.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
-INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/include)
-
-IF(USE_CUDNN)
- IF(USE_OPENCV)
- SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp ")
- ADD_EXECUTABLE(imagenet alexnet.cc)
- ADD_DEPENDENCIES(imagenet singa)
- TARGET_LINK_LIBRARIES(imagenet singa protobuf ${SINGA_LIBKER_LIBS})
-
- ADD_EXECUTABLE(createdata ilsvrc12.cc)
- ADD_DEPENDENCIES(createdata singa)
- TARGET_LINK_LIBRARIES(createdata singa protobuf ${SINGA_LIBKER_LIBS})
- #SET_TARGET_PROPERTIES(createdata PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
- ENDIF(USE_OPENCV)
-ENDIF(USE_CUDNN)
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d190fa89/examples/imagenet/README.md
----------------------------------------------------------------------
diff --git a/examples/imagenet/README.md b/examples/imagenet/README.md
deleted file mode 100644
index be6797c..0000000
--- a/examples/imagenet/README.md
+++ /dev/null
@@ -1,58 +0,0 @@
-# Train AlexNet over ImageNet
-
-Convolution neural network (CNN) is a type of feed-forward neural
-network widely used for image and video classification. In this example, we will
-use a [deep CNN model](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks)
-to do image classification against the ImageNet dataset.
-
-## Instructions
-
-### Compile SINGA
-
-Please compile SINGA with CUDA, CUDNN and OpenCV. You can manually turn on the
-options in CMakeLists.txt or run `ccmake ..` in build/ folder.
-
-We have tested CUDNN V4 and V5 (V5 requires CUDA 7.5)
-
-### Data download
-* Please refer to step1-3 on [Instructions to create ImageNet 2012 data](https://github.com/amd/OpenCL-caffe/wiki/Instructions-to-create-ImageNet-2012-data)
- to download and decompress the data.
-* You can download the training and validation list by
- [get_ilsvrc_aux.sh](https://github.com/BVLC/caffe/blob/master/data/ilsvrc12/get_ilsvrc_aux.sh)
- or from [Imagenet](http://www.image-net.org/download-images).
-
-### Data preprocessing
-* Assuming you have downloaded the data and the list.
- Now we should transform the data into binary files. You can run:
-
- sh create_data.sh
-
- The script will generate a test file(`test.bin`), a mean file(`mean.bin`) and
- several training files(`trainX.bin`) in the specified output folder.
-* You can also change the parameters in `create_data.sh`.
- + `-trainlist <file>`: the file of training list;
- + `-trainfolder <folder>`: the folder of training images;
- + `-testlist <file>`: the file of test list;
- + `-testfolder <floder>`: the folder of test images;
- + `-outdata <folder>`: the folder to save output files, including mean, training and test files.
- The script will generate these files in the specified folder;
- + `-filesize <int>`: number of training images that stores in each binary file.
-
-### Training
-* After preparing data, you can run the following command to train the Alexnet model.
-
- sh run.sh
-
-* You may change the parameters in `run.sh`.
- + `-epoch <int>`: number of epoch to be trained, default is 90;
- + `-lr <float>`: base learning rate, the learning rate will decrease each 20 epochs,
- more specifically, `lr = lr * exp(0.1 * (epoch / 20))`;
- + `-batchsize <int>`: batchsize, it should be changed regarding to your memory;
- + `-filesize <int>`: number of training images that stores in each binary file, it is the
- same as the `filesize` in data preprocessing;
- + `-ntrain <int>`: number of training images;
- + `-ntest <int>`: number of test images;
- + `-data <folder>`: the folder which stores the binary files, it is exactly the output
- folder in data preprocessing step;
- + `-pfreq <int>`: the frequency(in batch) of printing current model status(loss and accuracy);
- + `-nthreads <int>`: the number of threads to load data which feed to the model.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d190fa89/examples/imagenet/alexnet.cc
----------------------------------------------------------------------
diff --git a/examples/imagenet/alexnet.cc b/examples/imagenet/alexnet.cc
deleted file mode 100644
index 4ac1130..0000000
--- a/examples/imagenet/alexnet.cc
+++ /dev/null
@@ -1,402 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied. See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-
-#include "singa/singa_config.h"
-#ifdef USE_OPENCV
-#include <cmath>
-#include "./ilsvrc12.h"
-#include "singa/io/snapshot.h"
-#include "singa/model/feed_forward_net.h"
-#include "singa/model/initializer.h"
-#include "singa/model/metric.h"
-#include "singa/model/optimizer.h"
-#include "singa/utils/channel.h"
-#include "singa/utils/string.h"
-#include "singa/utils/timer.h"
-namespace singa {
-
-// currently supports 'cudnn' and 'singacpp'
-const std::string engine = "cudnn";
-LayerConf GenConvConf(string name, int nb_filter, int kernel, int stride,
- int pad, float std, float bias = .0f) {
- LayerConf conf;
- conf.set_name(name);
- conf.set_type(engine + "_convolution");
- ConvolutionConf *conv = conf.mutable_convolution_conf();
- conv->set_num_output(nb_filter);
- conv->add_kernel_size(kernel);
- conv->add_stride(stride);
- conv->add_pad(pad);
- conv->set_bias_term(true);
-
- ParamSpec *wspec = conf.add_param();
- wspec->set_name(name + "_weight");
- auto wfill = wspec->mutable_filler();
- wfill->set_type("Gaussian");
- wfill->set_std(std);
-
- ParamSpec *bspec = conf.add_param();
- bspec->set_name(name + "_bias");
- bspec->set_lr_mult(2);
- bspec->set_decay_mult(0);
- auto bfill = bspec->mutable_filler();
- bfill->set_value(bias);
- return conf;
-}
-
-LayerConf GenPoolingConf(string name, bool max_pool, int kernel, int stride,
- int pad) {
- LayerConf conf;
- conf.set_name(name);
- conf.set_type(engine + "_pooling");
- PoolingConf *pool = conf.mutable_pooling_conf();
- pool->set_kernel_size(kernel);
- pool->set_stride(stride);
- pool->set_pad(pad);
- if (!max_pool) pool->set_pool(PoolingConf_PoolMethod_AVE);
- return conf;
-}
-
-LayerConf GenReLUConf(string name) {
- LayerConf conf;
- conf.set_name(name);
- conf.set_type(engine + "_relu");
- return conf;
-}
-
-LayerConf GenDenseConf(string name, int num_output, float std, float wd,
- float bias = .0f) {
- LayerConf conf;
- conf.set_name(name);
- conf.set_type("singa_dense");
- DenseConf *dense = conf.mutable_dense_conf();
- dense->set_num_output(num_output);
-
- ParamSpec *wspec = conf.add_param();
- wspec->set_name(name + "_weight");
- wspec->set_decay_mult(wd);
- auto wfill = wspec->mutable_filler();
- wfill->set_type("Gaussian");
- wfill->set_std(std);
-
- ParamSpec *bspec = conf.add_param();
- bspec->set_name(name + "_bias");
- bspec->set_lr_mult(2);
- bspec->set_decay_mult(0);
- auto bfill = bspec->mutable_filler();
- bfill->set_value(bias);
-
- return conf;
-}
-
-LayerConf GenLRNConf(string name) {
- LayerConf conf;
- conf.set_name(name);
- conf.set_type(engine + "_lrn");
- LRNConf *lrn = conf.mutable_lrn_conf();
- lrn->set_local_size(5);
- lrn->set_alpha(1e-04);
- lrn->set_beta(0.75);
- return conf;
-}
-
-LayerConf GenFlattenConf(string name) {
- LayerConf conf;
- conf.set_name(name);
- conf.set_type("singa_flatten");
- return conf;
-}
-
-LayerConf GenDropoutConf(string name, float dropout_ratio) {
- LayerConf conf;
- conf.set_name(name);
- conf.set_type(engine + "_dropout");
- DropoutConf *dropout = conf.mutable_dropout_conf();
- dropout->set_dropout_ratio(dropout_ratio);
- return conf;
-}
-
-FeedForwardNet CreateNet() {
- FeedForwardNet net;
- Shape s{3, 227, 227};
-
- net.Add(GenConvConf("conv1", 96, 11, 4, 0, 0.01), &s);
- net.Add(GenReLUConf("relu1"));
- net.Add(GenPoolingConf("pool1", true, 3, 2, 0));
- net.Add(GenLRNConf("lrn1"));
- net.Add(GenConvConf("conv2", 256, 5, 1, 2, 0.01, 1.0));
- net.Add(GenReLUConf("relu2"));
- net.Add(GenPoolingConf("pool2", true, 3, 2, 0));
- net.Add(GenLRNConf("lrn2"));
- net.Add(GenConvConf("conv3", 384, 3, 1, 1, 0.01));
- net.Add(GenReLUConf("relu3"));
- net.Add(GenConvConf("conv4", 384, 3, 1, 1, 0.01, 1.0));
- net.Add(GenReLUConf("relu4"));
- net.Add(GenConvConf("conv5", 256, 3, 1, 1, 0.01, 1.0));
- net.Add(GenReLUConf("relu5"));
- net.Add(GenPoolingConf("pool5", true, 3, 2, 0));
- net.Add(GenFlattenConf("flat"));
- net.Add(GenDenseConf("ip6", 4096, 0.005, 1, 1.0));
- net.Add(GenReLUConf("relu6"));
- net.Add(GenDropoutConf("drop6", 0.5));
- net.Add(GenDenseConf("ip7", 4096, 0.005, 1, 1.0));
- net.Add(GenReLUConf("relu7"));
- net.Add(GenDropoutConf("drop7", 0.5));
- net.Add(GenDenseConf("ip8", 1000, 0.01, 1));
-
- return net;
-}
-
-void TrainOneEpoch(FeedForwardNet &net, ILSVRC &data,
- std::shared_ptr<Device> device, int epoch, string bin_folder,
- size_t num_train_files, size_t batchsize, float lr,
- Channel *train_ch, size_t pfreq, int nthreads) {
- float loss = 0.0f, metric = 0.0f;
- float load_time = 0.0f, train_time = 0.0f;
- size_t b = 0;
- size_t n_read;
- Timer timer, ttr;
- Tensor prefetch_x, prefetch_y;
- string binfile = bin_folder + "/train1.bin";
- timer.Tick();
- data.LoadData(kTrain, binfile, batchsize, &prefetch_x, &prefetch_y, &n_read,
- nthreads);
- load_time += timer.Elapsed();
- CHECK_EQ(n_read, batchsize);
- Tensor train_x(prefetch_x.shape(), device);
- Tensor train_y(prefetch_y.shape(), device, kInt);
- std::thread th;
- for (size_t fno = 1; fno <= num_train_files; fno++) {
- binfile = bin_folder + "/train" + std::to_string(fno) + ".bin";
- while (true) {
- if (th.joinable()) {
- th.join();
- load_time += timer.Elapsed();
- // LOG(INFO) << "num of samples: " << n_read;
- if (n_read < batchsize) {
- if (n_read > 0) {
- LOG(WARNING) << "Pls set batchsize to make num_total_samples "
- << "% batchsize == 0. Otherwise, the last " << n_read
- << " samples would not be used";
- }
- break;
- }
- }
- if (n_read == batchsize) {
- train_x.CopyData(prefetch_x);
- train_y.CopyData(prefetch_y);
- }
- timer.Tick();
- th = data.AsyncLoadData(kTrain, binfile, batchsize, &prefetch_x,
- &prefetch_y, &n_read, nthreads);
- if (n_read < batchsize) continue;
- CHECK_EQ(train_x.shape(0), train_y.shape(0));
- ttr.Tick();
- auto ret = net.TrainOnBatch(epoch, train_x, train_y);
- train_time += ttr.Elapsed();
- loss += ret.first;
- metric += ret.second;
- b++;
- }
- if (b % pfreq == 0) {
- train_ch->Send(
- "Epoch " + std::to_string(epoch) + ", training loss = " +
- std::to_string(loss / b) + ", accuracy = " +
- std::to_string(metric / b) + ", lr = " + std::to_string(lr) +
- ", time of loading " + std::to_string(batchsize) + " images = " +
- std::to_string(load_time / b) +
- " ms, time of training (batchsize = " + std::to_string(batchsize) +
- ") = " + std::to_string(train_time / b) + " ms.");
- loss = 0.0f;
- metric = 0.0f;
- load_time = 0.0f;
- train_time = 0.0f;
- b = 0;
- }
- }
-}
-
-void TestOneEpoch(FeedForwardNet &net, ILSVRC &data,
- std::shared_ptr<Device> device, int epoch, string bin_folder,
- size_t num_test_images, size_t batchsize, Channel *val_ch,
- int nthreads) {
- float loss = 0.0f, metric = 0.0f;
- float load_time = 0.0f, eval_time = 0.0f;
- size_t n_read;
- string binfile = bin_folder + "/test.bin";
- Timer timer, tte;
- Tensor prefetch_x, prefetch_y;
- timer.Tick();
- data.LoadData(kEval, binfile, batchsize, &prefetch_x, &prefetch_y, &n_read,
- nthreads);
- load_time += timer.Elapsed();
- Tensor test_x(prefetch_x.shape(), device);
- Tensor test_y(prefetch_y.shape(), device, kInt);
- int remain = (int)num_test_images - n_read;
- CHECK_EQ(n_read, batchsize);
- std::thread th;
- while (true) {
- if (th.joinable()) {
- th.join();
- load_time += timer.Elapsed();
- remain -= n_read;
- if (remain < 0) break;
- if (n_read < batchsize) break;
- }
- test_x.CopyData(prefetch_x);
- test_y.CopyData(prefetch_y);
- timer.Tick();
- th = data.AsyncLoadData(kEval, binfile, batchsize, &prefetch_x, &prefetch_y,
- &n_read, nthreads);
-
- CHECK_EQ(test_x.shape(0), test_y.shape(0));
- tte.Tick();
- auto ret = net.EvaluateOnBatch(test_x, test_y);
- eval_time += tte.Elapsed();
- ret.first.ToHost();
- ret.second.ToHost();
- loss += Sum(ret.first);
- metric += Sum(ret.second);
- }
- loss /= num_test_images;
- metric /= num_test_images;
- val_ch->Send("Epoch " + std::to_string(epoch) + ", val loss = " +
- std::to_string(loss) + ", accuracy = " + std::to_string(metric) +
- ", time of loading " + std::to_string(num_test_images) +
- " images = " + std::to_string(load_time) +
- " ms, time of evaluating " + std::to_string(num_test_images) +
- " images = " + std::to_string(eval_time) + " ms.");
-}
-
-void Checkpoint(FeedForwardNet &net, string prefix) {
- Snapshot snapshot(prefix, Snapshot::kWrite, 200);
- auto names = net.GetParamNames();
- auto values = net.GetParamValues();
- for (size_t k = 0; k < names.size(); k++) {
- values.at(k).ToHost();
- snapshot.Write(names.at(k), values.at(k));
- }
- LOG(INFO) << "Write snapshot into " << prefix;
-}
-
-void Train(int num_epoch, float lr, size_t batchsize, size_t train_file_size,
- string bin_folder, size_t num_train_images, size_t num_test_images,
- size_t pfreq, int nthreads) {
- ILSVRC data;
- data.ReadMean(bin_folder + "/mean.bin");
- auto net = CreateNet();
- auto cuda = std::make_shared<CudaGPU>(0);
- net.ToDevice(cuda);
- SGD sgd;
- OptimizerConf opt_conf;
- opt_conf.set_momentum(0.9);
- auto reg = opt_conf.mutable_regularizer();
- reg->set_coefficient(0.0005);
- sgd.Setup(opt_conf);
- sgd.SetLearningRateGenerator(
- [lr](int epoch) { return lr * std::pow(0.1, epoch / 20); });
-
- SoftmaxCrossEntropy loss;
- Accuracy acc;
- net.Compile(true, &sgd, &loss, &acc);
-
- Channel *train_ch = GetChannel("train_perf");
- train_ch->EnableDestStderr(true);
- Channel *val_ch = GetChannel("val_perf");
- val_ch->EnableDestStderr(true);
- size_t num_train_files = num_train_images / train_file_size +
- (num_train_images % train_file_size ? 1 : 0);
- for (int epoch = 0; epoch < num_epoch; epoch++) {
- float epoch_lr = sgd.GetLearningRate(epoch);
- TrainOneEpoch(net, data, cuda, epoch, bin_folder, num_train_files,
- batchsize, epoch_lr, train_ch, pfreq, nthreads);
- if (epoch % 10 == 0 && epoch > 0) {
- string prefix = "snapshot_epoch" + std::to_string(epoch);
- Checkpoint(net, prefix);
- }
- TestOneEpoch(net, data, cuda, epoch, bin_folder, num_test_images, batchsize,
- val_ch, nthreads);
- }
-}
-}
-
-int main(int argc, char **argv) {
- singa::InitChannel(nullptr);
- int pos = singa::ArgPos(argc, argv, "-h");
- if (pos != -1) {
- std::cout << "Usage:\n"
- << "\t-epoch <int>: number of epoch to be trained, default is 90;\n"
- << "\t-lr <float>: base learning rate;\n"
- << "\t-batchsize <int>: batchsize, it should be changed regarding "
- "to your memory;\n"
- << "\t-filesize <int>: number of training images that stores in "
- "each binary file;\n"
- << "\t-ntrain <int>: number of training images;\n"
- << "\t-ntest <int>: number of test images;\n"
- << "\t-data <folder>: the folder which stores the binary files;\n"
- << "\t-pfreq <int>: the frequency(in batch) of printing current "
- "model status(loss and accuracy);\n"
- << "\t-nthreads <int>`: the number of threads to load data which "
- "feed to the model.\n";
- return 0;
- }
- pos = singa::ArgPos(argc, argv, "-epoch");
- int nEpoch = 90;
- if (pos != -1) nEpoch = atoi(argv[pos + 1]);
-
- pos = singa::ArgPos(argc, argv, "-lr");
- float lr = 0.01;
- if (pos != -1) lr = atof(argv[pos + 1]);
-
- pos = singa::ArgPos(argc, argv, "-batchsize");
- int batchsize = 256;
- if (pos != -1) batchsize = atof(argv[pos + 1]);
-
- pos = singa::ArgPos(argc, argv, "-filesize");
- size_t train_file_size = 1280;
- if (pos != -1) train_file_size = atoi(argv[pos + 1]);
-
- pos = singa::ArgPos(argc, argv, "-ntrain");
- size_t num_train_images = 1281167;
- if (pos != -1) num_train_images = atoi(argv[pos + 1]);
-
- pos = singa::ArgPos(argc, argv, "-ntest");
- size_t num_test_images = 50000;
- if (pos != -1) num_test_images = atoi(argv[pos + 1]);
-
- pos = singa::ArgPos(argc, argv, "-data");
- string bin_folder = "imagenet_data";
- if (pos != -1) bin_folder = argv[pos + 1];
-
- pos = singa::ArgPos(argc, argv, "-pfreq");
- size_t pfreq = 100;
- if (pos != -1) pfreq = atoi(argv[pos + 1]);
-
- pos = singa::ArgPos(argc, argv, "-nthreads");
- int nthreads = 12;
- if (pos != -1) nthreads = atoi(argv[pos + 1]);
-
- LOG(INFO) << "Start training";
- singa::Train(nEpoch, lr, batchsize, train_file_size, bin_folder,
- num_train_images, num_test_images, pfreq, nthreads);
- LOG(INFO) << "End training";
-}
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d190fa89/examples/imagenet/alexnet/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/examples/imagenet/alexnet/CMakeLists.txt b/examples/imagenet/alexnet/CMakeLists.txt
new file mode 100644
index 0000000..fbb7235
--- /dev/null
+++ b/examples/imagenet/alexnet/CMakeLists.txt
@@ -0,0 +1,34 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/include)
+
+IF(USE_CUDNN)
+ IF(USE_OPENCV)
+ SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp ")
+ ADD_EXECUTABLE(imagenet alexnet.cc)
+ ADD_DEPENDENCIES(imagenet singa)
+ TARGET_LINK_LIBRARIES(imagenet singa protobuf ${SINGA_LIBKER_LIBS})
+
+ ADD_EXECUTABLE(createdata ilsvrc12.cc)
+ ADD_DEPENDENCIES(createdata singa)
+ TARGET_LINK_LIBRARIES(createdata singa protobuf ${SINGA_LIBKER_LIBS})
+ #SET_TARGET_PROPERTIES(createdata PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+ ENDIF(USE_OPENCV)
+ENDIF(USE_CUDNN)
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d190fa89/examples/imagenet/alexnet/README.md
----------------------------------------------------------------------
diff --git a/examples/imagenet/alexnet/README.md b/examples/imagenet/alexnet/README.md
new file mode 100644
index 0000000..be6797c
--- /dev/null
+++ b/examples/imagenet/alexnet/README.md
@@ -0,0 +1,58 @@
+# Train AlexNet over ImageNet
+
+Convolution neural network (CNN) is a type of feed-forward neural
+network widely used for image and video classification. In this example, we will
+use a [deep CNN model](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks)
+to do image classification against the ImageNet dataset.
+
+## Instructions
+
+### Compile SINGA
+
+Please compile SINGA with CUDA, CUDNN and OpenCV. You can manually turn on the
+options in CMakeLists.txt or run `ccmake ..` in build/ folder.
+
+We have tested CUDNN V4 and V5 (V5 requires CUDA 7.5)
+
+### Data download
+* Please refer to step1-3 on [Instructions to create ImageNet 2012 data](https://github.com/amd/OpenCL-caffe/wiki/Instructions-to-create-ImageNet-2012-data)
+ to download and decompress the data.
+* You can download the training and validation list by
+ [get_ilsvrc_aux.sh](https://github.com/BVLC/caffe/blob/master/data/ilsvrc12/get_ilsvrc_aux.sh)
+ or from [Imagenet](http://www.image-net.org/download-images).
+
+### Data preprocessing
+* Assuming you have downloaded the data and the list.
+ Now we should transform the data into binary files. You can run:
+
+ sh create_data.sh
+
+ The script will generate a test file(`test.bin`), a mean file(`mean.bin`) and
+ several training files(`trainX.bin`) in the specified output folder.
+* You can also change the parameters in `create_data.sh`.
+ + `-trainlist <file>`: the file of training list;
+ + `-trainfolder <folder>`: the folder of training images;
+ + `-testlist <file>`: the file of test list;
+ + `-testfolder <floder>`: the folder of test images;
+ + `-outdata <folder>`: the folder to save output files, including mean, training and test files.
+ The script will generate these files in the specified folder;
+ + `-filesize <int>`: number of training images that stores in each binary file.
+
+### Training
+* After preparing data, you can run the following command to train the Alexnet model.
+
+ sh run.sh
+
+* You may change the parameters in `run.sh`.
+ + `-epoch <int>`: number of epoch to be trained, default is 90;
+ + `-lr <float>`: base learning rate, the learning rate will decrease each 20 epochs,
+ more specifically, `lr = lr * exp(0.1 * (epoch / 20))`;
+ + `-batchsize <int>`: batchsize, it should be changed regarding to your memory;
+ + `-filesize <int>`: number of training images that stores in each binary file, it is the
+ same as the `filesize` in data preprocessing;
+ + `-ntrain <int>`: number of training images;
+ + `-ntest <int>`: number of test images;
+ + `-data <folder>`: the folder which stores the binary files, it is exactly the output
+ folder in data preprocessing step;
+ + `-pfreq <int>`: the frequency(in batch) of printing current model status(loss and accuracy);
+ + `-nthreads <int>`: the number of threads to load data which feed to the model.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d190fa89/examples/imagenet/alexnet/alexnet.cc
----------------------------------------------------------------------
diff --git a/examples/imagenet/alexnet/alexnet.cc b/examples/imagenet/alexnet/alexnet.cc
new file mode 100644
index 0000000..4ac1130
--- /dev/null
+++ b/examples/imagenet/alexnet/alexnet.cc
@@ -0,0 +1,402 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied. See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+
+#include "singa/singa_config.h"
+#ifdef USE_OPENCV
+#include <cmath>
+#include "./ilsvrc12.h"
+#include "singa/io/snapshot.h"
+#include "singa/model/feed_forward_net.h"
+#include "singa/model/initializer.h"
+#include "singa/model/metric.h"
+#include "singa/model/optimizer.h"
+#include "singa/utils/channel.h"
+#include "singa/utils/string.h"
+#include "singa/utils/timer.h"
+namespace singa {
+
+// currently supports 'cudnn' and 'singacpp'
+const std::string engine = "cudnn";
+LayerConf GenConvConf(string name, int nb_filter, int kernel, int stride,
+ int pad, float std, float bias = .0f) {
+ LayerConf conf;
+ conf.set_name(name);
+ conf.set_type(engine + "_convolution");
+ ConvolutionConf *conv = conf.mutable_convolution_conf();
+ conv->set_num_output(nb_filter);
+ conv->add_kernel_size(kernel);
+ conv->add_stride(stride);
+ conv->add_pad(pad);
+ conv->set_bias_term(true);
+
+ ParamSpec *wspec = conf.add_param();
+ wspec->set_name(name + "_weight");
+ auto wfill = wspec->mutable_filler();
+ wfill->set_type("Gaussian");
+ wfill->set_std(std);
+
+ ParamSpec *bspec = conf.add_param();
+ bspec->set_name(name + "_bias");
+ bspec->set_lr_mult(2);
+ bspec->set_decay_mult(0);
+ auto bfill = bspec->mutable_filler();
+ bfill->set_value(bias);
+ return conf;
+}
+
+LayerConf GenPoolingConf(string name, bool max_pool, int kernel, int stride,
+ int pad) {
+ LayerConf conf;
+ conf.set_name(name);
+ conf.set_type(engine + "_pooling");
+ PoolingConf *pool = conf.mutable_pooling_conf();
+ pool->set_kernel_size(kernel);
+ pool->set_stride(stride);
+ pool->set_pad(pad);
+ if (!max_pool) pool->set_pool(PoolingConf_PoolMethod_AVE);
+ return conf;
+}
+
+LayerConf GenReLUConf(string name) {
+ LayerConf conf;
+ conf.set_name(name);
+ conf.set_type(engine + "_relu");
+ return conf;
+}
+
+LayerConf GenDenseConf(string name, int num_output, float std, float wd,
+ float bias = .0f) {
+ LayerConf conf;
+ conf.set_name(name);
+ conf.set_type("singa_dense");
+ DenseConf *dense = conf.mutable_dense_conf();
+ dense->set_num_output(num_output);
+
+ ParamSpec *wspec = conf.add_param();
+ wspec->set_name(name + "_weight");
+ wspec->set_decay_mult(wd);
+ auto wfill = wspec->mutable_filler();
+ wfill->set_type("Gaussian");
+ wfill->set_std(std);
+
+ ParamSpec *bspec = conf.add_param();
+ bspec->set_name(name + "_bias");
+ bspec->set_lr_mult(2);
+ bspec->set_decay_mult(0);
+ auto bfill = bspec->mutable_filler();
+ bfill->set_value(bias);
+
+ return conf;
+}
+
+LayerConf GenLRNConf(string name) {
+ LayerConf conf;
+ conf.set_name(name);
+ conf.set_type(engine + "_lrn");
+ LRNConf *lrn = conf.mutable_lrn_conf();
+ lrn->set_local_size(5);
+ lrn->set_alpha(1e-04);
+ lrn->set_beta(0.75);
+ return conf;
+}
+
+LayerConf GenFlattenConf(string name) {
+ LayerConf conf;
+ conf.set_name(name);
+ conf.set_type("singa_flatten");
+ return conf;
+}
+
+LayerConf GenDropoutConf(string name, float dropout_ratio) {
+ LayerConf conf;
+ conf.set_name(name);
+ conf.set_type(engine + "_dropout");
+ DropoutConf *dropout = conf.mutable_dropout_conf();
+ dropout->set_dropout_ratio(dropout_ratio);
+ return conf;
+}
+
+FeedForwardNet CreateNet() {
+ FeedForwardNet net;
+ Shape s{3, 227, 227};
+
+ net.Add(GenConvConf("conv1", 96, 11, 4, 0, 0.01), &s);
+ net.Add(GenReLUConf("relu1"));
+ net.Add(GenPoolingConf("pool1", true, 3, 2, 0));
+ net.Add(GenLRNConf("lrn1"));
+ net.Add(GenConvConf("conv2", 256, 5, 1, 2, 0.01, 1.0));
+ net.Add(GenReLUConf("relu2"));
+ net.Add(GenPoolingConf("pool2", true, 3, 2, 0));
+ net.Add(GenLRNConf("lrn2"));
+ net.Add(GenConvConf("conv3", 384, 3, 1, 1, 0.01));
+ net.Add(GenReLUConf("relu3"));
+ net.Add(GenConvConf("conv4", 384, 3, 1, 1, 0.01, 1.0));
+ net.Add(GenReLUConf("relu4"));
+ net.Add(GenConvConf("conv5", 256, 3, 1, 1, 0.01, 1.0));
+ net.Add(GenReLUConf("relu5"));
+ net.Add(GenPoolingConf("pool5", true, 3, 2, 0));
+ net.Add(GenFlattenConf("flat"));
+ net.Add(GenDenseConf("ip6", 4096, 0.005, 1, 1.0));
+ net.Add(GenReLUConf("relu6"));
+ net.Add(GenDropoutConf("drop6", 0.5));
+ net.Add(GenDenseConf("ip7", 4096, 0.005, 1, 1.0));
+ net.Add(GenReLUConf("relu7"));
+ net.Add(GenDropoutConf("drop7", 0.5));
+ net.Add(GenDenseConf("ip8", 1000, 0.01, 1));
+
+ return net;
+}
+
+void TrainOneEpoch(FeedForwardNet &net, ILSVRC &data,
+ std::shared_ptr<Device> device, int epoch, string bin_folder,
+ size_t num_train_files, size_t batchsize, float lr,
+ Channel *train_ch, size_t pfreq, int nthreads) {
+ float loss = 0.0f, metric = 0.0f;
+ float load_time = 0.0f, train_time = 0.0f;
+ size_t b = 0;
+ size_t n_read;
+ Timer timer, ttr;
+ Tensor prefetch_x, prefetch_y;
+ string binfile = bin_folder + "/train1.bin";
+ timer.Tick();
+ data.LoadData(kTrain, binfile, batchsize, &prefetch_x, &prefetch_y, &n_read,
+ nthreads);
+ load_time += timer.Elapsed();
+ CHECK_EQ(n_read, batchsize);
+ Tensor train_x(prefetch_x.shape(), device);
+ Tensor train_y(prefetch_y.shape(), device, kInt);
+ std::thread th;
+ for (size_t fno = 1; fno <= num_train_files; fno++) {
+ binfile = bin_folder + "/train" + std::to_string(fno) + ".bin";
+ while (true) {
+ if (th.joinable()) {
+ th.join();
+ load_time += timer.Elapsed();
+ // LOG(INFO) << "num of samples: " << n_read;
+ if (n_read < batchsize) {
+ if (n_read > 0) {
+ LOG(WARNING) << "Pls set batchsize to make num_total_samples "
+ << "% batchsize == 0. Otherwise, the last " << n_read
+ << " samples would not be used";
+ }
+ break;
+ }
+ }
+ if (n_read == batchsize) {
+ train_x.CopyData(prefetch_x);
+ train_y.CopyData(prefetch_y);
+ }
+ timer.Tick();
+ th = data.AsyncLoadData(kTrain, binfile, batchsize, &prefetch_x,
+ &prefetch_y, &n_read, nthreads);
+ if (n_read < batchsize) continue;
+ CHECK_EQ(train_x.shape(0), train_y.shape(0));
+ ttr.Tick();
+ auto ret = net.TrainOnBatch(epoch, train_x, train_y);
+ train_time += ttr.Elapsed();
+ loss += ret.first;
+ metric += ret.second;
+ b++;
+ }
+ if (b % pfreq == 0) {
+ train_ch->Send(
+ "Epoch " + std::to_string(epoch) + ", training loss = " +
+ std::to_string(loss / b) + ", accuracy = " +
+ std::to_string(metric / b) + ", lr = " + std::to_string(lr) +
+ ", time of loading " + std::to_string(batchsize) + " images = " +
+ std::to_string(load_time / b) +
+ " ms, time of training (batchsize = " + std::to_string(batchsize) +
+ ") = " + std::to_string(train_time / b) + " ms.");
+ loss = 0.0f;
+ metric = 0.0f;
+ load_time = 0.0f;
+ train_time = 0.0f;
+ b = 0;
+ }
+ }
+}
+
+void TestOneEpoch(FeedForwardNet &net, ILSVRC &data,
+ std::shared_ptr<Device> device, int epoch, string bin_folder,
+ size_t num_test_images, size_t batchsize, Channel *val_ch,
+ int nthreads) {
+ float loss = 0.0f, metric = 0.0f;
+ float load_time = 0.0f, eval_time = 0.0f;
+ size_t n_read;
+ string binfile = bin_folder + "/test.bin";
+ Timer timer, tte;
+ Tensor prefetch_x, prefetch_y;
+ timer.Tick();
+ data.LoadData(kEval, binfile, batchsize, &prefetch_x, &prefetch_y, &n_read,
+ nthreads);
+ load_time += timer.Elapsed();
+ Tensor test_x(prefetch_x.shape(), device);
+ Tensor test_y(prefetch_y.shape(), device, kInt);
+ int remain = (int)num_test_images - n_read;
+ CHECK_EQ(n_read, batchsize);
+ std::thread th;
+ while (true) {
+ if (th.joinable()) {
+ th.join();
+ load_time += timer.Elapsed();
+ remain -= n_read;
+ if (remain < 0) break;
+ if (n_read < batchsize) break;
+ }
+ test_x.CopyData(prefetch_x);
+ test_y.CopyData(prefetch_y);
+ timer.Tick();
+ th = data.AsyncLoadData(kEval, binfile, batchsize, &prefetch_x, &prefetch_y,
+ &n_read, nthreads);
+
+ CHECK_EQ(test_x.shape(0), test_y.shape(0));
+ tte.Tick();
+ auto ret = net.EvaluateOnBatch(test_x, test_y);
+ eval_time += tte.Elapsed();
+ ret.first.ToHost();
+ ret.second.ToHost();
+ loss += Sum(ret.first);
+ metric += Sum(ret.second);
+ }
+ loss /= num_test_images;
+ metric /= num_test_images;
+ val_ch->Send("Epoch " + std::to_string(epoch) + ", val loss = " +
+ std::to_string(loss) + ", accuracy = " + std::to_string(metric) +
+ ", time of loading " + std::to_string(num_test_images) +
+ " images = " + std::to_string(load_time) +
+ " ms, time of evaluating " + std::to_string(num_test_images) +
+ " images = " + std::to_string(eval_time) + " ms.");
+}
+
+void Checkpoint(FeedForwardNet &net, string prefix) {
+ Snapshot snapshot(prefix, Snapshot::kWrite, 200);
+ auto names = net.GetParamNames();
+ auto values = net.GetParamValues();
+ for (size_t k = 0; k < names.size(); k++) {
+ values.at(k).ToHost();
+ snapshot.Write(names.at(k), values.at(k));
+ }
+ LOG(INFO) << "Write snapshot into " << prefix;
+}
+
+void Train(int num_epoch, float lr, size_t batchsize, size_t train_file_size,
+ string bin_folder, size_t num_train_images, size_t num_test_images,
+ size_t pfreq, int nthreads) {
+ ILSVRC data;
+ data.ReadMean(bin_folder + "/mean.bin");
+ auto net = CreateNet();
+ auto cuda = std::make_shared<CudaGPU>(0);
+ net.ToDevice(cuda);
+ SGD sgd;
+ OptimizerConf opt_conf;
+ opt_conf.set_momentum(0.9);
+ auto reg = opt_conf.mutable_regularizer();
+ reg->set_coefficient(0.0005);
+ sgd.Setup(opt_conf);
+ sgd.SetLearningRateGenerator(
+ [lr](int epoch) { return lr * std::pow(0.1, epoch / 20); });
+
+ SoftmaxCrossEntropy loss;
+ Accuracy acc;
+ net.Compile(true, &sgd, &loss, &acc);
+
+ Channel *train_ch = GetChannel("train_perf");
+ train_ch->EnableDestStderr(true);
+ Channel *val_ch = GetChannel("val_perf");
+ val_ch->EnableDestStderr(true);
+ size_t num_train_files = num_train_images / train_file_size +
+ (num_train_images % train_file_size ? 1 : 0);
+ for (int epoch = 0; epoch < num_epoch; epoch++) {
+ float epoch_lr = sgd.GetLearningRate(epoch);
+ TrainOneEpoch(net, data, cuda, epoch, bin_folder, num_train_files,
+ batchsize, epoch_lr, train_ch, pfreq, nthreads);
+ if (epoch % 10 == 0 && epoch > 0) {
+ string prefix = "snapshot_epoch" + std::to_string(epoch);
+ Checkpoint(net, prefix);
+ }
+ TestOneEpoch(net, data, cuda, epoch, bin_folder, num_test_images, batchsize,
+ val_ch, nthreads);
+ }
+}
+}
+
+int main(int argc, char **argv) {
+ singa::InitChannel(nullptr);
+ int pos = singa::ArgPos(argc, argv, "-h");
+ if (pos != -1) {
+ std::cout << "Usage:\n"
+ << "\t-epoch <int>: number of epoch to be trained, default is 90;\n"
+ << "\t-lr <float>: base learning rate;\n"
+ << "\t-batchsize <int>: batchsize, it should be changed regarding "
+ "to your memory;\n"
+ << "\t-filesize <int>: number of training images that stores in "
+ "each binary file;\n"
+ << "\t-ntrain <int>: number of training images;\n"
+ << "\t-ntest <int>: number of test images;\n"
+ << "\t-data <folder>: the folder which stores the binary files;\n"
+ << "\t-pfreq <int>: the frequency(in batch) of printing current "
+ "model status(loss and accuracy);\n"
+ << "\t-nthreads <int>`: the number of threads to load data which "
+ "feed to the model.\n";
+ return 0;
+ }
+ pos = singa::ArgPos(argc, argv, "-epoch");
+ int nEpoch = 90;
+ if (pos != -1) nEpoch = atoi(argv[pos + 1]);
+
+ pos = singa::ArgPos(argc, argv, "-lr");
+ float lr = 0.01;
+ if (pos != -1) lr = atof(argv[pos + 1]);
+
+ pos = singa::ArgPos(argc, argv, "-batchsize");
+ int batchsize = 256;
+ if (pos != -1) batchsize = atof(argv[pos + 1]);
+
+ pos = singa::ArgPos(argc, argv, "-filesize");
+ size_t train_file_size = 1280;
+ if (pos != -1) train_file_size = atoi(argv[pos + 1]);
+
+ pos = singa::ArgPos(argc, argv, "-ntrain");
+ size_t num_train_images = 1281167;
+ if (pos != -1) num_train_images = atoi(argv[pos + 1]);
+
+ pos = singa::ArgPos(argc, argv, "-ntest");
+ size_t num_test_images = 50000;
+ if (pos != -1) num_test_images = atoi(argv[pos + 1]);
+
+ pos = singa::ArgPos(argc, argv, "-data");
+ string bin_folder = "imagenet_data";
+ if (pos != -1) bin_folder = argv[pos + 1];
+
+ pos = singa::ArgPos(argc, argv, "-pfreq");
+ size_t pfreq = 100;
+ if (pos != -1) pfreq = atoi(argv[pos + 1]);
+
+ pos = singa::ArgPos(argc, argv, "-nthreads");
+ int nthreads = 12;
+ if (pos != -1) nthreads = atoi(argv[pos + 1]);
+
+ LOG(INFO) << "Start training";
+ singa::Train(nEpoch, lr, batchsize, train_file_size, bin_folder,
+ num_train_images, num_test_images, pfreq, nthreads);
+ LOG(INFO) << "End training";
+}
+#endif
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d190fa89/examples/imagenet/alexnet/create_data.sh
----------------------------------------------------------------------
diff --git a/examples/imagenet/alexnet/create_data.sh b/examples/imagenet/alexnet/create_data.sh
new file mode 100755
index 0000000..4c2c034
--- /dev/null
+++ b/examples/imagenet/alexnet/create_data.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env sh
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+../../build/bin/createdata -trainlist "imagenet/label/train.txt" -trainfolder "imagenet/ILSVRC2012_img_train" \
+ -testlist "imagenet/label/val.txt" -testfolder "imagenet/ILSVRC2012_img_val" -outdata "imagenet_data" -filesize 1280
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d190fa89/examples/imagenet/alexnet/ilsvrc12.cc
----------------------------------------------------------------------
diff --git a/examples/imagenet/alexnet/ilsvrc12.cc b/examples/imagenet/alexnet/ilsvrc12.cc
new file mode 100644
index 0000000..c9e6d2f
--- /dev/null
+++ b/examples/imagenet/alexnet/ilsvrc12.cc
@@ -0,0 +1,70 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied. See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "singa/singa_config.h"
+#ifdef USE_OPENCV
+#include "ilsvrc12.h"
+#include "singa/utils/channel.h"
+#include "singa/utils/string.h"
+int main(int argc, char **argv) {
+ int pos = singa::ArgPos(argc, argv, "-h");
+ if (pos != -1) {
+ std::cout << "Usage:\n"
+ << "\t-trainlist <file>: the file of training list;\n"
+ << "\t-trainfolder <folder>: the folder of training images;\n"
+ << "\t-testlist <file>: the file of test list;\n"
+ << "\t-testfolder <floder>: the folder of test images;\n"
+ << "\t-outdata <folder>: the folder to save output files;\n"
+ << "\t-filesize <int>: number of training images that stores in "
+ "each binary file.\n";
+ return 0;
+ }
+ pos = singa::ArgPos(argc, argv, "-trainlist");
+ string train_image_list = "imagenet/label/train.txt";
+ if (pos != -1) train_image_list = argv[pos + 1];
+
+ pos = singa::ArgPos(argc, argv, "-trainfolder");
+ string train_image_folder = "imagenet/ILSVRC2012_img_train";
+ if (pos != -1) train_image_folder = argv[pos + 1];
+
+ pos = singa::ArgPos(argc, argv, "-testlist");
+ string test_image_list = "imagenet/label/val.txt";
+ if (pos != -1) test_image_list = argv[pos + 1];
+
+ pos = singa::ArgPos(argc, argv, "-testfolder");
+ string test_image_folder = "imagenet/ILSVRC2012_img_val";
+ if (pos != -1) test_image_folder = argv[pos + 1];
+
+ pos = singa::ArgPos(argc, argv, "-outdata");
+ string bin_folder = "imagenet_data";
+ if (pos != -1) bin_folder = argv[pos + 1];
+
+ pos = singa::ArgPos(argc, argv, "-filesize");
+ size_t train_file_size = 1280;
+ if (pos != -1) train_file_size = atoi(argv[pos + 1]);
+ singa::ILSVRC data;
+ LOG(INFO) << "Creating training and test data...";
+ data.CreateTrainData(train_image_list, train_image_folder, bin_folder,
+ train_file_size);
+ data.CreateTestData(test_image_list, test_image_folder, bin_folder);
+ LOG(INFO) << "Data created!";
+ return 0;
+}
+#endif // USE_OPENCV
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d190fa89/examples/imagenet/alexnet/ilsvrc12.h
----------------------------------------------------------------------
diff --git a/examples/imagenet/alexnet/ilsvrc12.h b/examples/imagenet/alexnet/ilsvrc12.h
new file mode 100644
index 0000000..74fffbb
--- /dev/null
+++ b/examples/imagenet/alexnet/ilsvrc12.h
@@ -0,0 +1,376 @@
+/************************************************************
+*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements. See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership. The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied. See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*
+*************************************************************/
+#include "singa/singa_config.h"
+#ifdef USE_OPENCV
+#ifndef SINGA_EXAMPLES_IMAGENET_ILSVRC12_H_
+#define SINGA_EXAMPLES_IMAGENET_ILSVRC12_H_
+#include <omp.h>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <string>
+#include <thread>
+#include <vector>
+#include "singa/core/tensor.h"
+#include "singa/io/decoder.h"
+#include "singa/io/encoder.h"
+#include "singa/io/reader.h"
+#include "singa/io/transformer.h"
+#include "singa/io/writer.h"
+#include "singa/proto/io.pb.h"
+#include "singa/utils/timer.h"
+
+using std::string;
+using namespace singa::io;
+namespace singa {
+/// For reading ILSVRC2012 image data as tensors.
+class ILSVRC {
+ public:
+ /// Setup encoder, decoder
+ ILSVRC();
+ ~ILSVRC() {
+ if (encoder != nullptr) delete encoder;
+ if (decoder != nullptr) delete decoder;
+ if (transformer != nullptr) delete transformer;
+ if (reader != nullptr) {
+ reader->Close();
+ delete reader;
+ }
+ if (writer != nullptr) {
+ writer->Close();
+ delete writer;
+ }
+ }
+ /// Create binary files for training data
+ /// train_image_list: list file of training images
+ /// train_image_folder: folder where stores original training images
+ /// train_bin_folder: folder to store binary files
+ /// train_file_size: number of images that are contain in one binary file
+ void CreateTrainData(string train_image_list, string train_image_folder,
+ string train_bin_folder, size_t train_file_size);
+ /// Create binary files for test data
+ /// train_image_list: list file of test images
+ /// train_image_folder: folder where saves original test images
+ /// train_bin_folder: folder to save binary files
+ void CreateTestData(string test_image_list, string test_image_folder,
+ string test_bin_folder);
+ /// Load data from a binary file, return <images, labels> pair
+ /// suppose the data will be loaded file by file.
+ /// flag: kTrain or kTest
+ /// file: binary file which stores the images
+ /// read_size: number of images to be loaded
+ /// offset: offset in the file
+ /// n_read: number of images which are read
+ size_t LoadData(int flag, string file, size_t read_size, Tensor *x, Tensor *y,
+ size_t *n_read, int nthreads);
+ /// A wrapper method to spawn a thread to execute LoadData() method.
+ std::thread AsyncLoadData(int flag, string file, size_t read_size, Tensor *x,
+ Tensor *y, size_t *n_read, int nthreads);
+
+ void DecodeTransform(int flag, int thid, int nthreads,
+ vector<string *> images, Tensor *x, Tensor *y);
+ /// A wrapper method to spawn a thread to execute Decodetransform() method.
+ std::thread AsyncDecodeTransform(int flag, int thid, int nthreads,
+ vector<string *> images, Tensor *x,
+ Tensor *y);
+
+ /// Read mean from path
+ void ReadMean(string path);
+
+ protected:
+ /// Read one image at path, resize the image
+ Tensor ReadImage(string path);
+ /// Write buff to the file in kCreate/kAppend mode
+ void Write(string outfile, singa::io::Mode mode);
+ void WriteMean(Tensor &mean, string path);
+
+ private:
+ /// size for resizing
+ const size_t kImageSize = 256;
+ const size_t kImageNBytes = 3 * kImageSize * kImageSize;
+ /// size for cropping
+ const size_t kCropSize = 227;
+ Tensor mean;
+ string last_read_file = "";
+
+ JPGEncoder *encoder = nullptr;
+ JPGDecoder *decoder = nullptr;
+ ImageTransformer *transformer = nullptr;
+ BinFileReader *reader = nullptr;
+ BinFileWriter *writer = nullptr;
+};
+
+ILSVRC::ILSVRC() {
+ EncoderConf en_conf;
+ en_conf.set_image_dim_order("CHW");
+ encoder = new JPGEncoder();
+ encoder->Setup(en_conf);
+
+ DecoderConf de_conf;
+ de_conf.set_image_dim_order("CHW");
+ decoder = new JPGDecoder();
+ decoder->Setup(de_conf);
+
+ TransformerConf trans_conf;
+ trans_conf.add_crop_shape(kCropSize);
+ trans_conf.add_crop_shape(kCropSize);
+ trans_conf.set_image_dim_order("CHW");
+ trans_conf.set_horizontal_mirror(true);
+ transformer = new ImageTransformer();
+ transformer->Setup(trans_conf);
+}
+
+Tensor ILSVRC::ReadImage(string path) {
+ cv::Mat mat = cv::imread(path, CV_LOAD_IMAGE_COLOR);
+ CHECK(mat.data != NULL) << "OpenCV load image fail: " << path;
+ cv::Size size(kImageSize, kImageSize);
+ cv::Mat resized;
+ cv::resize(mat, resized, size);
+ CHECK_EQ((size_t)resized.size().height, kImageSize);
+ CHECK_EQ((size_t)resized.size().width, kImageSize);
+ // dimension_order: CHW
+ Shape shape{(size_t)resized.channels(), (size_t)resized.rows,
+ (size_t)resized.cols};
+ Tensor image(shape, singa::kUChar);
+ unsigned char *data = new unsigned char[kImageNBytes];
+ for (int i = 0; i < resized.rows; i++)
+ for (int j = 0; j < resized.cols; j++)
+ for (int k = 0; k < resized.channels(); k++)
+ data[k * kImageSize * kImageSize + i * kImageSize + j] =
+ resized.at<cv::Vec3b>(i, j)[k];
+ image.CopyDataFromHostPtr<unsigned char>(data, kImageNBytes);
+ delete[] data;
+
+ return image;
+}
+
+void ILSVRC::WriteMean(Tensor &mean, string path) {
+ Tensor mean_lb(Shape{1}, kInt);
+ std::vector<Tensor> input;
+ input.push_back(mean);
+ input.push_back(mean_lb);
+ BinFileWriter bfwriter;
+ bfwriter.Open(path, kCreate);
+ bfwriter.Write(path, encoder->Encode(input));
+ bfwriter.Flush();
+ bfwriter.Close();
+}
+
+void ILSVRC::CreateTrainData(string image_list, string input_folder,
+ string output_folder, size_t file_size = 12800) {
+ std::vector<std::pair<string, int>> file_list;
+ size_t *sum = new size_t[kImageNBytes];
+ for (size_t i = 0; i < kImageNBytes; i++) sum[i] = 0u;
+ string image_file_name;
+ int label;
+ string outfile;
+ std::ifstream image_list_file(image_list.c_str(), std::ios::in);
+ while (image_list_file >> image_file_name >> label)
+ file_list.push_back(std::make_pair(image_file_name, label));
+ LOG(INFO) << "Data Shuffling";
+ std::shuffle(file_list.begin(), file_list.end(),
+ std::default_random_engine());
+ LOG(INFO) << "Total number of training images is " << file_list.size();
+ size_t num_train_images = file_list.size();
+ if (file_size == 0) file_size = num_train_images;
+ for (size_t imageid = 0; imageid < num_train_images; imageid++) {
+ string path = input_folder + "/" + file_list[imageid].first;
+ Tensor image = ReadImage(path);
+ auto image_data = image.data<unsigned char>();
+ for (size_t i = 0; i < kImageNBytes; i++)
+ sum[i] += static_cast<size_t>(image_data[i]);
+ label = file_list[imageid].second;
+ Tensor lb(Shape{1}, kInt);
+ lb.CopyDataFromHostPtr<int>(&label, 1);
+ std::vector<Tensor> input;
+ input.push_back(image);
+ input.push_back(lb);
+ string encoded_str = encoder->Encode(input);
+ if (writer == nullptr) {
+ writer = new BinFileWriter();
+ outfile = output_folder + "/train" +
+ std::to_string(imageid / file_size + 1) + ".bin";
+ writer->Open(outfile, kCreate);
+ }
+ writer->Write(path, encoded_str);
+ if ((imageid + 1) % file_size == 0) {
+ writer->Flush();
+ writer->Close();
+ LOG(INFO) << "Write " << file_size << " images into " << outfile;
+ delete writer;
+ writer = nullptr;
+ }
+ }
+ if (writer != nullptr) {
+ writer->Flush();
+ writer->Close();
+ LOG(INFO) << "Write " << num_train_images % file_size << " images into "
+ << outfile;
+ delete writer;
+ writer = nullptr;
+ }
+ size_t num_file =
+ num_train_images / file_size + ((num_train_images % file_size) ? 1 : 0);
+ LOG(INFO) << "Write " << num_train_images << " images into " << num_file
+ << " binary files";
+ Tensor mean = Tensor(Shape{3, kImageSize, kImageSize}, kUChar);
+ unsigned char *mean_data = new unsigned char[kImageNBytes];
+ for (size_t i = 0; i < kImageNBytes; i++)
+ mean_data[i] = static_cast<unsigned char>(sum[i] / num_train_images);
+ mean.CopyDataFromHostPtr<unsigned char>(mean_data, kImageNBytes);
+ string mean_path = output_folder + "/mean.bin";
+ WriteMean(mean, mean_path);
+ delete[] mean_data;
+ delete[] sum;
+}
+
+void ILSVRC::CreateTestData(string image_list, string input_folder,
+ string output_folder) {
+ std::vector<std::pair<string, int>> file_list;
+ string image_file_name;
+ string outfile = output_folder + "/test.bin";
+ int label;
+ std::ifstream image_list_file(image_list.c_str(), std::ios::in);
+ while (image_list_file >> image_file_name >> label)
+ file_list.push_back(std::make_pair(image_file_name, label));
+ LOG(INFO) << "Total number of test images is " << file_list.size();
+ size_t num_test_images = file_list.size();
+ for (size_t imageid = 0; imageid < num_test_images; imageid++) {
+ string path = input_folder + "/" + file_list[imageid].first;
+ Tensor image = ReadImage(path);
+ label = file_list[imageid].second;
+ Tensor lb(Shape{1}, singa::kInt);
+ lb.CopyDataFromHostPtr<int>(&label, 1);
+ std::vector<Tensor> input;
+ input.push_back(image);
+ input.push_back(lb);
+ string encoded_str = encoder->Encode(input);
+ if (writer == nullptr) {
+ writer = new BinFileWriter();
+ writer->Open(outfile, kCreate);
+ }
+ writer->Write(path, encoded_str);
+ }
+ if (writer != nullptr) {
+ writer->Flush();
+ writer->Close();
+ delete writer;
+ writer = nullptr;
+ }
+ LOG(INFO) << "Write " << num_test_images << " images into " << outfile;
+}
+
+void ILSVRC::ReadMean(string path) {
+ BinFileReader bfreader;
+ string key, value;
+ bfreader.Open(path);
+ bfreader.Read(&key, &value);
+ auto ret = decoder->Decode(value);
+ bfreader.Close();
+ mean = ret[0];
+}
+
+std::thread ILSVRC::AsyncLoadData(int flag, string file, size_t read_size,
+ Tensor *x, Tensor *y, size_t *n_read,
+ int nthreads) {
+ return std::thread(
+ [=]() { LoadData(flag, file, read_size, x, y, n_read, nthreads); });
+}
+
+size_t ILSVRC::LoadData(int flag, string file, size_t read_size, Tensor *x,
+ Tensor *y, size_t *n_read, int nthreads) {
+ x->Reshape(Shape{read_size, 3, kCropSize, kCropSize});
+ y->AsType(kInt);
+ y->Reshape(Shape{read_size});
+ if (file != last_read_file) {
+ if (reader != nullptr) {
+ reader->Close();
+ delete reader;
+ reader = nullptr;
+ }
+ reader = new BinFileReader();
+ reader->Open(file, 100 << 20);
+ last_read_file = file;
+ } else if (reader == nullptr) {
+ reader = new BinFileReader();
+ reader->Open(file, 100 << 20);
+ }
+ vector<string *> images;
+ for (size_t i = 0; i < read_size; i++) {
+ string image_path;
+ string *image = new string();
+ bool ret = reader->Read(&image_path, image);
+ if (ret == false) {
+ reader->Close();
+ delete reader;
+ reader = nullptr;
+ break;
+ }
+ images.push_back(image);
+ }
+ int nimg = images.size();
+ *n_read = nimg;
+
+ vector<std::thread> threads;
+ for (int i = 1; i < nthreads; i++) {
+ threads.push_back(AsyncDecodeTransform(flag, i, nthreads, images, x, y));
+ }
+ DecodeTransform(flag, 0, nthreads, images, x, y);
+ for (size_t i = 0; i < threads.size(); i++) threads[i].join();
+ for (int k = 0; k < nimg; k++) delete images.at(k);
+ return nimg;
+}
+
+std::thread ILSVRC::AsyncDecodeTransform(int flag, int thid, int nthreads,
+ vector<string *> images, Tensor *x,
+ Tensor *y) {
+ return std::thread(
+ [=]() { DecodeTransform(flag, thid, nthreads, images, x, y); });
+}
+
+void ILSVRC::DecodeTransform(int flag, int thid, int nthreads,
+ vector<string *> images, Tensor *x, Tensor *y) {
+ int nimg = images.size();
+ int start = nimg / nthreads * thid;
+ int end = start + nimg / nthreads;
+ for (int k = start; k < end; k++) {
+ std::vector<Tensor> pair = decoder->Decode(*images.at(k));
+ auto tmp_image = pair[0] - mean;
+ Tensor aug_image = transformer->Apply(flag, tmp_image);
+ CopyDataToFrom(x, aug_image, aug_image.Size(), k * aug_image.Size());
+ CopyDataToFrom(y, pair[1], 1, k);
+ }
+ if (thid == 0) {
+ for (int k = nimg / nthreads * nthreads; k < nimg; k++) {
+ std::vector<Tensor> pair = decoder->Decode(*images.at(k));
+ auto tmp_image = pair[0] - mean;
+ Tensor aug_image = transformer->Apply(flag, tmp_image);
+ CopyDataToFrom(x, aug_image, aug_image.Size(), k * aug_image.Size());
+ CopyDataToFrom(y, pair[1], 1, k);
+ }
+ }
+}
+} // namespace singa
+
+#endif // SINGA_EXAMPLES_IMAGENET_ILSVRC12_H_
+#endif // USE_OPENCV
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d190fa89/examples/imagenet/alexnet/run.sh
----------------------------------------------------------------------
diff --git a/examples/imagenet/alexnet/run.sh b/examples/imagenet/alexnet/run.sh
new file mode 100755
index 0000000..6277d23
--- /dev/null
+++ b/examples/imagenet/alexnet/run.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env sh
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+../../build/bin/imagenet -epoch 90 -lr 0.01 -batchsize 256 -filesize 1280 -ntrain 1281167 -ntest 50000 \
+ -data "imagenet_data" -pfreq 100 -nthreads 12
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d190fa89/examples/imagenet/create_data.sh
----------------------------------------------------------------------
diff --git a/examples/imagenet/create_data.sh b/examples/imagenet/create_data.sh
deleted file mode 100755
index 4c2c034..0000000
--- a/examples/imagenet/create_data.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env sh
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-../../build/bin/createdata -trainlist "imagenet/label/train.txt" -trainfolder "imagenet/ILSVRC2012_img_train" \
- -testlist "imagenet/label/val.txt" -testfolder "imagenet/ILSVRC2012_img_val" -outdata "imagenet_data" -filesize 1280
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d190fa89/examples/imagenet/googlenet/README.md
----------------------------------------------------------------------
diff --git a/examples/imagenet/googlenet/README.md b/examples/imagenet/googlenet/README.md
new file mode 100644
index 0000000..e597fc6
--- /dev/null
+++ b/examples/imagenet/googlenet/README.md
@@ -0,0 +1,66 @@
+---
+name: GoogleNet on ImageNet
+SINGA version: 1.0.1
+SINGA commit: 8c990f7da2de220e8a012c6a8ecc897dc7532744
+parameter_url: https://s3-ap-southeast-1.amazonaws.com/dlfile/bvlc_googlenet.tar.gz
+parameter_sha1: 0a88e8948b1abca3badfd8d090d6be03f8d7655d
+license: unrestricted https://github.com/BVLC/caffe/tree/master/models/bvlc_googlenet
+---
+
+# Image Classification using GoogleNet
+
+
+In this example, we convert GoogleNet trained on Caffe to SINGA for image classification.
+
+## Instructions
+
+* Download the parameter checkpoint file into this folder
+
+ $ wget https://s3-ap-southeast-1.amazonaws.com/dlfile/bvlc_googlenet.tar.gz
+ $ tar xvf bvlc_googlenet.tar.gz
+
+* Run the program
+
+ # use cpu
+ $ python serve.py -C &
+ # use gpu
+ $ python serve.py &
+
+* Submit images for classification
+
+ $ curl -i -F image=@image1.jpg http://localhost:9999/api
+ $ curl -i -F image=@image2.jpg http://localhost:9999/api
+ $ curl -i -F image=@image3.jpg http://localhost:9999/api
+
+image1.jpg, image2.jpg and image3.jpg should be downloaded before executing the above commands.
+
+## Details
+
+We first extract the parameter values from [Caffe's checkpoint file](http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel) into a pickle version.
+After downloading the checkpoint file into `caffe_root/python` folder, run the following script
+
+ # to be executed within caffe_root/python folder
+ import caffe
+ import numpy as np
+ import cPickle as pickle
+
+ model_def = '../models/bvlc_googlenet/deploy.prototxt'
+ weight = 'bvlc_googlenet.caffemodel' # must be downloaded at first
+ net = caffe.Net(model_def, weight, caffe.TEST)
+
+ params = {}
+ for layer_name in net.params.keys():
+ weights=np.copy(net.params[layer_name][0].data)
+ bias=np.copy(net.params[layer_name][1].data)
+ params[layer_name+'_weight']=weights
+ params[layer_name+'_bias']=bias
+ print layer_name, weights.shape, bias.shape
+
+ with open('bvlc_googlenet.pickle', 'wb') as fd:
+ pickle.dump(params, fd)
+
+Then we construct the GoogleNet using SINGA's FeedForwardNet structure.
+Note that we added an EndPadding layer to resolve the issue arising from the
+difference in rounding strategy of the pooling layer between Caffe (ceil) and cuDNN (floor).
+Only the MaxPooling layers outside inception blocks have this problem.
+Refer to [this](http://joelouismarino.github.io/blog_posts/blog_googlenet_keras.html) for more details.
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d190fa89/examples/imagenet/googlenet/serve.py
----------------------------------------------------------------------
diff --git a/examples/imagenet/googlenet/serve.py b/examples/imagenet/googlenet/serve.py
new file mode 100644
index 0000000..57e005d
--- /dev/null
+++ b/examples/imagenet/googlenet/serve.py
@@ -0,0 +1,240 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+''' This model is created following Caffe implementation of GoogleNet
+https://github.com/BVLC/caffe/blob/master/models/bvlc_googlenet/
+'''
+import os
+import sys
+import time
+import numpy as np
+import threading
+import traceback
+from argparse import ArgumentParser
+from scipy.misc import imread, imresize
+import numpy as np
+
+from singa.layer import Layer, Conv2D, Activation, MaxPooling2D, AvgPooling2D,\
+ Split, Concat, LRN, Dropout, Flatten, Dense
+from singa import layer
+from singa import net as ffnet
+from singa import device
+from singa import tensor
+from rafiki.agent import Agent, MsgType
+
+
+def add_to_tuple(x):
+ '''return a tuple with the last two values incremented by 1'''
+ if len(x) == 3:
+ return (x[0], x[1] + 1, x[2] + 1)
+ else:
+ return (x[0], x[1], x[2] + 1, x[3] + 1)
+
+class EndPadding(Layer):
+ '''Pad the end of the spatial axis with 1 row and 1 column of zeros.
+
+ This layer is inserted before the pooling layers outside the inception
+ block. We need such a layer because Caffe (ceil) and cuDNN (floor) have
+ different rounding strategies for the pooling layer.
+ http://joelouismarino.github.io/blog_posts/blog_googlenet_keras.html
+ '''
+ def __init__(self, name, input_sample_shape=None):
+ super(EndPadding, self).__init__(name)
+ if input_sample_shape is not None:
+ assert len(input_sample_shape) == 3, 'input must has 4 dims'
+ self.output_sample_shape = add_to_tuple(input_sample_shape)
+
+ def get_output_sample_shape(self):
+ return self.output_sample_shape
+
+ def setup(self, input_sample_shape):
+ assert len(input_sample_shape) == 3, 'input must has 4 dims'
+ self.output_sample_shape = add_to_tuple(input_sample_shape)
+ self.has_setup = True
+
+ def forward(self, flag, x):
+ '''pad zeros'''
+ tmp = tensor.to_numpy(x)
+ shape = add_to_tuple(x.shape)
+ ret = np.zeros(shape)
+ ret[:,:,:-1, :-1] = tmp
+ y = tensor.from_numpy(ret)
+ y.to_device(x.device)
+ return y
+
+ def backward(self, falg, dy):
+ '''remove paddings'''
+ tmp = tensor.to_numpy(dy)
+ dx = tensor.from_numpy(tmp[:,:,:-1,:-1])
+ dx.to_device(dy.device)
+ return dx, []
+
+# b_specs = {'init': 'constant', 'value': 0, 'lr_mult': 2, 'decay_mult': 0}
+
+def conv(net, src, name, num, kernel, stride=1, pad=0, suffix=''):
+ net.add(Conv2D('%s/%s' % (name, suffix), num, kernel, stride, pad=pad), src)
+ return net.add(Activation('%s/relue_%s' % (name, suffix)))
+
+def pool(net, src, name, kernel, stride):
+ net.add(EndPadding('%s/pad' % name), src)
+ ret = net.add(MaxPooling2D('%s' % name, 3, 2, pad=0))
+ return ret
+
+def inception(net, src, name, nb1x1, nb3x3r, nb3x3, nb5x5r, nb5x5, nbproj):
+ split = net.add(Split('%s/split' % name, 4), src)
+
+ c1x1 = conv(net, split, name, nb1x1, 1, suffix='1x1')
+
+ c3x3r = conv(net, split, name, nb3x3r, 1, suffix='3x3_reduce')
+ c3x3 = conv(net, c3x3r, name, nb3x3, 3, pad=1, suffix='3x3')
+
+ c5x5r = conv(net, split, name, nb5x5r, 1, suffix='5x5_reduce')
+ c5x5 = conv(net, c5x5r, name, nb5x5, 5, pad=2, suffix='5x5')
+
+ pool = net.add(MaxPooling2D('%s/pool' % name, 3, 1, pad=1), split)
+ cproj = conv(net, pool, name, nbproj, 1, suffix='pool_proj')
+
+ return net.add(Concat('%s/output' % name, 1), [c1x1, c3x3, c5x5, cproj])
+
+
+def create_net(shape, weight_path='bvlc_googlenet.pickle'):
+ net = ffnet.FeedForwardNet()
+ net.add(Conv2D('conv1/7x7_s2', 64, 7, 2, pad=3, input_sample_shape=shape))
+ c1 = net.add(Activation('conv1/relu_7x7'))
+ pool1 = pool(net, c1, 'pool1/3x3_s2', 3, 2)
+ norm1 = net.add(LRN('pool1/norm1', 5, 0.0001, 0.75))
+ c3x3r = conv(net, norm1 , 'conv2', 64, 1, suffix='3x3_reduce')
+ c3x3 = conv(net, c3x3r, 'conv2', 192, 3, pad=1, suffix='3x3')
+ norm2 = net.add(LRN('conv2/norm2', 5, 0.0001, 0.75))
+ pool2 = pool(net, norm2, 'pool2/3x3_s2', 3, 2)
+
+ i3a=inception(net, pool2, 'inception_3a', 64, 96, 128, 16, 32, 32)
+ i3b=inception(net, i3a, 'inception_3b', 128, 128, 192, 32, 96, 64)
+ pool3=pool(net, i3b, 'pool3/3x3_s2', 3, 2)
+ i4a=inception(net, pool3, 'inception_4a', 192, 96, 208, 16, 48, 64)
+ i4b=inception(net, i4a, 'inception_4b', 160, 112, 224, 24, 64, 64)
+ i4c=inception(net, i4b, 'inception_4c', 128, 128, 256, 24, 64, 64)
+ i4d=inception(net, i4c, 'inception_4d', 112, 144, 288, 32, 64, 64)
+ i4e=inception(net, i4d, 'inception_4e', 256, 160, 320, 32, 128, 128)
+ pool4=pool(net, i4e,'pool4/3x3_s2', 3, 2)
+ i5a=inception(net, pool4, 'inception_5a', 256, 160, 320, 32, 128, 128)
+ i5b=inception(net, i5a, 'inception_5b', 384, 192, 384, 48, 128, 128)
+ pool5=net.add(AvgPooling2D('pool5/7x7_s1', 7, 1, pad=0))
+ drop5=net.add(Dropout('drop', 0.4))
+ flat=net.add(Flatten('flat'))
+ dense=net.add(Dense('loss3/classifier', 1000))
+ # prob=net.add(Softmax('softmax'))
+
+ net.load(weight_path, use_pickle=True)
+ print 'total num of params %d' % (len(net.param_names()))
+ # SINGA and Caffe have different layout for the weight matrix of the dense
+ # layer
+ for key, val in zip(net.param_names(), net.param_values()):
+ # print key
+ if key == 'loss3/classifier_weight':
+ tmp = tensor.to_numpy(val)
+ tmp = tmp.reshape(tmp.shape[::-1])
+ val.copy_from_numpy(np.transpose(tmp))
+ return net
+
+
+def serve(agent, use_cpu, parameter_file, topk=5):
+ if use_cpu:
+ print 'running with cpu'
+ dev = device.get_default_device()
+ layer.engine = 'singacpp'
+ else:
+ print "runing with gpu"
+ dev = device.create_cuda_gpu()
+ agent = agent
+
+ print 'Start intialization............'
+ net = create_net((3, 224, 224), parameter_file)
+ net.to_device(dev)
+ print 'End intialization............'
+
+ labels = np.loadtxt('synset_words.txt', str, delimiter='\t ')
+ while True:
+ key, val = agent.pull()
+ if key is None:
+ time.sleep(0.1)
+ continue
+ msg_type = MsgType.parse(key)
+ if msg_type.is_request():
+ try:
+ response = ""
+ img = imread(val['image'], mode='RGB').astype(np.float32)
+ height,width = img.shape[:2]
+ img[:, :, 0] -= 123.68
+ img[:, :, 1] -= 116.779
+ img[:, :, 2] -= 103.939
+ img[:,:,[0,1,2]] = img[:,:,[2,1,0]]
+ img = img.transpose((2, 0, 1))
+ img = img[:,(height-224)//2:(height+224)//2,(width-224)//2:(width+224)//2]
+ images = np.expand_dims(img, axis=0)
+
+ x = tensor.from_numpy(images.astype(np.float32))
+ x.to_device(dev)
+ y = net.predict(x)
+ prob = np.average(tensor.to_numpy(y), 0)
+ # sort and reverse
+ idx = np.argsort(-prob)[0:topk]
+ for i in idx:
+ response += "%s:%s<br/>" % (labels[i], prob[i])
+ except:
+ traceback.print_exc()
+ response = "Sorry, system error during prediction."
+ agent.push(MsgType.kResponse, response)
+ elif MsgType.kCommandStop.equal(msg_type):
+ print 'get stop command'
+ agent.push(MsgType.kStatus, "success")
+ break
+ else:
+ print 'get unsupported message %s' % str(msg_type)
+ agent.push(MsgType.kStatus, "Unknown command")
+ break
+ # while loop
+ print "server stop"
+
+
+def main():
+ try:
+ # Setup argument parser
+ parser = ArgumentParser(description="GooleNet for image classification")
+ parser.add_argument("-p", "--port", default=9999, help="listen port")
+ parser.add_argument("-C", "--use_cpu", action="store_true")
+ parser.add_argument("--parameter_file", default="bvlc_googlenet.pickle",
+ help="relative path")
+
+ # Process arguments
+ args = parser.parse_args()
+ port = args.port
+
+ # start to train
+ agent = Agent(port)
+ serve(agent, args.use_cpu, args.parameter_file)
+ agent.stop()
+
+ except SystemExit:
+ return
+ except:
+ traceback.print_exc()
+ sys.stderr.write(" for help use --help \n\n")
+ return 2
+
+
+if __name__ == '__main__':
+ main()
http://git-wip-us.apache.org/repos/asf/incubator-singa/blob/d190fa89/examples/imagenet/ilsvrc12.cc
----------------------------------------------------------------------
diff --git a/examples/imagenet/ilsvrc12.cc b/examples/imagenet/ilsvrc12.cc
deleted file mode 100644
index c9e6d2f..0000000
--- a/examples/imagenet/ilsvrc12.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-/************************************************************
-*
-* Licensed to the Apache Software Foundation (ASF) under one
-* or more contributor license agreements. See the NOTICE file
-* distributed with this work for additional information
-* regarding copyright ownership. The ASF licenses this file
-* to you under the Apache License, Version 2.0 (the
-* "License"); you may not use this file except in compliance
-* with the License. You may obtain a copy of the License at
-*
-* http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing,
-* software distributed under the License is distributed on an
-* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-* KIND, either express or implied. See the License for the
-* specific language governing permissions and limitations
-* under the License.
-*
-*************************************************************/
-#include "singa/singa_config.h"
-#ifdef USE_OPENCV
-#include "ilsvrc12.h"
-#include "singa/utils/channel.h"
-#include "singa/utils/string.h"
-int main(int argc, char **argv) {
- int pos = singa::ArgPos(argc, argv, "-h");
- if (pos != -1) {
- std::cout << "Usage:\n"
- << "\t-trainlist <file>: the file of training list;\n"
- << "\t-trainfolder <folder>: the folder of training images;\n"
- << "\t-testlist <file>: the file of test list;\n"
- << "\t-testfolder <floder>: the folder of test images;\n"
- << "\t-outdata <folder>: the folder to save output files;\n"
- << "\t-filesize <int>: number of training images that stores in "
- "each binary file.\n";
- return 0;
- }
- pos = singa::ArgPos(argc, argv, "-trainlist");
- string train_image_list = "imagenet/label/train.txt";
- if (pos != -1) train_image_list = argv[pos + 1];
-
- pos = singa::ArgPos(argc, argv, "-trainfolder");
- string train_image_folder = "imagenet/ILSVRC2012_img_train";
- if (pos != -1) train_image_folder = argv[pos + 1];
-
- pos = singa::ArgPos(argc, argv, "-testlist");
- string test_image_list = "imagenet/label/val.txt";
- if (pos != -1) test_image_list = argv[pos + 1];
-
- pos = singa::ArgPos(argc, argv, "-testfolder");
- string test_image_folder = "imagenet/ILSVRC2012_img_val";
- if (pos != -1) test_image_folder = argv[pos + 1];
-
- pos = singa::ArgPos(argc, argv, "-outdata");
- string bin_folder = "imagenet_data";
- if (pos != -1) bin_folder = argv[pos + 1];
-
- pos = singa::ArgPos(argc, argv, "-filesize");
- size_t train_file_size = 1280;
- if (pos != -1) train_file_size = atoi(argv[pos + 1]);
- singa::ILSVRC data;
- LOG(INFO) << "Creating training and test data...";
- data.CreateTrainData(train_image_list, train_image_folder, bin_folder,
- train_file_size);
- data.CreateTestData(test_image_list, test_image_folder, bin_folder);
- LOG(INFO) << "Data created!";
- return 0;
-}
-#endif // USE_OPENCV