Posted to common-commits@hadoop.apache.org by su...@apache.org on 2018/09/28 18:31:37 UTC

[2/2] hadoop git commit: YARN-8800. Updated documentation of Submarine with latest examples. Contributed by Wangda Tan.

YARN-8800. Updated documentation of Submarine with latest examples. Contributed by Wangda Tan.


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/19ad5be6
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/19ad5be6
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/19ad5be6

Branch: refs/heads/trunk
Commit: 19ad5be6517765b31b4afaccbaadc8268627c568
Parents: 72891fc
Author: Sunil G <su...@apache.org>
Authored: Sat Sep 29 00:01:04 2018 +0530
Committer: Sunil G <su...@apache.org>
Committed: Sat Sep 29 00:01:04 2018 +0530

----------------------------------------------------------------------
 hadoop-project/src/site/site.xml                |   6 +
 .../hadoop-yarn-submarine/README.md             |   6 +-
 .../base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0   |  69 +++
 .../base/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0   |  67 +++
 .../src/main/docker/build-all.sh                |  32 ++
 .../ubuntu-16.04/Dockerfile.cpu.tf_1.8.0        |  22 +
 .../ubuntu-16.04/Dockerfile.gpu.tf_1.8.0        |  22 +
 .../cifar10_estimator_tf_1.8.0/README.md        | 542 ++++++++++++++++++
 .../cifar10_estimator_tf_1.8.0/cifar10.py       | 113 ++++
 .../cifar10_estimator_tf_1.8.0/cifar10_main.py  | 521 +++++++++++++++++
 .../cifar10_estimator_tf_1.8.0/cifar10_model.py |  80 +++
 .../cifar10_estimator_tf_1.8.0/cifar10_utils.py | 154 +++++
 .../generate_cifar10_tfrecords.py               | 114 ++++
 .../cifar10_estimator_tf_1.8.0/model_base.py    | 219 +++++++
 .../zeppelin-notebook-example/Dockerfile.gpu    |  75 +++
 .../zeppelin-notebook-example/run_container.sh  |  22 +
 .../docker/zeppelin-notebook-example/shiro.ini  | 120 ++++
 .../zeppelin-notebook-example/zeppelin-site.xml | 569 +++++++++++++++++++
 .../src/site/DeveloperGuide.md                  |  26 -
 .../src/site/QuickStart.md                      | 134 -----
 .../src/site/markdown/DeveloperGuide.md         |  24 +
 .../src/site/markdown/Examples.md               |  21 +
 .../src/site/markdown/Index.md                  |  42 ++
 .../src/site/markdown/QuickStart.md             | 174 ++++++
 .../markdown/RunningDistributedCifar10TFJobs.md | 162 ++++++
 .../src/site/markdown/RunningZeppelinOnYARN.md  |  37 ++
 .../src/site/markdown/WriteDockerfile.md        | 117 ++++
 .../src/site/resources/css/site.css             |  29 +
 .../src/site/resources/images/job-logs-ui.png   | Bin 0 -> 229944 bytes
 .../images/multiple-tensorboard-jobs.png        | Bin 0 -> 184717 bytes
 .../resources/images/tensorboard-service.png    | Bin 0 -> 107567 bytes
 .../hadoop-yarn-submarine/src/site/site.xml     |  28 +
 32 files changed, 3385 insertions(+), 162 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-project/src/site/site.xml
----------------------------------------------------------------------
diff --git a/hadoop-project/src/site/site.xml b/hadoop-project/src/site/site.xml
index b40dbfc..2b6058e 100644
--- a/hadoop-project/src/site/site.xml
+++ b/hadoop-project/src/site/site.xml
@@ -180,6 +180,12 @@
       <item name="System Services" href="hadoop-yarn/hadoop-yarn-site/yarn-service/SystemServices.html"/>
     </menu>
 
+    <menu name="Submarine" inherit="top">
+      <item name="Index" href="hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/Index.html"/>
+      <item name="QuickStart" href="hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/QuickStart.html"/>
+      <item name="Examples" href="hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/Examples.html"/>
+    </menu>
+
     <menu name="Hadoop Compatible File Systems" inherit="top">
       <item name="Aliyun OSS" href="hadoop-aliyun/tools/hadoop-aliyun/index.html"/>
       <item name="Amazon S3" href="hadoop-aws/tools/hadoop-aws/index.html"/>

http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/README.md
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/README.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/README.md
index 3e04730..cb2e2da 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/README.md
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/README.md
@@ -48,6 +48,8 @@ Goals of Submarine:
 - Support launch tensorboard for training jobs if user specified.
 - Support customized DNS name for roles (like tensorboard.$user.$domain:6006)
 
-Please jump to [QuickStart](src/site/QuickStart.md) guide to quickly understand how to use this framework.
+Please jump to [QuickStart](src/site/markdown/QuickStart.md) guide to quickly understand how to use this framework.
 
-If you're a developer, please find [Developer](src/site/DeveloperGuide.md) guide for more details.
+Please jump to [Examples](src/site/markdown/Examples.md) to try other examples like running Distributed Tensorflow Training for CIFAR 10.
+
+If you're a developer, please find [Developer](src/site/markdown/DeveloperGuide.md) guide for more details.

http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0
new file mode 100644
index 0000000..f2446a7
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM ubuntu:16.04
+
+LABEL maintainer="Craig Citro <cr...@google.com>"
+
+# Pick up some TF dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        libfreetype6-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        python \
+        python-dev \
+        rsync \
+        software-properties-common \
+        unzip \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
+    python get-pip.py && \
+    rm get-pip.py
+
+RUN pip --no-cache-dir install \
+        Pillow \
+        h5py \
+        ipykernel \
+        jupyter \
+        matplotlib \
+        numpy \
+        pandas \
+        scipy \
+        sklearn \
+        && \
+    python -m ipykernel.kernelspec
+
+# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
+# These lines will be edited automatically by parameterized_docker_build.sh. #
+# COPY _PIP_FILE_ /
+# RUN pip --no-cache-dir install /_PIP_FILE_
+# RUN rm -f /_PIP_FILE_
+
+# Install TensorFlow CPU version from central repo
+RUN pip --no-cache-dir install \
+    http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl
+
+RUN apt-get update && apt-get install git -y
+
+RUN apt-get update && apt-get install -y openjdk-8-jdk wget
+RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz
+RUN tar zxf hadoop-3.1.1.tar.gz
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/base/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/base/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/base/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0
new file mode 100644
index 0000000..dee6e19
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/base/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0
@@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+
+# Pick up some TF dependencies
+RUN apt-get update && apt-get install -y --allow-downgrades --no-install-recommends \
+        build-essential \
+        cuda-command-line-tools-9-0 \
+        cuda-cublas-9-0 \
+        cuda-cufft-9-0 \
+        cuda-curand-9-0 \
+        cuda-cusolver-9-0 \
+        cuda-cusparse-9-0 \
+        curl \
+        libcudnn7=7.0.5.15-1+cuda9.0 \
+        libfreetype6-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        pkg-config \
+        python \
+        python-dev \
+        rsync \
+        software-properties-common \
+        unzip \
+        && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
+    python get-pip.py && \
+    rm get-pip.py
+
+RUN pip --no-cache-dir install \
+        Pillow \
+        h5py \
+        ipykernel \
+        jupyter \
+        matplotlib \
+        numpy \
+        pandas \
+        scipy \
+        sklearn \
+        && \
+    python -m ipykernel.kernelspec
+
+# Install TensorFlow GPU version.
+RUN pip --no-cache-dir install \
+    http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl
+RUN apt-get update && apt-get install git -y
+
+RUN apt-get update && apt-get install -y openjdk-8-jdk wget
+RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz
+RUN tar zxf hadoop-3.1.0.tar.gz
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/build-all.sh
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/build-all.sh b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/build-all.sh
new file mode 100755
index 0000000..ad3a935
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/build-all.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+echo "Building base images"
+
+set -e
+
+cd base/ubuntu-16.04
+
+docker build . -f Dockerfile.cpu.tf_1.8.0 -t tf-1.8.0-cpu-base:0.0.1
+docker build . -f Dockerfile.gpu.tf_1.8.0 -t tf-1.8.0-gpu-base:0.0.1
+
+echo "Finished building base images"
+
+cd ../../with-cifar10-models/ubuntu-16.04
+
+docker build . -f Dockerfile.cpu.tf_1.8.0 -t tf-1.8.0-cpu:0.0.1
+docker build . -f Dockerfile.gpu.tf_1.8.0 -t tf-1.8.0-gpu:0.0.1

http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0
new file mode 100644
index 0000000..1087d61
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM tf-1.8.0-cpu-base:0.0.1
+
+# Include models
+RUN mkdir /test
+ADD cifar10_estimator_tf_1.8.0 /test/cifar10_estimator
+RUN chown -R nobody /test
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0
new file mode 100644
index 0000000..d1f829f
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM tf-1.8.0-gpu-base:0.0.1
+
+# Include models
+RUN mkdir /test
+ADD cifar10_estimator_tf_1.8.0 /test/cifar10_estimator
+RUN chown -R nobody /test
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/README.md
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/README.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/README.md
new file mode 100644
index 0000000..5b4ae34
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/README.md
@@ -0,0 +1,542 @@
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+(Copied from https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10_estimator)
+
+CIFAR-10 is a common benchmark in machine learning for image recognition.
+
+http://www.cs.toronto.edu/~kriz/cifar.html
+
+Code in this directory focuses on how to use TensorFlow Estimators to train and
+evaluate a CIFAR-10 ResNet model on:
+
+* A single host with one CPU;
+* A single host with multiple GPUs;
+* Multiple hosts with CPU or multiple GPUs;
+
+Before trying to run the model, we highly encourage you to read this entire README.
+
+## Prerequisite
+
+1. [Install](https://www.tensorflow.org/install/) TensorFlow version 1.2.1 or
+later.
+
+2. Download the CIFAR-10 dataset and generate TFRecord files using the provided
+script.  The script and associated command below will download the CIFAR-10
+dataset and then generate a TFRecord for the training, validation, and
+evaluation datasets.
+
+```shell
+python generate_cifar10_tfrecords.py --data-dir=${PWD}/cifar-10-data
+```
+
+After running the command above, you should see the following files in the
+--data-dir (```ls -R cifar-10-data```):
+
+* train.tfrecords
+* validation.tfrecords
+* eval.tfrecords
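+
+To sanity-check the generated files, one optional verification step (not part
+of the tutorial itself) is to count the records with the
+`tf.python_io.tf_record_iterator` API; for `train.tfrecords` this should print
+45000:
+
+```shell
+# Optional sanity check (illustrative, not part of this commit):
+python -c "import tensorflow as tf; \
+print(sum(1 for _ in tf.python_io.tf_record_iterator('cifar-10-data/train.tfrecords')))"
+```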
+
+
+## Training on a single machine with GPUs or CPU
+
+Run the training on CPU only. After training, it runs the evaluation.
+
+```
+python cifar10_main.py --data-dir=${PWD}/cifar-10-data \
+                       --job-dir=/tmp/cifar10 \
+                       --num-gpus=0 \
+                       --train-steps=1000
+```
+
+Run the model on 2 GPUs using CPU as parameter server. After training, it runs
+the evaluation.
+```
+python cifar10_main.py --data-dir=${PWD}/cifar-10-data \
+                       --job-dir=/tmp/cifar10 \
+                       --num-gpus=2 \
+                       --train-steps=1000
+```
+
+Run the model on 2 GPUs using GPU as parameter server.
+It will run an experiment, which in a local setting basically means it will
+stop training a couple of times to perform evaluation.
+
+```
+python cifar10_main.py --data-dir=${PWD}/cifar-10-data \
+                       --job-dir=/tmp/cifar10 \
+                       --variable-strategy GPU \
+                       --num-gpus=2
+```
+
+There are more command line flags to play with; run
+`python cifar10_main.py --help` for details.
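+
+For instance, a CPU-only run that overrides a few of the model
+hyperparameters could look like the following sketch (the flag names mirror
+the hyperparameters used in `cifar10_main.py`; the values here are purely
+illustrative, so check `--help` for defaults and constraints):
+
+```shell
+# Illustrative values only; run --help for defaults and constraints.
+python cifar10_main.py --data-dir=${PWD}/cifar-10-data \
+                       --job-dir=/tmp/cifar10 \
+                       --num-gpus=0 \
+                       --num-layers=44 \
+                       --train-batch-size=128 \
+                       --learning-rate=0.1 \
+                       --train-steps=1000
+```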
+
+## Run distributed training
+
+### (Optional) Running on Google Cloud Machine Learning Engine
+
+This example can be run on Google Cloud Machine Learning Engine (ML Engine),
+which will configure the environment and take care of running workers,
+parameter servers, and masters in a fault-tolerant way.
+
+To install the command line tool, and set up a project and billing, see the
+quickstart [here](https://cloud.google.com/ml-engine/docs/quickstarts/command-line).
+
+You'll also need a Google Cloud Storage bucket for the data. If you followed the
+instructions above, you can just run:
+
+```
+MY_BUCKET=gs://<my-bucket-name>
+gsutil cp -r ${PWD}/cifar-10-data $MY_BUCKET/
+```
+
+Then run the following command from the `tutorials/image` directory of this
+repository (the parent directory of this README):
+
+```
+gcloud ml-engine jobs submit training cifarmultigpu \
+    --runtime-version 1.2 \
+    --job-dir=$MY_BUCKET/model_dirs/cifarmultigpu \
+    --config cifar10_estimator/cmle_config.yaml \
+    --package-path cifar10_estimator/ \
+    --module-name cifar10_estimator.cifar10_main \
+    -- \
+    --data-dir=$MY_BUCKET/cifar-10-data \
+    --num-gpus=4 \
+    --train-steps=1000
+```
+
+
+### Set TF_CONFIG
+
+Considering that you already have multiple hosts configured, all you need is a
+`TF_CONFIG` environment variable on each host. You can set up the hosts manually
+or check [tensorflow/ecosystem](https://github.com/tensorflow/ecosystem) for
+instructions about how to set up a cluster.
+
+The `TF_CONFIG` will be used by the `RunConfig` to know the existing hosts and
+their task: `master`, `ps` or `worker`.
+
+Here's an example of `TF_CONFIG`.
+
+```python
+cluster = {'master': ['master-ip:8000'],
+           'ps': ['ps-ip:8000'],
+           'worker': ['worker-ip:8000']}
+
+TF_CONFIG = json.dumps(
+  {'cluster': cluster,
+   'task': {'type': 'master', 'index': 0},
+   'model_dir': 'gs://<bucket_path>/<dir_path>',
+   'environment': 'cloud'
+  })
+```
+
+*Cluster*
+
+A cluster spec, which is basically a dictionary that describes all of the tasks
+in the cluster. More about it [here](https://www.tensorflow.org/deploy/distributed).
+
+In this cluster spec we are defining a cluster with 1 master, 1 ps and 1 worker.
+
+* `ps`: stores the parameters shared by all workers. All workers can
+   read/write/update the parameters of the model via the ps. As some models
+   are extremely large, the parameters are split among the ps nodes (each ps
+   stores a subset).
+
+* `worker`: does the training.
+
+* `master`: basically a special worker: it does training, but also restores
+   and saves checkpoints and does evaluation.
+
+*Task*
+
+The task defines the role of the current node; in this example the node is
+the master at index 0 in the cluster spec. The task will be different for
+each node. An example of the `TF_CONFIG` for a worker would be:
+
+```python
+cluster = {'master': ['master-ip:8000'],
+           'ps': ['ps-ip:8000'],
+           'worker': ['worker-ip:8000']}
+
+TF_CONFIG = json.dumps(
+  {'cluster': cluster,
+   'task': {'type': 'worker', 'index': 0},
+   'model_dir': 'gs://<bucket_path>/<dir_path>',
+   'environment': 'cloud'
+  })
+```
+
+*Model_dir*
+
+This is the path where the master will save the checkpoints, graph, and
+TensorBoard files. For a multi-host environment you may want to use a
+distributed file system; Google Cloud Storage and DFS are supported.
+
+*Environment*
+
+By default the environment is *local*; for a distributed setting we need to
+change it to *cloud*.
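+
+In practice this means exporting `TF_CONFIG` as a JSON string before starting
+the script on each host. A minimal sketch for the master host, using the same
+placeholder IPs and paths as the examples above:
+
+```shell
+# Sketch for the master host; IPs and paths are placeholders.
+export TF_CONFIG='{
+  "cluster": {"master": ["master-ip:8000"],
+              "ps": ["ps-ip:8000"],
+              "worker": ["worker-ip:8000"]},
+  "task": {"type": "master", "index": 0},
+  "model_dir": "gs://<bucket_path>/<dir_path>",
+  "environment": "cloud"
+}'
+```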
+
+### Running script
+
+Once you have `TF_CONFIG` configured properly on each host, you're ready to
+run in a distributed setting.
+
+#### Master
+Run this on the master. It runs an Experiment in sync mode on 4 GPUs, using
+the CPU as parameter server, for 40000 steps. It will run evaluation a couple
+of times during training. The num_workers argument is used only to update the
+learning rate correctly. Make sure the model_dir is the same as the one
+defined in the TF_CONFIG.
+
+```shell
+python cifar10_main.py --data-dir=gs://path/cifar-10-data \
+                       --job-dir=gs://path/model_dir/ \
+                       --num-gpus=4 \
+                       --train-steps=40000 \
+                       --sync \
+                       --num-workers=2
+```
+
+*Output:*
+
+```shell
+INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/
+INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'master', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fd16fb2be10>, '_model_dir': 'gs://path/model_dir/', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': intra_op_parallelism_threads: 1
+gpu_options {
+}
+allow_soft_placement: true
+, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1, '_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options {
+  per_process_gpu_memory_fraction: 1.0
+}
+, '_evaluation_master': '', '_master': u'grpc://master-ip:8000'}
+...
+2017-08-01 19:59:26.496208: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 0 with properties:
+name: Tesla K80
+major: 3 minor: 7 memoryClockRate (GHz) 0.8235
+pciBusID 0000:00:04.0
+Total memory: 11.17GiB
+Free memory: 11.09GiB
+2017-08-01 19:59:26.775660: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 1 with properties:
+name: Tesla K80
+major: 3 minor: 7 memoryClockRate (GHz) 0.8235
+pciBusID 0000:00:05.0
+Total memory: 11.17GiB
+Free memory: 11.10GiB
+...
+2017-08-01 19:59:29.675171: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64)
+INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11)
+INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=1; total_num_replicas=1
+INFO:tensorflow:Create CheckpointSaverHook.
+INFO:tensorflow:Restoring parameters from gs://path/model_dir/model.ckpt-0
+2017-08-01 19:59:37.560775: I tensorflow/core/distributed_runtime/master_session.cc:999] Start master session 156fcb55fe6648d6 with config:
+intra_op_parallelism_threads: 1
+gpu_options {
+  per_process_gpu_memory_fraction: 1
+}
+allow_soft_placement: true
+
+INFO:tensorflow:Saving checkpoints for 1 into gs://path/model_dir/model.ckpt.
+INFO:tensorflow:loss = 1.20682, step = 1
+INFO:tensorflow:loss = 1.20682, learning_rate = 0.1
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64)
+INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11)
+INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=2; total_num_replicas=2
+INFO:tensorflow:Starting evaluation at 2017-08-01-20:00:14
+2017-08-01 20:00:15.745881: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K80, pci bus id: 0000:00:04.0)
+2017-08-01 20:00:15.745949: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:1) -> (device: 1, name: Tesla K80, pci bus id: 0000:00:05.0)
+2017-08-01 20:00:15.745958: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:2) -> (device: 2, name: Tesla K80, pci bus id: 0000:00:06.0)
+2017-08-01 20:00:15.745964: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:3) -> (device: 3, name: Tesla K80, pci bus id: 0000:00:07.0)
+2017-08-01 20:00:15.745969: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:4) -> (device: 4, name: Tesla K80, pci bus id: 0000:00:08.0)
+2017-08-01 20:00:15.745975: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:5) -> (device: 5, name: Tesla K80, pci bus id: 0000:00:09.0)
+2017-08-01 20:00:15.745987: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:6) -> (device: 6, name: Tesla K80, pci bus id: 0000:00:0a.0)
+2017-08-01 20:00:15.745997: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:7) -> (device: 7, name: Tesla K80, pci bus id: 0000:00:0b.0)
+INFO:tensorflow:Restoring parameters from gs://path/model_dir/model.ckpt-10023
+INFO:tensorflow:Evaluation [1/100]
+INFO:tensorflow:Evaluation [2/100]
+INFO:tensorflow:Evaluation [3/100]
+INFO:tensorflow:Evaluation [4/100]
+INFO:tensorflow:Evaluation [5/100]
+INFO:tensorflow:Evaluation [6/100]
+INFO:tensorflow:Evaluation [7/100]
+INFO:tensorflow:Evaluation [8/100]
+INFO:tensorflow:Evaluation [9/100]
+INFO:tensorflow:Evaluation [10/100]
+INFO:tensorflow:Evaluation [11/100]
+INFO:tensorflow:Evaluation [12/100]
+INFO:tensorflow:Evaluation [13/100]
+...
+INFO:tensorflow:Evaluation [100/100]
+INFO:tensorflow:Finished evaluation at 2017-08-01-20:00:31
+INFO:tensorflow:Saving dict for global step 1: accuracy = 0.0994, global_step = 1, loss = 630.425
+```
+
+#### Worker
+
+Run this on the worker. It runs an Experiment in sync mode on 4 GPUs, using
+the CPU as parameter server, for 40000 steps. It will run evaluation a couple
+of times during training. Make sure the model_dir is the same as the one
+defined in the TF_CONFIG.
+
+```shell
+python cifar10_main.py --data-dir=gs://path/cifar-10-data \
+                       --job-dir=gs://path/model_dir/ \
+                       --num-gpus=4 \
+                       --train-steps=40000 \
+                       --sync
+```
+
+*Output:*
+
+```shell
+INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/
+INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600,
+'_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'worker',
+'_is_chief': False, '_cluster_spec':
+<tensorflow.python.training.server_lib.ClusterSpec object at 0x7f6918438e10>,
+'_model_dir': 'gs://<path>/model_dir/',
+'_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000,
+'_session_config': intra_op_parallelism_threads: 1
+gpu_options {
+}
+allow_soft_placement: true
+, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1,
+'_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options {
+  per_process_gpu_memory_fraction: 1.0
+  }
+...
+2017-08-01 19:59:26.496208: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 0 with properties:
+name: Tesla K80
+major: 3 minor: 7 memoryClockRate (GHz) 0.8235
+pciBusID 0000:00:04.0
+Total memory: 11.17GiB
+Free memory: 11.09GiB
+2017-08-01 19:59:26.775660: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 1 with properties:
+name: Tesla K80
+major: 3 minor: 7 memoryClockRate (GHz) 0.8235
+pciBusID 0000:00:05.0
+Total memory: 11.17GiB
+Free memory: 11.10GiB
+...
+2017-08-01 19:59:29.675171: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64)
+INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11)
+INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=2; total_num_replicas=2
+INFO:tensorflow:Create CheckpointSaverHook.
+2017-07-31 22:38:04.629150: I
+tensorflow/core/distributed_runtime/master.cc:209] CreateSession still waiting
+for response from worker: /job:master/replica:0/task:0
+2017-07-31 22:38:09.263492: I
+tensorflow/core/distributed_runtime/master_session.cc:999] Start master
+session cc58f93b1e259b0c with config:
+intra_op_parallelism_threads: 1
+gpu_options {
+per_process_gpu_memory_fraction: 1
+}
+allow_soft_placement: true
+INFO:tensorflow:loss = 5.82382, step = 0
+INFO:tensorflow:loss = 5.82382, learning_rate = 0.8
+INFO:tensorflow:Average examples/sec: 1116.92 (1116.92), step = 10
+INFO:tensorflow:Average examples/sec: 1233.73 (1377.83), step = 20
+INFO:tensorflow:Average examples/sec: 1485.43 (2509.3), step = 30
+INFO:tensorflow:Average examples/sec: 1680.27 (2770.39), step = 40
+INFO:tensorflow:Average examples/sec: 1825.38 (2788.78), step = 50
+INFO:tensorflow:Average examples/sec: 1929.32 (2697.27), step = 60
+INFO:tensorflow:Average examples/sec: 2015.17 (2749.05), step = 70
+INFO:tensorflow:loss = 37.6272, step = 79 (19.554 sec)
+INFO:tensorflow:loss = 37.6272, learning_rate = 0.8 (19.554 sec)
+INFO:tensorflow:Average examples/sec: 2074.92 (2618.36), step = 80
+INFO:tensorflow:Average examples/sec: 2132.71 (2744.13), step = 90
+INFO:tensorflow:Average examples/sec: 2183.38 (2777.21), step = 100
+INFO:tensorflow:Average examples/sec: 2224.4 (2739.03), step = 110
+INFO:tensorflow:Average examples/sec: 2240.28 (2431.26), step = 120
+INFO:tensorflow:Average examples/sec: 2272.12 (2739.32), step = 130
+INFO:tensorflow:Average examples/sec: 2300.68 (2750.03), step = 140
+INFO:tensorflow:Average examples/sec: 2325.81 (2745.63), step = 150
+INFO:tensorflow:Average examples/sec: 2347.14 (2721.53), step = 160
+INFO:tensorflow:Average examples/sec: 2367.74 (2754.54), step = 170
+INFO:tensorflow:loss = 27.8453, step = 179 (18.893 sec)
+...
+```
+
+#### PS
+
+Run this on the ps. The ps does not do any training, so most of the arguments
+won't affect the execution.
+
+```shell
+python cifar10_main.py --job-dir=gs://path/model_dir/
+```
+
+*Output:*
+
+```shell
+INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/
+INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'ps', '_is_chief': False, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f48f1addf90>, '_model_dir': 'gs://path/model_dir/', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': intra_op_parallelism_threads: 1
+gpu_options {
+}
+allow_soft_placement: true
+, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1, '_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options {
+  per_process_gpu_memory_fraction: 1.0
+}
+, '_evaluation_master': '', '_master': u'grpc://master-ip:8000'}
+2017-07-31 22:54:58.928088: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> master-ip:8000}
+2017-07-31 22:54:58.928153: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> localhost:8000}
+2017-07-31 22:54:58.928160: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> worker-ip:8000}
+2017-07-31 22:54:58.929873: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
+```
+
+## Visualizing results with TensorBoard
+
+When using Estimators you can also visualize your data in TensorBoard, with no
+changes in your code. You can use TensorBoard to visualize your TensorFlow
+graph, plot quantitative metrics about the execution of your graph, and show
+additional data like images that pass through it.
+
+Point TensorBoard to the `job dir` parameter you used to train or evaluate
+your model and you'll see the graph, metrics, and images collected during the
+run.
+
+You can check TensorBoard during training or after it finishes; just point it
+to the model_dir you chose in the previous step.
+
+```shell
+tensorboard --logdir="<job dir>"
+```
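+
+For the distributed example above, the same command can point at the GCS path
+used as model_dir (assuming TensorFlow was installed with GCS support and
+credentials are configured):
+
+```shell
+# Assumes GCS support and credentials; path matches the distributed example.
+tensorboard --logdir="gs://path/model_dir/"
+```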
+
+## Warnings
+
+When running `cifar10_main.py` with the `--sync` argument, you may see an error
+similar to:
+
+```python
+File "cifar10_main.py", line 538, in <module>
+    tf.app.run()
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 48, in run
+    _sys.exit(main(_sys.argv[:1] + flags_passthrough))
+File "cifar10_main.py", line 518, in main
+    hooks), run_config=config)
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/learn_runner.py", line 210, in run
+    return _execute_schedule(experiment, schedule)
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/learn_runner.py", line 47, in _execute_schedule
+    return task()
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/experiment.py", line 501, in train_and_evaluate
+    hooks=self._eval_hooks)
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/experiment.py", line 681, in _call_evaluate
+    hooks=hooks)
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 292, in evaluate
+    name=name)
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 638, in _evaluate_model
+    features, labels, model_fn_lib.ModeKeys.EVAL)
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 545, in _call_model_fn
+    features=features, labels=labels, **kwargs)
+File "cifar10_main.py", line 331, in _resnet_model_fn
+    gradvars, global_step=tf.train.get_global_step())
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/sync_replicas_optimizer.py", line 252, in apply_gradients
+    variables.global_variables())
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 170, in wrapped
+    return _add_should_use_warning(fn(*args, **kwargs))
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 139, in _add_should_use_warning
+    wrapped = TFShouldUseWarningWrapper(x)
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 96, in __init__
+    stack = [s.strip() for s in traceback.format_stack()]
+```
+
+This should not affect your training, and should be fixed in future releases.

http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10.py
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10.py
new file mode 100644
index 0000000..6903e8d
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10.py
@@ -0,0 +1,113 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""CIFAR-10 data set.
+
+See http://www.cs.toronto.edu/~kriz/cifar.html.
+"""
+import os
+
+import tensorflow as tf
+
+HEIGHT = 32
+WIDTH = 32
+DEPTH = 3
+
+
+class Cifar10DataSet(object):
+  """Cifar10 data set.
+
+  Described by http://www.cs.toronto.edu/~kriz/cifar.html.
+  """
+
+  def __init__(self, data_dir, subset='train', use_distortion=True):
+    self.data_dir = data_dir
+    self.subset = subset
+    self.use_distortion = use_distortion
+
+  def get_filenames(self):
+    if self.subset in ['train', 'validation', 'eval']:
+      return [os.path.join(self.data_dir, self.subset + '.tfrecords')]
+    else:
+      raise ValueError('Invalid data subset "%s"' % self.subset)
+
+  def parser(self, serialized_example):
+    """Parses a single tf.Example into image and label tensors."""
+    # Dimensions of the images in the CIFAR-10 dataset.
+    # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the
+    # input format.
+    features = tf.parse_single_example(
+        serialized_example,
+        features={
+            'image': tf.FixedLenFeature([], tf.string),
+            'label': tf.FixedLenFeature([], tf.int64),
+        })
+    image = tf.decode_raw(features['image'], tf.uint8)
+    image.set_shape([DEPTH * HEIGHT * WIDTH])
+
+    # Reshape from [depth * height * width] to [depth, height, width].
+    image = tf.cast(
+        tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]),
+        tf.float32)
+    label = tf.cast(features['label'], tf.int32)
+
+    # Custom preprocessing.
+    image = self.preprocess(image)
+
+    return image, label
+
+  def make_batch(self, batch_size):
+    """Read the images and labels from 'filenames'."""
+    filenames = self.get_filenames()
+    # Repeat infinitely.
+    dataset = tf.data.TFRecordDataset(filenames).repeat()
+
+    # Parse records.
+    dataset = dataset.map(
+        self.parser)
+
+    # Potentially shuffle records.
+    if self.subset == 'train':
+      min_queue_examples = int(
+          Cifar10DataSet.num_examples_per_epoch(self.subset) * 0.4)
+      # Ensure that the capacity is sufficiently large to provide good random
+      # shuffling.
+      dataset = dataset.shuffle(buffer_size=min_queue_examples + 3 * batch_size)
+
+    # Batch it up.
+    dataset = dataset.batch(batch_size)
+    iterator = dataset.make_one_shot_iterator()
+    image_batch, label_batch = iterator.get_next()
+
+    return image_batch, label_batch
+
+  def preprocess(self, image):
+    """Preprocess a single image in [height, width, depth] layout."""
+    if self.subset == 'train' and self.use_distortion:
+      # Pad 4 pixels on each dimension of feature map, done in mini-batch
+      image = tf.image.resize_image_with_crop_or_pad(image, 40, 40)
+      image = tf.random_crop(image, [HEIGHT, WIDTH, DEPTH])
+      image = tf.image.random_flip_left_right(image)
+    return image
+
+  @staticmethod
+  def num_examples_per_epoch(subset='train'):
+    if subset == 'train':
+      return 45000
+    elif subset == 'validation':
+      return 5000
+    elif subset == 'eval':
+      return 10000
+    else:
+      raise ValueError('Invalid data subset "%s"' % subset)

http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_main.py
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_main.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_main.py
new file mode 100644
index 0000000..086c95b
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_main.py
@@ -0,0 +1,521 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ResNet model for classifying images from CIFAR-10 dataset.
+
+Support single-host training with one or multiple devices.
+
+ResNet as proposed in:
+Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
+Deep Residual Learning for Image Recognition. arXiv:1512.03385
+
+CIFAR-10 as in:
+http://www.cs.toronto.edu/~kriz/cifar.html
+
+
+"""
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import functools
+import itertools
+import os
+
+import cifar10
+import cifar10_model
+import cifar10_utils
+import numpy as np
+import six
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+
+tf.logging.set_verbosity(tf.logging.INFO)
+
+
+def get_model_fn(num_gpus, variable_strategy, num_workers):
+  """Returns a function that will build the resnet model."""
+
+  def _resnet_model_fn(features, labels, mode, params):
+    """Resnet model body.
+
+    Supports single-host training with one or more GPUs. Parameter
+    distribution can be one of the following schemes:
+    1. CPU is the parameter server and manages gradient updates.
+    2. Parameters are distributed evenly across all GPUs, and the first GPU
+       manages gradient updates.
+
+    Args:
+      features: a list of tensors, one for each tower
+      labels: a list of tensors, one for each tower
+      mode: ModeKeys.TRAIN or EVAL
+      params: Hyperparameters suitable for tuning
+    Returns:
+      An EstimatorSpec object.
+    """
+    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
+    weight_decay = params.weight_decay
+    momentum = params.momentum
+
+    tower_features = features
+    tower_labels = labels
+    tower_losses = []
+    tower_gradvars = []
+    tower_preds = []
+
+    # channels first (NCHW) is normally optimal on GPU and channels last (NHWC)
+    # on CPU. The exception is Intel MKL on CPU which is optimal with
+    # channels_last.
+    data_format = params.data_format
+    if not data_format:
+      if num_gpus == 0:
+        data_format = 'channels_last'
+      else:
+        data_format = 'channels_first'
+
+    if num_gpus == 0:
+      num_devices = 1
+      device_type = 'cpu'
+    else:
+      num_devices = num_gpus
+      device_type = 'gpu'
+
+    for i in range(num_devices):
+      worker_device = '/{}:{}'.format(device_type, i)
+      if variable_strategy == 'CPU':
+        device_setter = cifar10_utils.local_device_setter(
+            worker_device=worker_device)
+      elif variable_strategy == 'GPU':
+        device_setter = cifar10_utils.local_device_setter(
+            ps_device_type='gpu',
+            worker_device=worker_device,
+            ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
+                num_gpus, tf.contrib.training.byte_size_load_fn))
+      with tf.variable_scope('resnet', reuse=bool(i != 0)):
+        with tf.name_scope('tower_%d' % i) as name_scope:
+          with tf.device(device_setter):
+            loss, gradvars, preds = _tower_fn(
+                is_training, weight_decay, tower_features[i], tower_labels[i],
+                data_format, params.num_layers, params.batch_norm_decay,
+                params.batch_norm_epsilon)
+            tower_losses.append(loss)
+            tower_gradvars.append(gradvars)
+            tower_preds.append(preds)
+            if i == 0:
+              # Only trigger batch_norm moving mean and variance update from
+              # the 1st tower. Ideally, we should grab the updates from all
+              # towers but these stats accumulate extremely fast so we can
+              # ignore the other stats from the other towers without
+              # significant detriment.
+              update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
+                                             name_scope)
+
+    # Now compute global loss and gradients.
+    gradvars = []
+    with tf.name_scope('gradient_averaging'):
+      all_grads = {}
+      for grad, var in itertools.chain(*tower_gradvars):
+        if grad is not None:
+          all_grads.setdefault(var, []).append(grad)
+      for var, grads in six.iteritems(all_grads):
+        # Average gradients on the same device as the variables
+        # to which they apply.
+        with tf.device(var.device):
+          if len(grads) == 1:
+            avg_grad = grads[0]
+          else:
+            avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads))
+        gradvars.append((avg_grad, var))
+
+    # Device that runs the ops to apply global gradient updates.
+    consolidation_device = '/gpu:0' if variable_strategy == 'GPU' else '/cpu:0'
+    with tf.device(consolidation_device):
+      # Suggested learning rate scheduling from
+      # https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155
+      num_batches_per_epoch = cifar10.Cifar10DataSet.num_examples_per_epoch(
+          'train') // (params.train_batch_size * num_workers)
+      boundaries = [
+          num_batches_per_epoch * x
+          for x in np.array([82, 123, 300], dtype=np.int64)
+      ]
+      staged_lr = [params.learning_rate * x for x in [1, 0.1, 0.01, 0.002]]
+
+      learning_rate = tf.train.piecewise_constant(tf.train.get_global_step(),
+                                                  boundaries, staged_lr)
+
+      loss = tf.reduce_mean(tower_losses, name='loss')
+
+      examples_sec_hook = cifar10_utils.ExamplesPerSecondHook(
+          params.train_batch_size, every_n_steps=10)
+
+      tensors_to_log = {'learning_rate': learning_rate, 'loss': loss}
+
+      logging_hook = tf.train.LoggingTensorHook(
+          tensors=tensors_to_log, every_n_iter=100)
+
+      train_hooks = [logging_hook, examples_sec_hook]
+
+      optimizer = tf.train.MomentumOptimizer(
+          learning_rate=learning_rate, momentum=momentum)
+
+      if params.sync:
+        optimizer = tf.train.SyncReplicasOptimizer(
+            optimizer, replicas_to_aggregate=num_workers)
+        sync_replicas_hook = optimizer.make_session_run_hook(params.is_chief)
+        train_hooks.append(sync_replicas_hook)
+
+      # Create single grouped train op
+      train_op = [
+          optimizer.apply_gradients(
+              gradvars, global_step=tf.train.get_global_step())
+      ]
+      train_op.extend(update_ops)
+      train_op = tf.group(*train_op)
+
+      predictions = {
+          'classes':
+              tf.concat([p['classes'] for p in tower_preds], axis=0),
+          'probabilities':
+              tf.concat([p['probabilities'] for p in tower_preds], axis=0)
+      }
+      stacked_labels = tf.concat(labels, axis=0)
+      metrics = {
+          'accuracy':
+              tf.metrics.accuracy(stacked_labels, predictions['classes'])
+      }
+
+    return tf.estimator.EstimatorSpec(
+        mode=mode,
+        predictions=predictions,
+        loss=loss,
+        train_op=train_op,
+        training_hooks=train_hooks,
+        eval_metric_ops=metrics)
+
+  return _resnet_model_fn
+
+
+def _tower_fn(is_training, weight_decay, feature, label, data_format,
+              num_layers, batch_norm_decay, batch_norm_epsilon):
+  """Build computation tower (Resnet).
+
+  Args:
+    is_training: true if is training graph.
+    weight_decay: weight regularization strength, a float.
+    feature: a Tensor.
+    label: a Tensor.
+    data_format: channels_last (NHWC) or channels_first (NCHW).
+    num_layers: number of layers, an int.
+    batch_norm_decay: decay for batch normalization, a float.
+    batch_norm_epsilon: epsilon for batch normalization, a float.
+
+  Returns:
+    A tuple with the loss for the tower, the gradients and parameters, and
+    predictions.
+
+  """
+  model = cifar10_model.ResNetCifar10(
+      num_layers,
+      batch_norm_decay=batch_norm_decay,
+      batch_norm_epsilon=batch_norm_epsilon,
+      is_training=is_training,
+      data_format=data_format)
+  logits = model.forward_pass(feature, input_data_format='channels_last')
+  tower_pred = {
+      'classes': tf.argmax(input=logits, axis=1),
+      'probabilities': tf.nn.softmax(logits)
+  }
+
+  tower_loss = tf.losses.sparse_softmax_cross_entropy(
+      logits=logits, labels=label)
+  tower_loss = tf.reduce_mean(tower_loss)
+
+  model_params = tf.trainable_variables()
+  tower_loss += weight_decay * tf.add_n(
+      [tf.nn.l2_loss(v) for v in model_params])
+
+  tower_grad = tf.gradients(tower_loss, model_params)
+
+  return tower_loss, zip(tower_grad, model_params), tower_pred
+
+
+def input_fn(data_dir,
+             subset,
+             num_shards,
+             batch_size,
+             use_distortion_for_training=True):
+  """Create input graph for model.
+
+  Args:
+    data_dir: Directory where TFRecords representing the dataset are located.
+    subset: one of 'train', 'validate' and 'eval'.
+    num_shards: num of towers participating in data-parallel training.
+    batch_size: total batch size for training to be divided by the number of
+      shards.
+    use_distortion_for_training: True to use distortions.
+  Returns:
+    two lists of tensors for features and labels, each of num_shards length.
+  """
+  with tf.device('/cpu:0'):
+    use_distortion = subset == 'train' and use_distortion_for_training
+    dataset = cifar10.Cifar10DataSet(data_dir, subset, use_distortion)
+    image_batch, label_batch = dataset.make_batch(batch_size)
+    if num_shards <= 1:
+      # No GPU available or only 1 GPU.
+      return [image_batch], [label_batch]
+
+    # Note that passing num=batch_size is safe here, even though
+    # dataset.batch(batch_size) can, in some cases, return fewer than batch_size
+    # examples. This is because it does so only when repeating for a limited
+    # number of epochs, but our dataset repeats forever.
+    image_batch = tf.unstack(image_batch, num=batch_size, axis=0)
+    label_batch = tf.unstack(label_batch, num=batch_size, axis=0)
+    feature_shards = [[] for i in range(num_shards)]
+    label_shards = [[] for i in range(num_shards)]
+    for i in xrange(batch_size):
+      idx = i % num_shards
+      feature_shards[idx].append(image_batch[i])
+      label_shards[idx].append(label_batch[i])
+    feature_shards = [tf.parallel_stack(x) for x in feature_shards]
+    label_shards = [tf.parallel_stack(x) for x in label_shards]
+    return feature_shards, label_shards
+
+
+def get_experiment_fn(data_dir,
+                      num_gpus,
+                      variable_strategy,
+                      use_distortion_for_training=True):
+  """Returns an Experiment function.
+
+  Experiments perform training on several workers in parallel; in other
+  words, experiments know how to invoke train and eval in a sensible
+  fashion for distributed training. Arguments passed directly to this
+  function are not tunable; all other arguments should be passed within
+  tf.HParams to the enclosed function.
+
+  Args:
+      data_dir: str. Location of the data for input_fns.
+      num_gpus: int. Number of GPUs on each worker.
+      variable_strategy: String. CPU to use CPU as the parameter server
+        and GPU to use the GPUs as the parameter server.
+      use_distortion_for_training: bool. See cifar10.Cifar10DataSet.
+  Returns:
+      A function (tf.estimator.RunConfig, tf.contrib.training.HParams) ->
+      tf.contrib.learn.Experiment.
+
+      Suitable for use by tf.contrib.learn.learn_runner, which will run various
+      methods on Experiment (train, evaluate) based on information
+      about the current runner in `run_config`.
+  """
+
+  def _experiment_fn(run_config, hparams):
+    """Returns an Experiment."""
+    # Create estimator.
+    train_input_fn = functools.partial(
+        input_fn,
+        data_dir,
+        subset='train',
+        num_shards=num_gpus,
+        batch_size=hparams.train_batch_size,
+        use_distortion_for_training=use_distortion_for_training)
+
+    eval_input_fn = functools.partial(
+        input_fn,
+        data_dir,
+        subset='eval',
+        batch_size=hparams.eval_batch_size,
+        num_shards=num_gpus)
+
+    num_eval_examples = cifar10.Cifar10DataSet.num_examples_per_epoch('eval')
+    if num_eval_examples % hparams.eval_batch_size != 0:
+      raise ValueError(
+          'validation set size must be multiple of eval_batch_size')
+
+    train_steps = hparams.train_steps
+    eval_steps = num_eval_examples // hparams.eval_batch_size
+
+    classifier = tf.estimator.Estimator(
+        model_fn=get_model_fn(num_gpus, variable_strategy,
+                              run_config.num_worker_replicas or 1),
+        config=run_config,
+        params=hparams)
+
+    # Create experiment.
+    return tf.contrib.learn.Experiment(
+        classifier,
+        train_input_fn=train_input_fn,
+        eval_input_fn=eval_input_fn,
+        train_steps=train_steps,
+        eval_steps=eval_steps)
+
+  return _experiment_fn
+
+
+def main(job_dir, data_dir, num_gpus, variable_strategy,
+         use_distortion_for_training, log_device_placement, num_intra_threads,
+         **hparams):
+  # The env variable is on its deprecation path; the default is set to off.
+  os.environ['TF_SYNC_ON_FINISH'] = '0'
+  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
+
+  # Session configuration.
+  sess_config = tf.ConfigProto(
+      allow_soft_placement=True,
+      log_device_placement=log_device_placement,
+      intra_op_parallelism_threads=num_intra_threads,
+      gpu_options=tf.GPUOptions(force_gpu_compatible=True))
+
+  config = cifar10_utils.RunConfig(
+      session_config=sess_config, model_dir=job_dir)
+  tf.contrib.learn.learn_runner.run(
+      get_experiment_fn(data_dir, num_gpus, variable_strategy,
+                        use_distortion_for_training),
+      run_config=config,
+      hparams=tf.contrib.training.HParams(
+          is_chief=config.is_chief,
+          **hparams))
+
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      '--data-dir',
+      type=str,
+      required=True,
+      help='The directory where the CIFAR-10 input data is stored.')
+  parser.add_argument(
+      '--job-dir',
+      type=str,
+      required=True,
+      help='The directory where the model will be stored.')
+  parser.add_argument(
+      '--variable-strategy',
+      choices=['CPU', 'GPU'],
+      type=str,
+      default='CPU',
+      help='Where to locate variable operations')
+  parser.add_argument(
+      '--num-gpus',
+      type=int,
+      default=1,
+      help='The number of gpus used. Uses only CPU if set to 0.')
+  parser.add_argument(
+      '--num-layers',
+      type=int,
+      default=44,
+      help='The number of layers of the model.')
+  parser.add_argument(
+      '--train-steps',
+      type=int,
+      default=80000,
+      help='The number of steps to use for training.')
+  parser.add_argument(
+      '--train-batch-size',
+      type=int,
+      default=128,
+      help='Batch size for training.')
+  parser.add_argument(
+      '--eval-batch-size',
+      type=int,
+      default=100,
+      help='Batch size for validation.')
+  parser.add_argument(
+      '--momentum',
+      type=float,
+      default=0.9,
+      help='Momentum for MomentumOptimizer.')
+  parser.add_argument(
+      '--weight-decay',
+      type=float,
+      default=2e-4,
+      help='Weight decay for convolutions.')
+  parser.add_argument(
+      '--learning-rate',
+      type=float,
+      default=0.1,
+      help="""\
+      This is the initial learning rate value. The learning rate will decrease
+      during training. For more details check the model_fn implementation in
+      this file.\
+      """)
+  parser.add_argument(
+      '--use-distortion-for-training',
+      type=bool,
+      default=True,
+      help='If doing image distortion for training.')
+  parser.add_argument(
+      '--sync',
+      action='store_true',
+      default=False,
+      help="""\
+      If present when running in a distributed environment, run in sync mode.\
+      """)
+  parser.add_argument(
+      '--num-intra-threads',
+      type=int,
+      default=0,
+      help="""\
+      Number of threads to use for intra-op parallelism. When training on CPU
+      set to 0 to have the system pick the appropriate number or alternatively
+      set it to the number of physical CPU cores.\
+      """)
+  parser.add_argument(
+      '--num-inter-threads',
+      type=int,
+      default=0,
+      help="""\
+      Number of threads to use for inter-op parallelism. If set to 0, the
+      system will pick an appropriate number.\
+      """)
+  parser.add_argument(
+      '--data-format',
+      type=str,
+      default=None,
+      help="""\
+      If not set, the data format best for the training device is used.
+      Allowed values: channels_first (NCHW) channels_last (NHWC).\
+      """)
+  parser.add_argument(
+      '--log-device-placement',
+      action='store_true',
+      default=False,
+      help='Whether to log device placement.')
+  parser.add_argument(
+      '--batch-norm-decay',
+      type=float,
+      default=0.997,
+      help='Decay for batch norm.')
+  parser.add_argument(
+      '--batch-norm-epsilon',
+      type=float,
+      default=1e-5,
+      help='Epsilon for batch norm.')
+  args = parser.parse_args()
+
+  if args.num_gpus > 0:
+    assert tf.test.is_gpu_available(), "Requested GPUs but none found."
+  if args.num_gpus < 0:
+    raise ValueError(
+        'Invalid GPU count: \"--num-gpus\" must be 0 or a positive integer.')
+  if args.num_gpus == 0 and args.variable_strategy == 'GPU':
+    raise ValueError('num-gpus=0, CPU must be used as parameter server. Set '
+                     '--variable-strategy=CPU.')
+  if (args.num_layers - 2) % 6 != 0:
+    raise ValueError('Invalid --num-layers parameter.')
+  if args.num_gpus != 0 and args.train_batch_size % args.num_gpus != 0:
+    raise ValueError('--train-batch-size must be multiple of --num-gpus.')
+  if args.num_gpus != 0 and args.eval_batch_size % args.num_gpus != 0:
+    raise ValueError('--eval-batch-size must be multiple of --num-gpus.')
+
+  main(**vars(args))
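
A note for readers trying this example by hand: given the flags defined above, a minimal single-machine invocation might look like the sketch below. The paths are illustrative only; --data-dir and --job-dir are the two required arguments, and --num-gpus defaults to 1.

    python cifar10_main.py \
        --data-dir=/tmp/cifar-10-data \
        --job-dir=/tmp/cifar-10-jobdir \
        --num-gpus=1 \
        --train-steps=1000

Pass --num-gpus=0 (keeping the default --variable-strategy=CPU) to train on CPU only; the argument validation above rejects --variable-strategy=GPU in that case.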

http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_model.py
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_model.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_model.py
new file mode 100644
index 0000000..d67c233
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_model.py
@@ -0,0 +1,80 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model class for Cifar10 Dataset."""
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+import model_base
+
+
+class ResNetCifar10(model_base.ResNet):
+  """Cifar10 model with ResNetV1 and basic residual block."""
+
+  def __init__(self,
+               num_layers,
+               is_training,
+               batch_norm_decay,
+               batch_norm_epsilon,
+               data_format='channels_first'):
+    super(ResNetCifar10, self).__init__(
+        is_training,
+        data_format,
+        batch_norm_decay,
+        batch_norm_epsilon
+    )
+    self.n = (num_layers - 2) // 6
+    # Add one in case label starts with 1. No impact if label starts with 0.
+    self.num_classes = 10 + 1
+    self.filters = [16, 16, 32, 64]
+    self.strides = [1, 2, 2]
+
+  def forward_pass(self, x, input_data_format='channels_last'):
+    """Build the core model within the graph."""
+    if self._data_format != input_data_format:
+      if input_data_format == 'channels_last':
+        # Computation requires channels_first.
+        x = tf.transpose(x, [0, 3, 1, 2])
+      else:
+        # Computation requires channels_last.
+        x = tf.transpose(x, [0, 2, 3, 1])
+
+    # Image standardization.
+    x = x / 128 - 1
+
+    x = self._conv(x, 3, 16, 1)
+    x = self._batch_norm(x)
+    x = self._relu(x)
+
+    # Use basic (non-bottleneck) block and ResNet V1 (post-activation).
+    res_func = self._residual_v1
+
+    # 3 stages of block stacking.
+    for i in range(3):
+      with tf.name_scope('stage'):
+        for j in range(self.n):
+          if j == 0:
+            # First block in a stage, filters and strides may change.
+            x = res_func(x, 3, self.filters[i], self.filters[i + 1],
+                         self.strides[i])
+          else:
+            # Following blocks in a stage, constant filters and unit stride.
+            x = res_func(x, 3, self.filters[i + 1], self.filters[i + 1], 1)
+
+    x = self._global_avg_pool(x)
+    x = self._fully_connected(x, self.num_classes)
+
+    return x
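
As a quick sanity check of the model class above, the following sketch (not part of the patch; it assumes TensorFlow 1.8 with cifar10_model.py and model_base.py on the PYTHONPATH) builds the 44-layer network and runs a forward pass on a dummy batch:

    import numpy as np
    import tensorflow as tf

    import cifar10_model

    # (44 - 2) % 6 == 0, so n = 7 residual blocks per stage.
    model = cifar10_model.ResNetCifar10(
        num_layers=44,
        is_training=False,
        batch_norm_decay=0.997,
        batch_norm_epsilon=1e-5,
        data_format='channels_last')  # channels_last also runs on CPU
    images = tf.placeholder(tf.float32, [None, 32, 32, 3])
    logits = model.forward_pass(images, input_data_format='channels_last')

    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      out = sess.run(logits, {images: np.zeros((4, 32, 32, 3), np.float32)})
      print(out.shape)  # (4, 11): 10 classes plus the +1 label offset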

http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_utils.py
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_utils.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_utils.py
new file mode 100644
index 0000000..7ecb50a
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_utils.py
@@ -0,0 +1,154 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import collections
+import six
+
+import tensorflow as tf
+
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.core.framework import node_def_pb2
+from tensorflow.python.framework import device as pydev
+from tensorflow.python.training import basic_session_run_hooks
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.training import training_util
+from tensorflow.python.training import device_setter
+from tensorflow.contrib.learn.python.learn import run_config
+
+
+# TODO(b/64848083) Remove once uid bug is fixed
+class RunConfig(tf.contrib.learn.RunConfig):
+  def uid(self, whitelist=None):
+    """Generates a 'Unique Identifier' based on all internal fields.
+    Caller should use the uid string to check `RunConfig` instance integrity
+    within one session, but should not rely on the implementation details,
+    which are subject to change.
+    Args:
+      whitelist: A list of the string names of the properties uid should not
+        include. If `None`, defaults to `_DEFAULT_UID_WHITE_LIST`, which
+        includes most properties users are allowed to change.
+    Returns:
+      A uid string.
+    """
+    if whitelist is None:
+      whitelist = run_config._DEFAULT_UID_WHITE_LIST
+
+    state = {k: v for k, v in self.__dict__.items() if not k.startswith('__')}
+    # Pop out the keys in whitelist.
+    for k in whitelist:
+      state.pop('_' + k, None)
+
+    ordered_state = collections.OrderedDict(
+        sorted(state.items(), key=lambda t: t[0]))
+    # For class instances without __repr__, some special care is required.
+    # Otherwise, the object address will be used.
+    if '_cluster_spec' in ordered_state:
+      ordered_state['_cluster_spec'] = collections.OrderedDict(
+         sorted(ordered_state['_cluster_spec'].as_dict().items(),
+                key=lambda t: t[0])
+      )
+    return ', '.join(
+        '%s=%r' % (k, v) for (k, v) in six.iteritems(ordered_state))
+
+
+class ExamplesPerSecondHook(session_run_hook.SessionRunHook):
+  """Hook to print out examples per second.
+
+    Total time is tracked and then divided by the total number of steps
+    to get the average step time; batch_size is then used to determine
+    the running average of examples per second. The examples per second for
+    the most recent interval is also logged.
+  """
+
+  def __init__(
+      self,
+      batch_size,
+      every_n_steps=100,
+      every_n_secs=None,):
+    """Initializer for ExamplesPerSecondHook.
+
+    Args:
+      batch_size: Total batch size used to calculate examples/second from
+        global time.
+      every_n_steps: Log stats every n steps.
+      every_n_secs: Log stats every n seconds.
+    """
+    if (every_n_steps is None) == (every_n_secs is None):
+      raise ValueError('Exactly one of every_n_steps'
+                       ' and every_n_secs should be provided.')
+    self._timer = basic_session_run_hooks.SecondOrStepTimer(
+        every_steps=every_n_steps, every_secs=every_n_secs)
+
+    self._step_train_time = 0
+    self._total_steps = 0
+    self._batch_size = batch_size
+
+  def begin(self):
+    self._global_step_tensor = training_util.get_global_step()
+    if self._global_step_tensor is None:
+      raise RuntimeError(
+          'Global step should be created to use StepCounterHook.')
+
+  def before_run(self, run_context):  # pylint: disable=unused-argument
+    return basic_session_run_hooks.SessionRunArgs(self._global_step_tensor)
+
+  def after_run(self, run_context, run_values):
+    _ = run_context
+
+    global_step = run_values.results
+    if self._timer.should_trigger_for_step(global_step):
+      elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(
+          global_step)
+      if elapsed_time is not None:
+        steps_per_sec = elapsed_steps / elapsed_time
+        self._step_train_time += elapsed_time
+        self._total_steps += elapsed_steps
+
+        average_examples_per_sec = self._batch_size * (
+            self._total_steps / self._step_train_time)
+        current_examples_per_sec = steps_per_sec * self._batch_size
+        # Average examples/sec followed by current examples/sec
+        logging.info('%s: %g (%g), step = %g', 'Average examples/sec',
+                     average_examples_per_sec, current_examples_per_sec,
+                     self._total_steps)
+
+def local_device_setter(num_devices=1,
+                        ps_device_type='cpu',
+                        worker_device='/cpu:0',
+                        ps_ops=None,
+                        ps_strategy=None):
+  if ps_ops is None:
+    ps_ops = ['Variable', 'VariableV2', 'VarHandleOp']
+
+  if ps_strategy is None:
+    ps_strategy = device_setter._RoundRobinStrategy(num_devices)
+  if not six.callable(ps_strategy):
+    raise TypeError("ps_strategy must be callable")
+
+  def _local_device_chooser(op):
+    current_device = pydev.DeviceSpec.from_string(op.device or "")
+
+    node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def
+    if node_def.op in ps_ops:
+      ps_device_spec = pydev.DeviceSpec.from_string(
+          '/{}:{}'.format(ps_device_type, ps_strategy(op)))
+
+      ps_device_spec.merge_from(current_device)
+      return ps_device_spec.to_string()
+    else:
+      worker_device_spec = pydev.DeviceSpec.from_string(worker_device or "")
+      worker_device_spec.merge_from(current_device)
+      return worker_device_spec.to_string()
+  return _local_device_chooser
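
To see what local_device_setter above does in isolation, here is a small sketch (not from the patch; assumes TF 1.8 and cifar10_utils.py on the PYTHONPATH). With the defaults, ops whose type is in ps_ops are pinned to /cpu:0 while everything else lands on the worker device, which is how cifar10_main.py realizes --variable-strategy=CPU:

    import tensorflow as tf

    import cifar10_utils

    setter = cifar10_utils.local_device_setter(worker_device='/gpu:0')
    with tf.device(setter):
      w = tf.get_variable('w', shape=[10])  # 'VariableV2' op -> ps device /cpu:0
      y = w * 2.0                           # compute op -> worker device /gpu:0
    print(w.device, y.device)  # expect CPU for the variable, GPU for the multiply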

http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/generate_cifar10_tfrecords.py
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/generate_cifar10_tfrecords.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/generate_cifar10_tfrecords.py
new file mode 100644
index 0000000..409cee4
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/generate_cifar10_tfrecords.py
@@ -0,0 +1,114 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Read CIFAR-10 data from pickled numpy arrays and writes TFRecords.
+
+Generates tf.train.Example protos and writes them to TFRecord files from the
+python version of the CIFAR-10 dataset downloaded from
+https://www.cs.toronto.edu/~kriz/cifar.html.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import os
+
+import tarfile
+from six.moves import cPickle as pickle
+from six.moves import xrange  # pylint: disable=redefined-builtin
+import tensorflow as tf
+
+CIFAR_FILENAME = 'cifar-10-python.tar.gz'
+CIFAR_DOWNLOAD_URL = 'https://www.cs.toronto.edu/~kriz/' + CIFAR_FILENAME
+CIFAR_LOCAL_FOLDER = 'cifar-10-batches-py'
+
+
+def download_and_extract(data_dir):
+  # Download CIFAR-10 if not already downloaded.
+  tf.contrib.learn.datasets.base.maybe_download(CIFAR_FILENAME, data_dir,
+                                                CIFAR_DOWNLOAD_URL)
+  tarfile.open(os.path.join(data_dir, CIFAR_FILENAME),
+               'r:gz').extractall(data_dir)
+
+
+def _int64_feature(value):
+  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
+
+
+def _bytes_feature(value):
+  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
+
+
+def _get_file_names():
+  """Returns the file names expected to exist in the input_dir."""
+  file_names = {}
+  file_names['train'] = ['data_batch_%d' % i for i in xrange(1, 5)]
+  file_names['validation'] = ['data_batch_5']
+  file_names['eval'] = ['test_batch']
+  return file_names
+
+
+def read_pickle_from_file(filename):
+  with tf.gfile.Open(filename, 'rb') as f:
+    data_dict = pickle.load(f)
+  return data_dict
+
+
+def convert_to_tfrecord(input_files, output_file):
+  """Converts a file to TFRecords."""
+  print('Generating %s' % output_file)
+  with tf.python_io.TFRecordWriter(output_file) as record_writer:
+    for input_file in input_files:
+      data_dict = read_pickle_from_file(input_file)
+      data = data_dict['data']
+      labels = data_dict['labels']
+      num_entries_in_batch = len(labels)
+      for i in range(num_entries_in_batch):
+        example = tf.train.Example(features=tf.train.Features(
+            feature={
+                'image': _bytes_feature(data[i].tobytes()),
+                'label': _int64_feature(labels[i])
+            }))
+        record_writer.write(example.SerializeToString())
+
+
+def main(data_dir):
+  print('Download from {} and extract.'.format(CIFAR_DOWNLOAD_URL))
+  download_and_extract(data_dir)
+  file_names = _get_file_names()
+  input_dir = os.path.join(data_dir, CIFAR_LOCAL_FOLDER)
+  for mode, files in file_names.items():
+    input_files = [os.path.join(input_dir, f) for f in files]
+    output_file = os.path.join(data_dir, mode + '.tfrecords')
+    try:
+      os.remove(output_file)
+    except OSError:
+      pass
+    # Convert to tf.train.Example and write them to TFRecords.
+    convert_to_tfrecord(input_files, output_file)
+  print('Done!')
+
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      '--data-dir',
+      type=str,
+      default='',
+      help='Directory to download and extract CIFAR-10 to.')
+
+  args = parser.parse_args()
+  main(args.data_dir)
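
For completeness, generating the records locally is a one-liner (the path below is illustrative). Per the code above, it downloads and extracts the CIFAR-10 archive, then writes one TFRecord file per subset:

    python generate_cifar10_tfrecords.py --data-dir=cifar-10-data
    # -> cifar-10-data/train.tfrecords      (data_batch_1..4)
    #    cifar-10-data/validation.tfrecords (data_batch_5)
    #    cifar-10-data/eval.tfrecords       (test_batch)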

