You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by su...@apache.org on 2018/09/28 18:31:37 UTC
[2/2] hadoop git commit: YARN-8800. Updated documentation of
Submarine with latest examples. Contributed by Wangda Tan.
YARN-8800. Updated documentation of Submarine with latest examples. Contributed by Wangda Tan.
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/19ad5be6
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/19ad5be6
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/19ad5be6
Branch: refs/heads/trunk
Commit: 19ad5be6517765b31b4afaccbaadc8268627c568
Parents: 72891fc
Author: Sunil G <su...@apache.org>
Authored: Sat Sep 29 00:01:04 2018 +0530
Committer: Sunil G <su...@apache.org>
Committed: Sat Sep 29 00:01:04 2018 +0530
----------------------------------------------------------------------
hadoop-project/src/site/site.xml | 6 +
.../hadoop-yarn-submarine/README.md | 6 +-
.../base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 | 69 +++
.../base/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0 | 67 +++
.../src/main/docker/build-all.sh | 32 ++
.../ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 | 22 +
.../ubuntu-16.04/Dockerfile.gpu.tf_1.8.0 | 22 +
.../cifar10_estimator_tf_1.8.0/README.md | 542 ++++++++++++++++++
.../cifar10_estimator_tf_1.8.0/cifar10.py | 113 ++++
.../cifar10_estimator_tf_1.8.0/cifar10_main.py | 521 +++++++++++++++++
.../cifar10_estimator_tf_1.8.0/cifar10_model.py | 80 +++
.../cifar10_estimator_tf_1.8.0/cifar10_utils.py | 154 +++++
.../generate_cifar10_tfrecords.py | 114 ++++
.../cifar10_estimator_tf_1.8.0/model_base.py | 219 +++++++
.../zeppelin-notebook-example/Dockerfile.gpu | 75 +++
.../zeppelin-notebook-example/run_container.sh | 22 +
.../docker/zeppelin-notebook-example/shiro.ini | 120 ++++
.../zeppelin-notebook-example/zeppelin-site.xml | 569 +++++++++++++++++++
.../src/site/DeveloperGuide.md | 26 -
.../src/site/QuickStart.md | 134 -----
.../src/site/markdown/DeveloperGuide.md | 24 +
.../src/site/markdown/Examples.md | 21 +
.../src/site/markdown/Index.md | 42 ++
.../src/site/markdown/QuickStart.md | 174 ++++++
.../markdown/RunningDistributedCifar10TFJobs.md | 162 ++++++
.../src/site/markdown/RunningZeppelinOnYARN.md | 37 ++
.../src/site/markdown/WriteDockerfile.md | 117 ++++
.../src/site/resources/css/site.css | 29 +
.../src/site/resources/images/job-logs-ui.png | Bin 0 -> 229944 bytes
.../images/multiple-tensorboard-jobs.png | Bin 0 -> 184717 bytes
.../resources/images/tensorboard-service.png | Bin 0 -> 107567 bytes
.../hadoop-yarn-submarine/src/site/site.xml | 28 +
32 files changed, 3385 insertions(+), 162 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-project/src/site/site.xml
----------------------------------------------------------------------
diff --git a/hadoop-project/src/site/site.xml b/hadoop-project/src/site/site.xml
index b40dbfc..2b6058e 100644
--- a/hadoop-project/src/site/site.xml
+++ b/hadoop-project/src/site/site.xml
@@ -180,6 +180,12 @@
<item name="System Services" href="hadoop-yarn/hadoop-yarn-site/yarn-service/SystemServices.html"/>
</menu>
+ <menu name="Submarine" inherit="top">
+ <item name="Index" href="hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/Index.html"/>
+ <item name="QuickStart" href="hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/QuickStart.html"/>
+ <item name="Examples" href="hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/Examples.html"/>
+ </menu>
+
<menu name="Hadoop Compatible File Systems" inherit="top">
<item name="Aliyun OSS" href="hadoop-aliyun/tools/hadoop-aliyun/index.html"/>
<item name="Amazon S3" href="hadoop-aws/tools/hadoop-aws/index.html"/>
http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/README.md
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/README.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/README.md
index 3e04730..cb2e2da 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/README.md
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/README.md
@@ -48,6 +48,8 @@ Goals of Submarine:
- Support launch tensorboard for training jobs if user specified.
- Support customized DNS name for roles (like tensorboard.$user.$domain:6006)
-Please jump to [QuickStart](src/site/QuickStart.md) guide to quickly understand how to use this framework.
+Please jump to [QuickStart](src/site/markdown/QuickStart.md) guide to quickly understand how to use this framework.
-If you're a developer, please find [Developer](src/site/DeveloperGuide.md) guide for more details.
+Please jump to [Examples](src/site/markdown/Examples.md) to try other examples like running Distributed Tensorflow Training for CIFAR 10.
+
+If you're a developer, please find [Developer](src/site/markdown/DeveloperGuide.md) guide for more details.
http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0
new file mode 100644
index 0000000..f2446a7
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/base/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0
@@ -0,0 +1,69 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM ubuntu:16.04
+
+LABEL maintainer="Craig Citro <cr...@google.com>"
+
+# Pick up some TF dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ build-essential \
+ curl \
+ libfreetype6-dev \
+ libpng12-dev \
+ libzmq3-dev \
+ pkg-config \
+ python \
+ python-dev \
+ rsync \
+ software-properties-common \
+ unzip \
+ && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/*
+
+RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
+ python get-pip.py && \
+ rm get-pip.py
+
+RUN pip --no-cache-dir install \
+ Pillow \
+ h5py \
+ ipykernel \
+ jupyter \
+ matplotlib \
+ numpy \
+ pandas \
+ scipy \
+ sklearn \
+ && \
+ python -m ipykernel.kernelspec
+
+# --- DO NOT EDIT OR DELETE BETWEEN THE LINES --- #
+# These lines will be edited automatically by parameterized_docker_build.sh. #
+# COPY _PIP_FILE_ /
+# RUN pip --no-cache-dir install /_PIP_FILE_
+# RUN rm -f /_PIP_FILE_
+
+# Install TensorFlow CPU version from central repo
+RUN pip --no-cache-dir install \
+ http://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl
+
+RUN apt-get update && apt-get install git -y
+
+RUN apt-get update && apt-get install -y openjdk-8-jdk wget
+RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz
+RUN tar zxf hadoop-3.1.1.tar.gz
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/base/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/base/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/base/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0
new file mode 100644
index 0000000..dee6e19
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/base/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0
@@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04
+
+# Pick up some TF dependencies
+RUN apt-get update && apt-get install -y --allow-downgrades --no-install-recommends \
+ build-essential \
+ cuda-command-line-tools-9-0 \
+ cuda-cublas-9-0 \
+ cuda-cufft-9-0 \
+ cuda-curand-9-0 \
+ cuda-cusolver-9-0 \
+ cuda-cusparse-9-0 \
+ curl \
+ libcudnn7=7.0.5.15-1+cuda9.0 \
+ libfreetype6-dev \
+ libpng12-dev \
+ libzmq3-dev \
+ pkg-config \
+ python \
+ python-dev \
+ rsync \
+ software-properties-common \
+ unzip \
+ && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/*
+
+RUN curl -O https://bootstrap.pypa.io/get-pip.py && \
+ python get-pip.py && \
+ rm get-pip.py
+
+RUN pip --no-cache-dir install \
+ Pillow \
+ h5py \
+ ipykernel \
+ jupyter \
+ matplotlib \
+ numpy \
+ pandas \
+ scipy \
+ sklearn \
+ && \
+ python -m ipykernel.kernelspec
+
+# Install TensorFlow GPU version.
+RUN pip --no-cache-dir install \
+ http://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl
+RUN apt-get update && apt-get install git -y
+
+RUN apt-get update && apt-get install -y openjdk-8-jdk wget
+RUN wget http://apache.cs.utah.edu/hadoop/common/hadoop-3.1.0/hadoop-3.1.0.tar.gz
+RUN tar zxf hadoop-3.1.0.tar.gz
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/build-all.sh
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/build-all.sh b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/build-all.sh
new file mode 100755
index 0000000..ad3a935
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/build-all.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+echo "Building base images"
+
+set -e
+
+cd base/ubuntu-16.04
+
+docker build . -f Dockerfile.cpu.tf_1.8.0 -t tf-1.8.0-cpu-base:0.0.1
+docker build . -f Dockerfile.gpu.tf_1.8.0 -t tf-1.8.0-gpu-base:0.0.1
+
+echo "Finished building base images"
+
+cd ../../with-cifar10-models/ubuntu-16.04
+
+docker build . -f Dockerfile.cpu.tf_1.8.0 -t tf-1.8.0-cpu:0.0.1
+docker build . -f Dockerfile.gpu.tf_1.8.0 -t tf-1.8.0-gpu:0.0.1
http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0
new file mode 100644
index 0000000..1087d61
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.cpu.tf_1.8.0
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM tf-1.8.0-cpu-base:0.0.1
+
+# Include models
+RUN mkdir /test
+ADD cifar10_estimator_tf_1.8.0 /test/cifar10_estimator
+RUN chown -R nobody /test
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0 b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0
new file mode 100644
index 0000000..d1f829f
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/Dockerfile.gpu.tf_1.8.0
@@ -0,0 +1,22 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM tf-1.8.0-gpu-base:0.0.1
+
+# Include models
+RUN mkdir /test
+ADD cifar10_estimator_tf_1.8.0 /test/cifar10_estimator
+RUN chown -R nobody /test
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/README.md
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/README.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/README.md
new file mode 100644
index 0000000..5b4ae34
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/README.md
@@ -0,0 +1,542 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+(Copied from https://github.com/tensorflow/models/tree/master/tutorials/image/cifar10_estimator)
+
+CIFAR-10 is a common benchmark in machine learning for image recognition.
+
+http://www.cs.toronto.edu/~kriz/cifar.html
+
+Code in this directory focuses on how to use TensorFlow Estimators to train and
+evaluate a CIFAR-10 ResNet model on:
+
+* A single host with one CPU;
+* A single host with multiple GPUs;
+* Multiple hosts with CPU or multiple GPUs;
+
+Before trying to run the model we highly encourage you to read this entire README.
+
+## Prerequisite
+
+1. [Install](https://www.tensorflow.org/install/) TensorFlow version 1.2.1 or
+later.
+
+2. Download the CIFAR-10 dataset and generate TFRecord files using the provided
+script. The script and associated command below will download the CIFAR-10
+dataset and then generate a TFRecord for the training, validation, and
+evaluation datasets.
+
+```shell
+python generate_cifar10_tfrecords.py --data-dir=${PWD}/cifar-10-data
+```
+
+After running the command above, you should see the following files in the
+--data-dir (```ls -R cifar-10-data```):
+
+* train.tfrecords
+* validation.tfrecords
+* eval.tfrecords
+
+
+## Training on a single machine with GPUs or CPU
+
+Run the training on CPU only. After training, it runs the evaluation.
+
+```
+python cifar10_main.py --data-dir=${PWD}/cifar-10-data \
+ --job-dir=/tmp/cifar10 \
+ --num-gpus=0 \
+ --train-steps=1000
+```
+
+Run the model on 2 GPUs using CPU as parameter server. After training, it runs
+the evaluation.
+```
+python cifar10_main.py --data-dir=${PWD}/cifar-10-data \
+ --job-dir=/tmp/cifar10 \
+ --num-gpus=2 \
+ --train-steps=1000
+```
+
+Run the model on 2 GPUs using GPU as parameter server.
+It will run an experiment, which for a local setting basically means it will
+stop training
+a couple of times to perform evaluation.
+
+```
+python cifar10_main.py --data-dir=${PWD}/cifar-10-data \
+ --job-dir=/tmp/cifar10 \
+ --variable-strategy GPU \
+ --num-gpus=2 \
+```
+
+There are more command line flags to play with; run
+`python cifar10_main.py --help` for details.
+
+## Run distributed training
+
+### (Optional) Running on Google Cloud Machine Learning Engine
+
+This example can be run on Google Cloud Machine Learning Engine (ML Engine),
+which will configure the environment and take care of running workers,
+parameters servers, and masters in a fault tolerant way.
+
+To install the command line tool, and set up a project and billing, see the
+quickstart [here](https://cloud.google.com/ml-engine/docs/quickstarts/command-line).
+
+You'll also need a Google Cloud Storage bucket for the data. If you followed the
+instructions above, you can just run:
+
+```
+MY_BUCKET=gs://<my-bucket-name>
+gsutil cp -r ${PWD}/cifar-10-data $MY_BUCKET/
+```
+
+Then run the following command from the `tutorials/image` directory of this
+repository (the parent directory of this README):
+
+```
+gcloud ml-engine jobs submit training cifarmultigpu \
+ --runtime-version 1.2 \
+ --job-dir=$MY_BUCKET/model_dirs/cifarmultigpu \
+ --config cifar10_estimator/cmle_config.yaml \
+ --package-path cifar10_estimator/ \
+ --module-name cifar10_estimator.cifar10_main \
+ -- \
+ --data-dir=$MY_BUCKET/cifar-10-data \
+ --num-gpus=4 \
+ --train-steps=1000
+```
+
+
+### Set TF_CONFIG
+
+Considering that you already have multiple hosts configured, all you need is a
+`TF_CONFIG` environment variable on each host. You can set up the hosts manually
+or check [tensorflow/ecosystem](https://github.com/tensorflow/ecosystem) for
+instructions about how to set up a Cluster.
+
+The `TF_CONFIG` will be used by the `RunConfig` to know the existing hosts and
+their task: `master`, `ps` or `worker`.
+
+Here's an example of `TF_CONFIG`.
+
+```python
+cluster = {'master': ['master-ip:8000'],
+ 'ps': ['ps-ip:8000'],
+ 'worker': ['worker-ip:8000']}
+
+TF_CONFIG = json.dumps(
+ {'cluster': cluster,
+ 'task': {'type': master, 'index': 0},
+ 'model_dir': 'gs://<bucket_path>/<dir_path>',
+ 'environment': 'cloud'
+ })
+```
+
+*Cluster*
+
+A cluster spec, which is basically a dictionary that describes all of the tasks
+in the cluster. More about it [here](https://www.tensorflow.org/deploy/distributed).
+
+In this cluster spec we are defining a cluster with 1 master, 1 ps and 1 worker.
+
+* `ps`: saves the parameters among all workers. All workers can
+ read/write/update the parameters for model via ps. As some models are
+ extremely large the parameters are shared among the ps (each ps stores a
+ subset).
+
+* `worker`: does the training.
+
+* `master`: basically a special worker, it does training, but also restores and
+  saves checkpoints and does evaluation.
+
+*Task*
+
+The Task defines what is the role of the current node, for this example the node
+is the master on index 0 on the cluster spec, the task will be different for
+each node. An example of the `TF_CONFIG` for a worker would be:
+
+```python
+cluster = {'master': ['master-ip:8000'],
+ 'ps': ['ps-ip:8000'],
+ 'worker': ['worker-ip:8000']}
+
+TF_CONFIG = json.dumps(
+ {'cluster': cluster,
+ 'task': {'type': worker, 'index': 0},
+ 'model_dir': 'gs://<bucket_path>/<dir_path>',
+ 'environment': 'cloud'
+ })
+```
+
+*Model_dir*
+
+This is the path where the master will save the checkpoints, graph and
+TensorBoard files. For a multi host environment you may want to use a
+Distributed File System, Google Storage and DFS are supported.
+
+*Environment*
+
+By default the environment is *local*; for a distributed setting we need to
+change it to *cloud*.
+
+### Running script
+
+Once you have a `TF_CONFIG` configured properly on each host you're ready to run
+on distributed settings.
+
+#### Master
+Run this on master:
+Runs an Experiment in sync mode on 4 GPUs using CPU as parameter server for
+40000 steps. It will run evaluation a couple of times during training. The
+num_workers argument is used only to update the learning rate correctly. Make
+sure the model_dir is the same as defined on the TF_CONFIG.
+
+```shell
+python cifar10_main.py --data-dir=gs://path/cifar-10-data \
+ --job-dir=gs://path/model_dir/ \
+ --num-gpus=4 \
+ --train-steps=40000 \
+ --sync \
+ --num-workers=2
+```
+
+*Output:*
+
+```shell
+INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/
+INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'master', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fd16fb2be10>, '_model_dir': 'gs://path/model_dir/', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': intra_op_parallelism_threads: 1
+gpu_options {
+}
+allow_soft_placement: true
+, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1, '_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options {
+ per_process_gpu_memory_fraction: 1.0
+}
+, '_evaluation_master': '', '_master': u'grpc://master-ip:8000'}
+...
+2017-08-01 19:59:26.496208: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 0 with properties:
+name: Tesla K80
+major: 3 minor: 7 memoryClockRate (GHz) 0.8235
+pciBusID 0000:00:04.0
+Total memory: 11.17GiB
+Free memory: 11.09GiB
+2017-08-01 19:59:26.775660: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 1 with properties:
+name: Tesla K80
+major: 3 minor: 7 memoryClockRate (GHz) 0.8235
+pciBusID 0000:00:05.0
+Total memory: 11.17GiB
+Free memory: 11.10GiB
+...
+2017-08-01 19:59:29.675171: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64)
+INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11)
+INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=1; total_num_replicas=1
+INFO:tensorflow:Create CheckpointSaverHook.
+INFO:tensorflow:Restoring parameters from gs://path/model_dir/model.ckpt-0
+2017-08-01 19:59:37.560775: I tensorflow/core/distributed_runtime/master_session.cc:999] Start master session 156fcb55fe6648d6 with config:
+intra_op_parallelism_threads: 1
+gpu_options {
+ per_process_gpu_memory_fraction: 1
+}
+allow_soft_placement: true
+
+INFO:tensorflow:Saving checkpoints for 1 into gs://path/model_dir/model.ckpt.
+INFO:tensorflow:loss = 1.20682, step = 1
+INFO:tensorflow:loss = 1.20682, learning_rate = 0.1
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64)
+INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11)
+INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=2; total_num_replicas=2
+INFO:tensorflow:Starting evaluation at 2017-08-01-20:00:14
+2017-08-01 20:00:15.745881: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:0) -> (device: 0, name: Tesla K80, pci bus id: 0000:00:04.0)
+2017-08-01 20:00:15.745949: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:1) -> (device: 1, name: Tesla K80, pci bus id: 0000:00:05.0)
+2017-08-01 20:00:15.745958: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:2) -> (device: 2, name: Tesla K80, pci bus id: 0000:00:06.0)
+2017-08-01 20:00:15.745964: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:3) -> (device: 3, name: Tesla K80, pci bus id: 0000:00:07.0)
+2017-08-01 20:00:15.745969: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:4) -> (device: 4, name: Tesla K80, pci bus id: 0000:00:08.0)
+2017-08-01 20:00:15.745975: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:5) -> (device: 5, name: Tesla K80, pci bus id: 0000:00:09.0)
+2017-08-01 20:00:15.745987: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:6) -> (device: 6, name: Tesla K80, pci bus id: 0000:00:0a.0)
+2017-08-01 20:00:15.745997: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Creating TensorFlow device (/gpu:7) -> (device: 7, name: Tesla K80, pci bus id: 0000:00:0b.0)
+INFO:tensorflow:Restoring parameters from gs://path/model_dir/model.ckpt-10023
+INFO:tensorflow:Evaluation [1/100]
+INFO:tensorflow:Evaluation [2/100]
+INFO:tensorflow:Evaluation [3/100]
+INFO:tensorflow:Evaluation [4/100]
+INFO:tensorflow:Evaluation [5/100]
+INFO:tensorflow:Evaluation [6/100]
+INFO:tensorflow:Evaluation [7/100]
+INFO:tensorflow:Evaluation [8/100]
+INFO:tensorflow:Evaluation [9/100]
+INFO:tensorflow:Evaluation [10/100]
+INFO:tensorflow:Evaluation [11/100]
+INFO:tensorflow:Evaluation [12/100]
+INFO:tensorflow:Evaluation [13/100]
+...
+INFO:tensorflow:Evaluation [100/100]
+INFO:tensorflow:Finished evaluation at 2017-08-01-20:00:31
+INFO:tensorflow:Saving dict for global step 1: accuracy = 0.0994, global_step = 1, loss = 630.425
+```
+
+#### Worker
+
+Run this on the worker:
+it runs an Experiment in sync mode on 4 GPUs, using the CPU as parameter
+server, for 40000 steps. It will run evaluation a couple of times during
+training. Make sure the model_dir is the same as the one defined in TF_CONFIG.
+
+```shell
+python cifar10_main.py --data-dir=gs://path/cifar-10-data \
+ --job-dir=gs://path/model_dir/ \
+ --num-gpus=4 \
+ --train-steps=40000 \
+ --sync
+```
+
+*Output:*
+
+```shell
+INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/
+INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600,
+'_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'worker',
+'_is_chief': False, '_cluster_spec':
+<tensorflow.python.training.server_lib.ClusterSpec object at 0x7f6918438e10>,
+'_model_dir': 'gs://<path>/model_dir/',
+'_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000,
+'_session_config': intra_op_parallelism_threads: 1
+gpu_options {
+}
+allow_soft_placement: true
+, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1,
+'_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options {
+ per_process_gpu_memory_fraction: 1.0
+ }
+...
+2017-08-01 19:59:26.496208: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 0 with properties:
+name: Tesla K80
+major: 3 minor: 7 memoryClockRate (GHz) 0.8235
+pciBusID 0000:00:04.0
+Total memory: 11.17GiB
+Free memory: 11.09GiB
+2017-08-01 19:59:26.775660: I tensorflow/core/common_runtime/gpu/gpu_device.cc:940] Found device 1 with properties:
+name: Tesla K80
+major: 3 minor: 7 memoryClockRate (GHz) 0.8235
+pciBusID 0000:00:05.0
+Total memory: 11.17GiB
+Free memory: 11.10GiB
+...
+2017-08-01 19:59:29.675171: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_1/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_2/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_3/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_4/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_5/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage/residual_v1_6/: (?, 16, 32, 32)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/avg_pool/: (?, 16, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_1/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_2/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_3/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_4/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_5/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_1/residual_v1_6/: (?, 32, 16, 16)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/avg_pool/: (?, 32, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_1/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_2/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_3/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_4/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_5/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/stage_2/residual_v1_6/: (?, 64, 8, 8)
+INFO:tensorflow:image after unit resnet/tower_0/global_avg_pool/: (?, 64)
+INFO:tensorflow:image after unit resnet/tower_0/fully_connected/: (?, 11)
+INFO:tensorflow:SyncReplicasV2: replicas_to_aggregate=2; total_num_replicas=2
+INFO:tensorflow:Create CheckpointSaverHook.
+2017-07-31 22:38:04.629150: I
+tensorflow/core/distributed_runtime/master.cc:209] CreateSession still waiting
+for response from worker: /job:master/replica:0/task:0
+2017-07-31 22:38:09.263492: I
+tensorflow/core/distributed_runtime/master_session.cc:999] Start master
+session cc58f93b1e259b0c with config:
+intra_op_parallelism_threads: 1
+gpu_options {
+per_process_gpu_memory_fraction: 1
+}
+allow_soft_placement: true
+INFO:tensorflow:loss = 5.82382, step = 0
+INFO:tensorflow:loss = 5.82382, learning_rate = 0.8
+INFO:tensorflow:Average examples/sec: 1116.92 (1116.92), step = 10
+INFO:tensorflow:Average examples/sec: 1233.73 (1377.83), step = 20
+INFO:tensorflow:Average examples/sec: 1485.43 (2509.3), step = 30
+INFO:tensorflow:Average examples/sec: 1680.27 (2770.39), step = 40
+INFO:tensorflow:Average examples/sec: 1825.38 (2788.78), step = 50
+INFO:tensorflow:Average examples/sec: 1929.32 (2697.27), step = 60
+INFO:tensorflow:Average examples/sec: 2015.17 (2749.05), step = 70
+INFO:tensorflow:loss = 37.6272, step = 79 (19.554 sec)
+INFO:tensorflow:loss = 37.6272, learning_rate = 0.8 (19.554 sec)
+INFO:tensorflow:Average examples/sec: 2074.92 (2618.36), step = 80
+INFO:tensorflow:Average examples/sec: 2132.71 (2744.13), step = 90
+INFO:tensorflow:Average examples/sec: 2183.38 (2777.21), step = 100
+INFO:tensorflow:Average examples/sec: 2224.4 (2739.03), step = 110
+INFO:tensorflow:Average examples/sec: 2240.28 (2431.26), step = 120
+INFO:tensorflow:Average examples/sec: 2272.12 (2739.32), step = 130
+INFO:tensorflow:Average examples/sec: 2300.68 (2750.03), step = 140
+INFO:tensorflow:Average examples/sec: 2325.81 (2745.63), step = 150
+INFO:tensorflow:Average examples/sec: 2347.14 (2721.53), step = 160
+INFO:tensorflow:Average examples/sec: 2367.74 (2754.54), step = 170
+INFO:tensorflow:loss = 27.8453, step = 179 (18.893 sec)
+...
+```
+
+#### PS
+
+Run this on the ps:
+the ps does not do any training, so most of the arguments won't affect the execution.
+
+```shell
+python cifar10_main.py --job-dir=gs://path/model_dir/
+```
+
+*Output:*
+
+```shell
+INFO:tensorflow:Using model_dir in TF_CONFIG: gs://path/model_dir/
+INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_num_ps_replicas': 1, '_keep_checkpoint_max': 5, '_task_type': u'ps', '_is_chief': False, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f48f1addf90>, '_model_dir': 'gs://path/model_dir/', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_session_config': intra_op_parallelism_threads: 1
+gpu_options {
+}
+allow_soft_placement: true
+, '_tf_random_seed': None, '_environment': u'cloud', '_num_worker_replicas': 1, '_task_id': 0, '_save_summary_steps': 100, '_tf_config': gpu_options {
+ per_process_gpu_memory_fraction: 1.0
+}
+, '_evaluation_master': '', '_master': u'grpc://master-ip:8000'}
+2017-07-31 22:54:58.928088: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job master -> {0 -> master-ip:8000}
+2017-07-31 22:54:58.928153: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job ps -> {0 -> localhost:8000}
+2017-07-31 22:54:58.928160: I tensorflow/core/distributed_runtime/rpc/grpc_channel.cc:215] Initialize GrpcChannelCache for job worker -> {0 -> worker-ip:8000}
+2017-07-31 22:54:58.929873: I tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc:316] Started server with target: grpc://localhost:8000
+```
+
+## Visualizing results with TensorBoard
+
+When using Estimators you can also visualize your data in TensorBoard, with no
+changes in your code. You can use TensorBoard to visualize your TensorFlow
+graph, plot quantitative metrics about the execution of your graph, and show
+additional data like images that pass through it.
+
+You'll see something similar to this if you "point" TensorBoard to the
+`job dir` parameter you used to train or evaluate your model.
+
+Check TensorBoard during training or after it. Just point TensorBoard to the
+model_dir you chose on the previous step.
+
+```shell
+tensorboard --logdir="<job dir>"
+```
+
+## Warnings
+
+When running `cifar10_main.py` with `--sync` argument you may see an error
+similar to:
+
+```python
+File "cifar10_main.py", line 538, in <module>
+ tf.app.run()
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/platform/app.py", line 48, in run
+ _sys.exit(main(_sys.argv[:1] + flags_passthrough))
+File "cifar10_main.py", line 518, in main
+ hooks), run_config=config)
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/learn_runner.py", line 210, in run
+ return _execute_schedule(experiment, schedule)
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/learn_runner.py", line 47, in _execute_schedule
+ return task()
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/experiment.py", line 501, in train_and_evaluate
+ hooks=self._eval_hooks)
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/contrib/learn/python/learn/experiment.py", line 681, in _call_evaluate
+ hooks=hooks)
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 292, in evaluate
+ name=name)
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 638, in _evaluate_model
+ features, labels, model_fn_lib.ModeKeys.EVAL)
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/estimator/estimator.py", line 545, in _call_model_fn
+ features=features, labels=labels, **kwargs)
+File "cifar10_main.py", line 331, in _resnet_model_fn
+ gradvars, global_step=tf.train.get_global_step())
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/training/sync_replicas_optimizer.py", line 252, in apply_gradients
+ variables.global_variables())
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 170, in wrapped
+ return _add_should_use_warning(fn(*args, **kwargs))
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 139, in _add_should_use_warning
+ wrapped = TFShouldUseWarningWrapper(x)
+File "/usr/local/lib/python2.7/dist-packages/tensorflow/python/util/tf_should_use.py", line 96, in __init__
+ stack = [s.strip() for s in traceback.format_stack()]
+```
+
+This should not affect your training, and should be fixed on the next releases.
http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10.py
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10.py
new file mode 100644
index 0000000..6903e8d
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10.py
@@ -0,0 +1,113 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""CIFAR-10 data set.
+
+See http://www.cs.toronto.edu/~kriz/cifar.html.
+"""
+import os
+
+import tensorflow as tf
+
+HEIGHT = 32
+WIDTH = 32
+DEPTH = 3
+
+
+class Cifar10DataSet(object):
+ """Cifar10 data set.
+
+ Described by http://www.cs.toronto.edu/~kriz/cifar.html.
+ """
+
+ def __init__(self, data_dir, subset='train', use_distortion=True):
+ self.data_dir = data_dir
+ self.subset = subset
+ self.use_distortion = use_distortion
+
+ def get_filenames(self):
+ if self.subset in ['train', 'validation', 'eval']:
+ return [os.path.join(self.data_dir, self.subset + '.tfrecords')]
+ else:
+ raise ValueError('Invalid data subset "%s"' % self.subset)
+
+ def parser(self, serialized_example):
+ """Parses a single tf.Example into image and label tensors."""
+ # Dimensions of the images in the CIFAR-10 dataset.
+ # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the
+ # input format.
+ features = tf.parse_single_example(
+ serialized_example,
+ features={
+ 'image': tf.FixedLenFeature([], tf.string),
+ 'label': tf.FixedLenFeature([], tf.int64),
+ })
+ image = tf.decode_raw(features['image'], tf.uint8)
+ image.set_shape([DEPTH * HEIGHT * WIDTH])
+
+ # Reshape from [depth * height * width] to [depth, height, width].
+ image = tf.cast(
+ tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]),
+ tf.float32)
+ label = tf.cast(features['label'], tf.int32)
+
+ # Custom preprocessing.
+ image = self.preprocess(image)
+
+ return image, label
+
+ def make_batch(self, batch_size):
+ """Read the images and labels from 'filenames'."""
+ filenames = self.get_filenames()
+ # Repeat infinitely.
+ dataset = tf.data.TFRecordDataset(filenames).repeat()
+
+ # Parse records.
+ dataset = dataset.map(
+ self.parser)
+
+ # Potentially shuffle records.
+ if self.subset == 'train':
+ min_queue_examples = int(
+ Cifar10DataSet.num_examples_per_epoch(self.subset) * 0.4)
+ # Ensure that the capacity is sufficiently large to provide good random
+ # shuffling.
+ dataset = dataset.shuffle(buffer_size=min_queue_examples + 3 * batch_size)
+
+ # Batch it up.
+ dataset = dataset.batch(batch_size)
+ iterator = dataset.make_one_shot_iterator()
+ image_batch, label_batch = iterator.get_next()
+
+ return image_batch, label_batch
+
+ def preprocess(self, image):
+ """Preprocess a single image in [height, width, depth] layout."""
+ if self.subset == 'train' and self.use_distortion:
+ # Pad 4 pixels on each dimension of feature map, done in mini-batch
+ image = tf.image.resize_image_with_crop_or_pad(image, 40, 40)
+ image = tf.random_crop(image, [HEIGHT, WIDTH, DEPTH])
+ image = tf.image.random_flip_left_right(image)
+ return image
+
+ @staticmethod
+ def num_examples_per_epoch(subset='train'):
+ if subset == 'train':
+ return 45000
+ elif subset == 'validation':
+ return 5000
+ elif subset == 'eval':
+ return 10000
+ else:
+ raise ValueError('Invalid data subset "%s"' % subset)
http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_main.py
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_main.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_main.py
new file mode 100644
index 0000000..086c95b
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_main.py
@@ -0,0 +1,521 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ResNet model for classifying images from CIFAR-10 dataset.
+
+Support single-host training with one or multiple devices.
+
+ResNet as proposed in:
+Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun
+Deep Residual Learning for Image Recognition. arXiv:1512.03385
+
+CIFAR-10 as in:
+http://www.cs.toronto.edu/~kriz/cifar.html
+
+
+"""
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import functools
+import itertools
+import os
+
+import cifar10
+import cifar10_model
+import cifar10_utils
+import numpy as np
+import six
+from six.moves import xrange # pylint: disable=redefined-builtin
+import tensorflow as tf
+
+tf.logging.set_verbosity(tf.logging.INFO)
+
+
def get_model_fn(num_gpus, variable_strategy, num_workers):
  """Returns a function that will build the resnet model.

  Args:
    num_gpus: int, number of GPU towers to build; 0 builds one CPU tower.
    variable_strategy: 'CPU' or 'GPU' — where variables are placed.
    num_workers: int, scales the learning-rate boundaries and the number of
      replicas aggregated by SyncReplicasOptimizer when params.sync is set.
  Returns:
    A model_fn suitable for tf.estimator.Estimator.
  """

  def _resnet_model_fn(features, labels, mode, params):
    """Resnet model body.

    Support single host, one or more GPU training. Parameter distribution can
    be either one of the following scheme.
    1. CPU is the parameter server and manages gradient updates.
    2. Parameters are distributed evenly across all GPUs, and the first GPU
       manages gradient updates.

    Args:
      features: a list of tensors, one for each tower
      labels: a list of tensors, one for each tower
      mode: ModeKeys.TRAIN or EVAL
      params: Hyperparameters suitable for tuning
    Returns:
      A EstimatorSpec object.
    """
    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    weight_decay = params.weight_decay
    momentum = params.momentum

    tower_features = features
    tower_labels = labels
    tower_losses = []
    tower_gradvars = []
    tower_preds = []

    # channels first (NCHW) is normally optimal on GPU and channels last (NHWC)
    # on CPU. The exception is Intel MKL on CPU which is optimal with
    # channels_last.
    data_format = params.data_format
    if not data_format:
      if num_gpus == 0:
        data_format = 'channels_last'
      else:
        data_format = 'channels_first'

    if num_gpus == 0:
      num_devices = 1
      device_type = 'cpu'
    else:
      num_devices = num_gpus
      device_type = 'gpu'

    # Build one tower per device. All towers share variables through the
    # 'resnet' variable_scope (reuse=True for every tower after the first).
    for i in range(num_devices):
      worker_device = '/{}:{}'.format(device_type, i)
      if variable_strategy == 'CPU':
        device_setter = cifar10_utils.local_device_setter(
            worker_device=worker_device)
      elif variable_strategy == 'GPU':
        # Spread variables across GPUs, balancing by variable byte size.
        device_setter = cifar10_utils.local_device_setter(
            ps_device_type='gpu',
            worker_device=worker_device,
            ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
                num_gpus, tf.contrib.training.byte_size_load_fn))
      with tf.variable_scope('resnet', reuse=bool(i != 0)):
        with tf.name_scope('tower_%d' % i) as name_scope:
          with tf.device(device_setter):
            loss, gradvars, preds = _tower_fn(
                is_training, weight_decay, tower_features[i], tower_labels[i],
                data_format, params.num_layers, params.batch_norm_decay,
                params.batch_norm_epsilon)
            tower_losses.append(loss)
            tower_gradvars.append(gradvars)
            tower_preds.append(preds)
            if i == 0:
              # Only trigger batch_norm moving mean and variance update from
              # the 1st tower. Ideally, we should grab the updates from all
              # towers but these stats accumulate extremely fast so we can
              # ignore the other stats from the other towers without
              # significant detriment.
              update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                             name_scope)

    # Now compute global loss and gradients.
    gradvars = []
    with tf.name_scope('gradient_averaging'):
      # Group per-variable gradients collected from all towers.
      all_grads = {}
      for grad, var in itertools.chain(*tower_gradvars):
        if grad is not None:
          all_grads.setdefault(var, []).append(grad)
      for var, grads in six.iteritems(all_grads):
        # Average gradients on the same device as the variables
        # to which they apply.
        with tf.device(var.device):
          if len(grads) == 1:
            avg_grad = grads[0]
          else:
            avg_grad = tf.multiply(tf.add_n(grads), 1. / len(grads))
        gradvars.append((avg_grad, var))

    # Device that runs the ops to apply global gradient updates.
    consolidation_device = '/gpu:0' if variable_strategy == 'GPU' else '/cpu:0'
    with tf.device(consolidation_device):
      # Suggested learning rate scheduling from
      # https://github.com/ppwwyyxx/tensorpack/blob/master/examples/ResNet/cifar10-resnet.py#L155
      # Boundaries are in global steps, scaled down by the worker count.
      num_batches_per_epoch = cifar10.Cifar10DataSet.num_examples_per_epoch(
          'train') // (params.train_batch_size * num_workers)
      boundaries = [
          num_batches_per_epoch * x
          for x in np.array([82, 123, 300], dtype=np.int64)
      ]
      staged_lr = [params.learning_rate * x for x in [1, 0.1, 0.01, 0.002]]

      learning_rate = tf.train.piecewise_constant(tf.train.get_global_step(),
                                                  boundaries, staged_lr)

      loss = tf.reduce_mean(tower_losses, name='loss')

      examples_sec_hook = cifar10_utils.ExamplesPerSecondHook(
          params.train_batch_size, every_n_steps=10)

      tensors_to_log = {'learning_rate': learning_rate, 'loss': loss}

      logging_hook = tf.train.LoggingTensorHook(
          tensors=tensors_to_log, every_n_iter=100)

      train_hooks = [logging_hook, examples_sec_hook]

      optimizer = tf.train.MomentumOptimizer(
          learning_rate=learning_rate, momentum=momentum)

      if params.sync:
        # Distributed sync mode: aggregate gradients from num_workers replicas
        # before applying; the chief runs the init/bookkeeping hook.
        optimizer = tf.train.SyncReplicasOptimizer(
            optimizer, replicas_to_aggregate=num_workers)
        sync_replicas_hook = optimizer.make_session_run_hook(params.is_chief)
        train_hooks.append(sync_replicas_hook)

      # Create single grouped train op
      train_op = [
          optimizer.apply_gradients(
              gradvars, global_step=tf.train.get_global_step())
      ]
      train_op.extend(update_ops)
      train_op = tf.group(*train_op)

      # Concatenate per-tower predictions back into one batch-sized tensor.
      predictions = {
          'classes':
              tf.concat([p['classes'] for p in tower_preds], axis=0),
          'probabilities':
              tf.concat([p['probabilities'] for p in tower_preds], axis=0)
      }
      stacked_labels = tf.concat(labels, axis=0)
      metrics = {
          'accuracy':
              tf.metrics.accuracy(stacked_labels, predictions['classes'])
      }

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=loss,
        train_op=train_op,
        training_hooks=train_hooks,
        eval_metric_ops=metrics)

  return _resnet_model_fn
+
+
def _tower_fn(is_training, weight_decay, feature, label, data_format,
              num_layers, batch_norm_decay, batch_norm_epsilon):
  """Build computation tower (Resnet).

  Args:
    is_training: true if is training graph.
    weight_decay: weight regularization strength, a float.
    feature: a Tensor.
    label: a Tensor.
    data_format: channels_last (NHWC) or channels_first (NCHW).
    num_layers: number of layers, an int.
    batch_norm_decay: decay for batch normalization, a float.
    batch_norm_epsilon: epsilon for batch normalization, a float.

  Returns:
    A tuple with the loss for the tower, the gradients and parameters, and
    predictions.

  """
  model = cifar10_model.ResNetCifar10(
      num_layers,
      batch_norm_decay=batch_norm_decay,
      batch_norm_epsilon=batch_norm_epsilon,
      is_training=is_training,
      data_format=data_format)
  # The input pipeline produces channels_last tensors; the model transposes
  # internally if its compute data_format differs.
  logits = model.forward_pass(feature, input_data_format='channels_last')
  tower_pred = {
      'classes': tf.argmax(input=logits, axis=1),
      'probabilities': tf.nn.softmax(logits)
  }

  tower_loss = tf.losses.sparse_softmax_cross_entropy(
      logits=logits, labels=label)
  tower_loss = tf.reduce_mean(tower_loss)

  # L2 weight decay over *all* trainable variables of the shared model.
  model_params = tf.trainable_variables()
  tower_loss += weight_decay * tf.add_n(
      [tf.nn.l2_loss(v) for v in model_params])

  tower_grad = tf.gradients(tower_loss, model_params)

  # NOTE(review): under Python 3 this zip() is a one-shot iterator; it is
  # consumed exactly once (itertools.chain in _resnet_model_fn), so this is
  # safe, but callers must not iterate it twice.
  return tower_loss, zip(tower_grad, model_params), tower_pred
+
+
def input_fn(data_dir,
             subset,
             num_shards,
             batch_size,
             use_distortion_for_training=True):
  """Create input graph for model.

  Args:
    data_dir: Directory where TFRecords representing the dataset are located.
    subset: one of 'train', 'validate' and 'eval'.
    num_shards: num of towers participating in data-parallel training.
    batch_size: total batch size for training to be divided by the number of
      shards.
    use_distortion_for_training: True to use distortions.
  Returns:
    two lists of tensors for features and labels, each of num_shards length.
  """
  # Pin the whole input pipeline to the CPU so towers on GPUs only consume.
  with tf.device('/cpu:0'):
    use_distortion = subset == 'train' and use_distortion_for_training
    dataset = cifar10.Cifar10DataSet(data_dir, subset, use_distortion)
    image_batch, label_batch = dataset.make_batch(batch_size)
    if num_shards <= 1:
      # No GPU available or only 1 GPU.
      return [image_batch], [label_batch]

    # Note that passing num=batch_size is safe here, even though
    # dataset.batch(batch_size) can, in some cases, return fewer than batch_size
    # examples. This is because it does so only when repeating for a limited
    # number of epochs, but our dataset repeats forever.
    image_batch = tf.unstack(image_batch, num=batch_size, axis=0)
    label_batch = tf.unstack(label_batch, num=batch_size, axis=0)
    # Deal examples round-robin across the shards, then re-stack each shard.
    feature_shards = [[] for i in range(num_shards)]
    label_shards = [[] for i in range(num_shards)]
    for i in xrange(batch_size):
      idx = i % num_shards
      feature_shards[idx].append(image_batch[i])
      label_shards[idx].append(label_batch[i])
    feature_shards = [tf.parallel_stack(x) for x in feature_shards]
    label_shards = [tf.parallel_stack(x) for x in label_shards]
    return feature_shards, label_shards
+
+
def get_experiment_fn(data_dir,
                      num_gpus,
                      variable_strategy,
                      use_distortion_for_training=True):
  """Returns an Experiment function.

  Experiments perform training on several workers in parallel,
  in other words experiments know how to invoke train and eval in a sensible
  fashion for distributed training. Arguments passed directly to this
  function are not tunable, all other arguments should be passed within
  tf.HParams, passed to the enclosed function.

  Args:
    data_dir: str. Location of the data for input_fns.
    num_gpus: int. Number of GPUs on each worker.
    variable_strategy: String. CPU to use CPU as the parameter server
    and GPU to use the GPUs as the parameter server.
    use_distortion_for_training: bool. See cifar10.Cifar10DataSet.
  Returns:
    A function (tf.estimator.RunConfig, tf.contrib.training.HParams) ->
    tf.contrib.learn.Experiment.

    Suitable for use by tf.contrib.learn.learn_runner, which will run various
    methods on Experiment (train, evaluate) based on information
    about the current runner in `run_config`.
  """

  def _experiment_fn(run_config, hparams):
    """Returns an Experiment."""
    # Create estimator.
    # Bind the static arguments now; the Experiment calls these with no args.
    train_input_fn = functools.partial(
        input_fn,
        data_dir,
        subset='train',
        num_shards=num_gpus,
        batch_size=hparams.train_batch_size,
        use_distortion_for_training=use_distortion_for_training)

    eval_input_fn = functools.partial(
        input_fn,
        data_dir,
        subset='eval',
        batch_size=hparams.eval_batch_size,
        num_shards=num_gpus)

    # eval_steps below is an integer division, so the eval set size must be an
    # exact multiple of eval_batch_size or examples would be dropped.
    num_eval_examples = cifar10.Cifar10DataSet.num_examples_per_epoch('eval')
    if num_eval_examples % hparams.eval_batch_size != 0:
      raise ValueError(
          'validation set size must be multiple of eval_batch_size')

    train_steps = hparams.train_steps
    eval_steps = num_eval_examples // hparams.eval_batch_size

    classifier = tf.estimator.Estimator(
        model_fn=get_model_fn(num_gpus, variable_strategy,
                              run_config.num_worker_replicas or 1),
        config=run_config,
        params=hparams)

    # Create experiment.
    return tf.contrib.learn.Experiment(
        classifier,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        train_steps=train_steps,
        eval_steps=eval_steps)

  return _experiment_fn
+
+
def main(job_dir, data_dir, num_gpus, variable_strategy,
         use_distortion_for_training, log_device_placement, num_intra_threads,
         **hparams):
  """Configures the session and runs the training/eval Experiment.

  Any CLI argument not named in this signature (e.g. train_batch_size,
  learning_rate, sync, num_inter_threads) arrives via **hparams and is
  forwarded to the model through tf.contrib.training.HParams.

  NOTE(review): num_inter_threads therefore ends up in hparams only —
  inter_op_parallelism_threads is never set on the ConfigProto below.
  """
  # The env variable is on deprecation path, default is set to off.
  os.environ['TF_SYNC_ON_FINISH'] = '0'
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # Session configuration.
  sess_config = tf.ConfigProto(
      allow_soft_placement=True,
      log_device_placement=log_device_placement,
      intra_op_parallelism_threads=num_intra_threads,
      gpu_options=tf.GPUOptions(force_gpu_compatible=True))

  config = cifar10_utils.RunConfig(
      session_config=sess_config, model_dir=job_dir)
  # learn_runner decides (from TF_CONFIG / run_config) whether this process
  # trains, evaluates, or serves as ps, then drives the Experiment.
  tf.contrib.learn.learn_runner.run(
      get_experiment_fn(data_dir, num_gpus, variable_strategy,
                        use_distortion_for_training),
      run_config=config,
      hparams=tf.contrib.training.HParams(
          is_chief=config.is_chief,
          **hparams))
+
+
if __name__ == '__main__':
  # Command-line entry point: parse flags, validate combinations, run main().
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--data-dir',
      type=str,
      required=True,
      help='The directory where the CIFAR-10 input data is stored.')
  parser.add_argument(
      '--job-dir',
      type=str,
      required=True,
      help='The directory where the model will be stored.')
  parser.add_argument(
      '--variable-strategy',
      choices=['CPU', 'GPU'],
      type=str,
      default='CPU',
      help='Where to locate variable operations')
  parser.add_argument(
      '--num-gpus',
      type=int,
      default=1,
      help='The number of gpus used. Uses only CPU if set to 0.')
  parser.add_argument(
      '--num-layers',
      type=int,
      default=44,
      help='The number of layers of the model.')
  parser.add_argument(
      '--train-steps',
      type=int,
      default=80000,
      help='The number of steps to use for training.')
  parser.add_argument(
      '--train-batch-size',
      type=int,
      default=128,
      help='Batch size for training.')
  parser.add_argument(
      '--eval-batch-size',
      type=int,
      default=100,
      help='Batch size for validation.')
  parser.add_argument(
      '--momentum',
      type=float,
      default=0.9,
      help='Momentum for MomentumOptimizer.')
  parser.add_argument(
      '--weight-decay',
      type=float,
      default=2e-4,
      help='Weight decay for convolutions.')
  parser.add_argument(
      '--learning-rate',
      type=float,
      default=0.1,
      help="""\
      This is the initial learning rate value. The learning rate will decrease
      during training. For more details check the model_fn implementation in
      this file.\
      """)
  # NOTE(review): argparse `type=bool` is a known trap — any non-empty string
  # (including "False") parses as True. Only the default / empty string give
  # the expected behavior; consider a str-to-bool converter.
  parser.add_argument(
      '--use-distortion-for-training',
      type=bool,
      default=True,
      help='If doing image distortion for training.')
  parser.add_argument(
      '--sync',
      action='store_true',
      default=False,
      help="""\
      If present when running in a distributed environment will run on sync mode.\
      """)
  parser.add_argument(
      '--num-intra-threads',
      type=int,
      default=0,
      help="""\
      Number of threads to use for intra-op parallelism. When training on CPU
      set to 0 to have the system pick the appropriate number or alternatively
      set it to the number of physical CPU cores.\
      """)
  parser.add_argument(
      '--num-inter-threads',
      type=int,
      default=0,
      help="""\
      Number of threads to use for inter-op parallelism. If set to 0, the
      system will pick an appropriate number.\
      """)
  parser.add_argument(
      '--data-format',
      type=str,
      default=None,
      help="""\
      If not set, the data format best for the training device is used.
      Allowed values: channels_first (NCHW) channels_last (NHWC).\
      """)
  parser.add_argument(
      '--log-device-placement',
      action='store_true',
      default=False,
      help='Whether to log device placement.')
  parser.add_argument(
      '--batch-norm-decay',
      type=float,
      default=0.997,
      help='Decay for batch norm.')
  parser.add_argument(
      '--batch-norm-epsilon',
      type=float,
      default=1e-5,
      help='Epsilon for batch norm.')
  args = parser.parse_args()

  # Validate flag combinations before building any graph.
  if args.num_gpus > 0:
    assert tf.test.is_gpu_available(), "Requested GPUs but none found."
  if args.num_gpus < 0:
    raise ValueError(
        'Invalid GPU count: \"--num-gpus\" must be 0 or a positive integer.')
  if args.num_gpus == 0 and args.variable_strategy == 'GPU':
    # Fixed: original concatenation was missing a space, producing the
    # message "... Set--variable-strategy=CPU.".
    raise ValueError('num-gpus=0, CPU must be used as parameter server. Set '
                     '--variable-strategy=CPU.')
  # The ResNet depth must satisfy num_layers = 6n + 2 (see ResNetCifar10).
  if (args.num_layers - 2) % 6 != 0:
    raise ValueError('Invalid --num-layers parameter.')
  # Batches are split evenly across GPU towers, so they must divide exactly.
  if args.num_gpus != 0 and args.train_batch_size % args.num_gpus != 0:
    raise ValueError('--train-batch-size must be multiple of --num-gpus.')
  if args.num_gpus != 0 and args.eval_batch_size % args.num_gpus != 0:
    raise ValueError('--eval-batch-size must be multiple of --num-gpus.')

  # Flag names use dashes; argparse stores them with underscores, matching
  # main()'s keyword parameters. Extra flags flow into **hparams.
  main(**vars(args))
http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_model.py
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_model.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_model.py
new file mode 100644
index 0000000..d67c233
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_model.py
@@ -0,0 +1,80 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model class for Cifar10 Dataset."""
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+import model_base
+
+
class ResNetCifar10(model_base.ResNet):
  """ResNet v1 with basic (non-bottleneck) residual blocks for CIFAR-10."""

  def __init__(self,
               num_layers,
               is_training,
               batch_norm_decay,
               batch_norm_epsilon,
               data_format='channels_first'):
    super(ResNetCifar10, self).__init__(
        is_training, data_format, batch_norm_decay, batch_norm_epsilon)
    # Residual blocks per stage for a (6n + 2)-layer network.
    self.n = (num_layers - 2) // 6
    # Add one in case label starts with 1. No impact if label starts with 0.
    self.num_classes = 10 + 1
    self.filters = [16, 16, 32, 64]
    self.strides = [1, 2, 2]

  def forward_pass(self, x, input_data_format='channels_last'):
    """Build the core model within the graph."""
    if self._data_format != input_data_format:
      # Transpose the input into the layout the computation expects.
      if input_data_format == 'channels_last':
        # Computation requires channels_first.
        x = tf.transpose(x, [0, 3, 1, 2])
      else:
        # Computation requires channels_last.
        x = tf.transpose(x, [0, 2, 3, 1])

    # Image standardization: rescale pixel values from [0, 255] to [-1, 1).
    x = x / 128 - 1

    # Initial 3x3 convolution followed by batch norm and ReLU.
    x = self._relu(self._batch_norm(self._conv(x, 3, 16, 1)))

    # Use basic (non-bottleneck) block and ResNet V1 (post-activation).
    res_func = self._residual_v1

    # Three stages of stacked residual blocks.
    for stage in range(3):
      with tf.name_scope('stage'):
        for block in range(self.n):
          if block == 0:
            # First block in a stage: filters and strides may change.
            x = res_func(x, 3, self.filters[stage], self.filters[stage + 1],
                         self.strides[stage])
          else:
            # Remaining blocks keep constant filters and unit stride.
            x = res_func(x, 3, self.filters[stage + 1],
                         self.filters[stage + 1], 1)

    x = self._global_avg_pool(x)
    return self._fully_connected(x, self.num_classes)
http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_utils.py
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_utils.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_utils.py
new file mode 100644
index 0000000..7ecb50a
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/cifar10_utils.py
@@ -0,0 +1,154 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import collections
+import six
+
+import tensorflow as tf
+
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.core.framework import node_def_pb2
+from tensorflow.python.framework import device as pydev
+from tensorflow.python.training import basic_session_run_hooks
+from tensorflow.python.training import session_run_hook
+from tensorflow.python.training import training_util
+from tensorflow.python.training import device_setter
+from tensorflow.contrib.learn.python.learn import run_config
+
+
# TODO(b/64848083) Remove once uid bug is fixed
class RunConfig(tf.contrib.learn.RunConfig):
  def uid(self, whitelist=None):
    """Generates a 'Unique Identifier' based on all internal fields.

    Callers may use the uid string to check `RunConfig` instance integrity
    within one session, but must not rely on the implementation details,
    which are subject to change.

    Args:
      whitelist: A list of the string names of the properties uid should not
        include. If `None`, defaults to `_DEFAULT_UID_WHITE_LIST`, which
        includes most properties users are allowed to change.

    Returns:
      A uid string.
    """
    if whitelist is None:
      whitelist = run_config._DEFAULT_UID_WHITE_LIST

    # Snapshot every non-dunder internal field of the config.
    state = {k: v for k, v in self.__dict__.items() if not k.startswith('__')}
    # Whitelisted properties are stored with a leading underscore; drop them.
    for name in whitelist:
      state.pop('_' + name, None)

    ordered_state = collections.OrderedDict(
        sorted(state.items(), key=lambda item: item[0]))
    # Class instances without __repr__ would serialize as object addresses;
    # expand the cluster spec into a deterministic, sorted dict instead.
    if '_cluster_spec' in ordered_state:
      ordered_state['_cluster_spec'] = collections.OrderedDict(
          sorted(ordered_state['_cluster_spec'].as_dict().items(),
                 key=lambda item: item[0]))
    return ', '.join(
        '%s=%r' % (k, v) for (k, v) in six.iteritems(ordered_state))
+
+
class ExamplesPerSecondHook(session_run_hook.SessionRunHook):
  """Hook to print out examples per second.

  Total time is tracked and then divided by the total number of steps
  to get the average step time; batch_size is then used to determine
  the running average of examples per second. The examples per second for
  the most recent interval is also logged.
  """

  def __init__(self, batch_size, every_n_steps=100, every_n_secs=None):
    """Initializer for ExamplesPerSecondHook.

    Args:
      batch_size: Total batch size used to calculate examples/second from
        global time.
      every_n_steps: Log stats every n steps.
      every_n_secs: Log stats every n seconds.

    Raises:
      ValueError: Unless exactly one of every_n_steps and every_n_secs
        is provided.
    """
    if (every_n_steps is None) == (every_n_secs is None):
      raise ValueError('exactly one of every_n_steps'
                       ' and every_n_secs should be provided.')
    self._timer = basic_session_run_hooks.SecondOrStepTimer(
        every_steps=every_n_steps, every_secs=every_n_secs)

    # Accumulated wall-clock training time and step count across intervals.
    self._step_train_time = 0
    self._total_steps = 0
    self._batch_size = batch_size

  def begin(self):
    # The global step tensor is how the hook measures training progress.
    self._global_step_tensor = training_util.get_global_step()
    if self._global_step_tensor is None:
      raise RuntimeError(
          'Global step should be created to use StepCounterHook.')

  def before_run(self, run_context):  # pylint: disable=unused-argument
    # Ask the session to fetch the current global step on every run.
    return basic_session_run_hooks.SessionRunArgs(self._global_step_tensor)

  def after_run(self, run_context, run_values):
    del run_context  # unused

    global_step = run_values.results
    if not self._timer.should_trigger_for_step(global_step):
      return
    elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(
        global_step)
    if elapsed_time is None:
      return
    steps_per_sec = elapsed_steps / elapsed_time
    self._step_train_time += elapsed_time
    self._total_steps += elapsed_steps

    average_examples_per_sec = self._batch_size * (
        self._total_steps / self._step_train_time)
    current_examples_per_sec = steps_per_sec * self._batch_size
    # Average examples/sec followed by current examples/sec.
    logging.info('%s: %g (%g), step = %g', 'Average examples/sec',
                 average_examples_per_sec, current_examples_per_sec,
                 self._total_steps)
+
def local_device_setter(num_devices=1,
                        ps_device_type='cpu',
                        worker_device='/cpu:0',
                        ps_ops=None,
                        ps_strategy=None):
  """Returns a device chooser for single-host in-graph replication.

  Ops whose type appears in `ps_ops` (variable ops) are distributed across
  `num_devices` parameter-server devices according to `ps_strategy`; every
  other op is placed on `worker_device`.

  Args:
    num_devices: Number of parameter-server devices to distribute over.
    ps_device_type: Device type string for variable ops, e.g. 'cpu' or 'gpu'.
    ps_ops: List of op type names treated as variables. Defaults to the
      standard TensorFlow variable op types.
    worker_device: Device string for all non-variable ops.
    ps_strategy: Callable mapping an op to a device index. Defaults to
      round-robin over `num_devices`.

  Returns:
    A function suitable as the argument of `tf.device()`.

  Raises:
    TypeError: If `ps_strategy` is provided but not callable.
  """
  # Fixed: was `ps_ops == None`; identity comparison is the correct idiom
  # for None checks (PEP 8) and avoids invoking a custom __eq__.
  if ps_ops is None:
    ps_ops = ['Variable', 'VariableV2', 'VarHandleOp']

  if ps_strategy is None:
    ps_strategy = device_setter._RoundRobinStrategy(num_devices)
  if not six.callable(ps_strategy):
    raise TypeError("ps_strategy must be callable")

  def _local_device_chooser(op):
    # Preserve any device constraints already present on the op.
    current_device = pydev.DeviceSpec.from_string(op.device or "")

    node_def = op if isinstance(op, node_def_pb2.NodeDef) else op.node_def
    if node_def.op in ps_ops:
      # Variable op: pick a parameter-server device via the strategy.
      ps_device_spec = pydev.DeviceSpec.from_string(
          '/{}:{}'.format(ps_device_type, ps_strategy(op)))

      ps_device_spec.merge_from(current_device)
      return ps_device_spec.to_string()
    else:
      # Everything else runs on the worker device.
      worker_device_spec = pydev.DeviceSpec.from_string(worker_device or "")
      worker_device_spec.merge_from(current_device)
      return worker_device_spec.to_string()
  return _local_device_chooser
http://git-wip-us.apache.org/repos/asf/hadoop/blob/19ad5be6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/generate_cifar10_tfrecords.py
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/generate_cifar10_tfrecords.py b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/generate_cifar10_tfrecords.py
new file mode 100644
index 0000000..409cee4
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/src/main/docker/with-cifar10-models/ubuntu-16.04/cifar10_estimator_tf_1.8.0/generate_cifar10_tfrecords.py
@@ -0,0 +1,114 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Read CIFAR-10 data from pickled numpy arrays and writes TFRecords.
+
+Generates tf.train.Example protos and writes them to TFRecord files from the
+python version of the CIFAR-10 dataset downloaded from
+https://www.cs.toronto.edu/~kriz/cifar.html.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import os
+
+import tarfile
+from six.moves import cPickle as pickle
+from six.moves import xrange # pylint: disable=redefined-builtin
+import tensorflow as tf
+
+CIFAR_FILENAME = 'cifar-10-python.tar.gz'
+CIFAR_DOWNLOAD_URL = 'https://www.cs.toronto.edu/~kriz/' + CIFAR_FILENAME
+CIFAR_LOCAL_FOLDER = 'cifar-10-batches-py'
+
+
def download_and_extract(data_dir):
  """Downloads CIFAR-10 into `data_dir` (if absent) and unpacks the tarball.

  Args:
    data_dir: Directory that receives the downloaded archive and its
      extracted contents.
  """
  # Download CIFAR-10 if not already downloaded.
  tf.contrib.learn.datasets.base.maybe_download(CIFAR_FILENAME, data_dir,
                                                CIFAR_DOWNLOAD_URL)
  # Fixed: the original never closed the tarfile, leaking the file handle
  # until garbage collection; the context manager closes it deterministically.
  with tarfile.open(os.path.join(data_dir, CIFAR_FILENAME), 'r:gz') as tar:
    tar.extractall(data_dir)
+
+
def _int64_feature(value):
  """Wraps a single integer in a tf.train.Feature holding an Int64List."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)
+
+
def _bytes_feature(value):
  """Wraps a single bytes value in a tf.train.Feature holding a BytesList."""
  bytes_list = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=bytes_list)
+
+
+def _get_file_names():
+ """Returns the file names expected to exist in the input_dir."""
+ file_names = {}
+ file_names['train'] = ['data_batch_%d' % i for i in xrange(1, 5)]
+ file_names['validation'] = ['data_batch_5']
+ file_names['eval'] = ['test_batch']
+ return file_names
+
+
def read_pickle_from_file(filename):
  """Loads and returns one pickled CIFAR-10 batch dict from `filename`."""
  # tf.gfile.Open works for both local paths and supported filesystems.
  with tf.gfile.Open(filename, 'rb') as pickle_file:
    return pickle.load(pickle_file)
+
+
def convert_to_tfrecord(input_files, output_file):
  """Converts a file to TFRecords."""
  print('Generating %s' % output_file)
  with tf.python_io.TFRecordWriter(output_file) as record_writer:
    for input_file in input_files:
      batch = read_pickle_from_file(input_file)
      images = batch['data']
      labels = batch['labels']
      # One tf.train.Example per image/label pair in the batch.
      for index, label in enumerate(labels):
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'image': _bytes_feature(images[index].tobytes()),
                'label': _int64_feature(label)
            }))
        record_writer.write(example.SerializeToString())
+
+
def main(data_dir):
  """Downloads CIFAR-10 and rewrites each split as a .tfrecords file.

  Args:
    data_dir: Directory used both for the download and for the generated
      TFRecord output files.
  """
  print('Download from {} and extract.'.format(CIFAR_DOWNLOAD_URL))
  download_and_extract(data_dir)
  file_names = _get_file_names()
  input_dir = os.path.join(data_dir, CIFAR_LOCAL_FOLDER)
  for mode, files in file_names.items():
    input_files = [os.path.join(input_dir, name) for name in files]
    output_file = os.path.join(data_dir, mode + '.tfrecords')
    try:
      os.remove(output_file)
    except OSError:
      # A missing previous output file is expected; real I/O problems will
      # surface when the writer below opens the file.
      pass
    # Convert to tf.train.Example and write them to TFRecords.
    convert_to_tfrecord(input_files, output_file)
  print('Done!')
+
+
if __name__ == '__main__':
  # Command-line entry point: the only option is the working directory.
  arg_parser = argparse.ArgumentParser()
  arg_parser.add_argument(
      '--data-dir',
      type=str,
      default='',
      help='Directory to download and extract CIFAR-10 to.')

  main(arg_parser.parse_args().data_dir)
---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org