Posted to dev@submarine.apache.org by li...@apache.org on 2020/03/05 01:10:29 UTC
[submarine] branch master updated: SUBMARINE-337. MXNET example in mini-submarine
This is an automated email from the ASF dual-hosted git repository.
liuxun pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/submarine.git
The following commit(s) were added to refs/heads/master by this push:
new a2d9715 SUBMARINE-337. MXNET example in mini-submarine
a2d9715 is described below
commit a2d9715ed1517211ec2783f5c0b12998a3ace2cc
Author: Ryan Lo <lo...@gmail.com>
AuthorDate: Tue Mar 3 12:26:09 2020 +0800
SUBMARINE-337. MXNET example in mini-submarine
### What is this PR for?
SUBMARINE-337. Add an MXNet example in mini-submarine
### What type of PR is it?
[ Improvement ]
### Todos
* [ ] - Task
### What is the Jira issue?
[SUBMARINE-337](https://issues.apache.org/jira/projects/SUBMARINE/issues/SUBMARINE-337)
### How should this be tested?
[passed CI](https://travis-ci.org/lowc1012/submarine/builds/657377080)
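Beyond CI, the example can be exercised inside mini-submarine by rebuilding the venv (which now pins `mxnet==1.5.1`) and submitting the job with the new `run_submarine_mxnet_cifar10_tony.sh` script. A quick, illustrative sanity check of the MXNet wheel inside the venv (not part of this patch) might look like:

```python
# Hypothetical smoke test: confirm the pinned MXNet wheel imports and can run
# a tiny CPU forward pass before submitting the full YARN job.
import mxnet as mx
from mxnet import nd
from mxnet.gluon import nn

net = nn.Dense(10)
net.initialize()
out = net(nd.random.uniform(shape=(4, 32)))
print(mx.__version__, out.shape)  # expect: 1.5.1 (4, 10)
```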
### Screenshots (if appropriate)
![Screenshot 2020-03-02 10.17.11 PM](https://user-images.githubusercontent.com/52355146/75737512-e1713300-5d3a-11ea-9db3-88a214d94290.png)
### Questions:
* Do the license files need updating? No
* Are there breaking changes for older versions? No
* Does this need documentation? No
Author: Ryan Lo <lo...@gmail.com>
Closes #200 from lowc1012/SUBMARINE-337 and squashes the following commits:
4c5a741 [Ryan Lo] SUBMARINE-337. remove and update some comments
9cbf53f [Ryan Lo] SUBMARINE-337. MXNET example in mini-submarine
---
dev-support/mini-submarine/Dockerfile | 5 +
.../submarine/build_python_virtual_env.sh | 1 +
.../submarine/image_classification.py | 465 +++++++++++++++++++++
.../submarine/run_submarine_mxnet_cifar10_tony.sh | 63 +++
4 files changed, 534 insertions(+)
diff --git a/dev-support/mini-submarine/Dockerfile b/dev-support/mini-submarine/Dockerfile
index face9c7..dbe172b 100644
--- a/dev-support/mini-submarine/Dockerfile
+++ b/dev-support/mini-submarine/Dockerfile
@@ -38,6 +38,11 @@ RUN \
apt-get update && \
apt-get -y install vim
+# Install libgomp1 for MXNet
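+# (the prebuilt MXNet pip wheel dynamically links against libgomp.so.1, the GNU OpenMP runtime)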
+RUN \
+ apt-get update && \
+ apt-get -y install libgomp1
+
#INSTALL HADOOP
# Add native libs
ARG HADOOP_VERSION=
diff --git a/dev-support/mini-submarine/submarine/build_python_virtual_env.sh b/dev-support/mini-submarine/submarine/build_python_virtual_env.sh
index 6ecd1c8..0596603 100755
--- a/dev-support/mini-submarine/submarine/build_python_virtual_env.sh
+++ b/dev-support/mini-submarine/submarine/build_python_virtual_env.sh
@@ -23,6 +23,7 @@ python3 virtualenv-16.0.0/virtualenv.py venv
pip3 install tensorflow==1.13.1
pip3 install torch==0.4.1
pip3 install torchvision==0.1.8
+pip3 install mxnet==1.5.1
pip3 install /opt/pysubmarine/.
zip -r myvenv.zip venv
deactivate
diff --git a/dev-support/mini-submarine/submarine/image_classification.py b/dev-support/mini-submarine/submarine/image_classification.py
new file mode 100644
index 0000000..14eb68c
--- /dev/null
+++ b/dev-support/mini-submarine/submarine/image_classification.py
@@ -0,0 +1,465 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+
+import argparse
+import time
+import os
+import logging
+import random
+import tarfile
+
+import mxnet as mx
+from mxnet import gluon
+from mxnet import profiler
+from mxnet.gluon import nn
+from mxnet.gluon.model_zoo import vision as models
+from mxnet.gluon.data.vision import ImageFolderDataset
+from mxnet.gluon.data import DataLoader
+from mxnet.contrib.io import DataLoaderIter
+from mxnet import autograd as ag
+from mxnet.test_utils import get_mnist_iterator, get_cifar10
+from mxnet.metric import Accuracy, TopKAccuracy, CompositeEvalMetric
+import numpy as np
+
+# logging
+logging.basicConfig(level=logging.INFO)
+fh = logging.FileHandler('image-classification.log')
+logger = logging.getLogger()
+logger.addHandler(fh)
+formatter = logging.Formatter('%(message)s')
+fh.setFormatter(formatter)
+fh.setLevel(logging.DEBUG)
+logging.debug('\n%s', '-' * 100)
+formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
+fh.setFormatter(formatter)
+
+# CLI
+parser = argparse.ArgumentParser(description='Train a model for image classification.')
+parser.add_argument('--dataset', type=str, default='cifar10',
+ help='dataset to use. options are mnist, cifar10, caltech101, imagenet and dummy.')
+parser.add_argument('--data-dir', type=str, default='',
+ help='training directory of imagenet images, contains train/val subdirs.')
+parser.add_argument('--num-worker', '-j', dest='num_workers', default=4, type=int,
+ help='number of workers for dataloader')
+parser.add_argument('--batch-size', type=int, default=32,
+ help='training batch size per device (CPU/GPU).')
+parser.add_argument('--gpus', type=str, default='',
+                    help='ids of gpus to use, e.g. "0,1,2"; empty for cpu only.')
+parser.add_argument('--epochs', type=int, default=120,
+ help='number of training epochs.')
+parser.add_argument('--lr', type=float, default=0.1,
+ help='learning rate. default is 0.1.')
+parser.add_argument('--momentum', type=float, default=0.9,
+ help='momentum value for optimizer, default is 0.9.')
+parser.add_argument('--wd', type=float, default=0.0001,
+ help='weight decay rate. default is 0.0001.')
+parser.add_argument('--seed', type=int, default=123,
+ help='random seed to use. Default=123.')
+parser.add_argument('--mode', type=str,
+ help='mode in which to train the model. options are symbolic, imperative, hybrid')
+parser.add_argument('--model', type=str, required=True,
+ help='type of model to use. see vision_model for options.')
+parser.add_argument('--use_thumbnail', action='store_true',
+ help='use thumbnail or not in resnet. default is false.')
+parser.add_argument('--batch-norm', action='store_true',
+ help='enable batch normalization or not in vgg. default is false.')
+parser.add_argument('--use-pretrained', action='store_true',
+ help='enable using pretrained model from gluon.')
+parser.add_argument('--prefix', default='', type=str,
+ help='path to checkpoint prefix, default is current working dir')
+parser.add_argument('--start-epoch', default=0, type=int,
+ help='starting epoch, 0 for fresh training, > 0 to resume')
+parser.add_argument('--resume', type=str, default='',
+                    help='path to saved weights from which to resume training')
+parser.add_argument('--lr-factor', default=0.1, type=float,
+ help='learning rate decay ratio')
+parser.add_argument('--lr-steps', default='30,60,90', type=str,
+                    help='comma-separated list of epochs at which to decay the learning rate')
+parser.add_argument('--dtype', default='float32', type=str,
+ help='data type, float32 or float16 if applicable')
+parser.add_argument('--save-frequency', default=10, type=int,
+                    help='how often (in epochs) to save the model; the best model is always saved')
+parser.add_argument('--kvstore', type=str, default='device',
+ help='kvstore to use for trainer/module.')
+parser.add_argument('--log-interval', type=int, default=50,
+ help='Number of batches to wait before logging.')
+parser.add_argument('--profile', action='store_true',
+                    help='turn on memory profiling for the front end and print '
+                         'memory usage by Python function at the end of the run.')
+parser.add_argument('--builtin-profiler', type=int, default=0, help='Enable built-in profiler (0=off, 1=on)')
+opt = parser.parse_args()
+
+# global variables
+logger.info('Starting new image-classification task: %s', opt)
+mx.random.seed(opt.seed)
+model_name = opt.model
+dataset_classes = {'mnist': 10, 'cifar10': 10, 'caltech101':101, 'imagenet': 1000, 'dummy': 1000}
+batch_size, dataset, classes = opt.batch_size, opt.dataset, dataset_classes[opt.dataset]
+context = [mx.gpu(int(i)) for i in opt.gpus.split(',')] if opt.gpus.strip() else [mx.cpu()]
+num_gpus = len(context)
+batch_size *= max(1, num_gpus)
+lr_steps = [int(x) for x in opt.lr_steps.split(',') if x.strip()]
+metric = CompositeEvalMetric([Accuracy(), TopKAccuracy(5)])
+kv = mx.kv.create(opt.kvstore)
+
+
+def get_cifar10_iterator(batch_size, data_shape, resize=-1, num_parts=1, part_index=0):
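+    """Download the CIFAR-10 RecordIO files (via get_cifar10) and return (train, val)
+    ImageRecordIter objects, sharded by num_parts/part_index for distributed training."""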
+ get_cifar10()
+
+ train = mx.io.ImageRecordIter(
+ path_imgrec = "data/cifar/train.rec",
+ resize = resize,
+ data_shape = data_shape,
+ batch_size = batch_size,
+ rand_crop = True,
+ rand_mirror = True,
+ num_parts=num_parts,
+ part_index=part_index)
+
+ val = mx.io.ImageRecordIter(
+ path_imgrec = "data/cifar/test.rec",
+ resize = resize,
+ rand_crop = False,
+ rand_mirror = False,
+ data_shape = data_shape,
+ batch_size = batch_size,
+ num_parts=num_parts,
+ part_index=part_index)
+
+ return train, val
+
+def get_imagenet_transforms(data_shape=224, dtype='float32'):
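+    """Return (train, val) transforms: random-resized crop plus horizontal flip for
+    training, resize plus center crop for validation; both ImageNet-normalized and cast to dtype."""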
+ def train_transform(image, label):
+ image, _ = mx.image.random_size_crop(image, (data_shape, data_shape), 0.08, (3/4., 4/3.))
+ image = mx.nd.image.random_flip_left_right(image)
+ image = mx.nd.image.to_tensor(image)
+ image = mx.nd.image.normalize(image, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
+ return mx.nd.cast(image, dtype), label
+
+ def val_transform(image, label):
+ image = mx.image.resize_short(image, data_shape + 32)
+ image, _ = mx.image.center_crop(image, (data_shape, data_shape))
+ image = mx.nd.image.to_tensor(image)
+ image = mx.nd.image.normalize(image, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
+ return mx.nd.cast(image, dtype), label
+ return train_transform, val_transform
+
+def get_imagenet_iterator(root, batch_size, num_workers, data_shape=224, dtype='float32'):
+ """Dataset loader with preprocessing."""
+ train_dir = os.path.join(root, 'train')
+ train_transform, val_transform = get_imagenet_transforms(data_shape, dtype)
+ logging.info("Loading image folder %s, this may take a bit long...", train_dir)
+ train_dataset = ImageFolderDataset(train_dir, transform=train_transform)
+ train_data = DataLoader(train_dataset, batch_size, shuffle=True,
+ last_batch='discard', num_workers=num_workers)
+ val_dir = os.path.join(root, 'val')
+ if not os.path.isdir(os.path.expanduser(os.path.join(root, 'val', 'n01440764'))):
+ user_warning = 'Make sure validation images are stored in one subdir per category, a helper script is available at https://git.io/vNQv1'
+ raise ValueError(user_warning)
+ logging.info("Loading image folder %s, this may take a bit long...", val_dir)
+ val_dataset = ImageFolderDataset(val_dir, transform=val_transform)
+ val_data = DataLoader(val_dataset, batch_size, last_batch='keep', num_workers=num_workers)
+ return DataLoaderIter(train_data, dtype), DataLoaderIter(val_data, dtype)
+
+def get_caltech101_data():
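+    """Download and extract the Caltech-101 archive under ./data (if needed) and
+    return (training_path, testing_path)."""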
+ url = "https://s3.us-east-2.amazonaws.com/mxnet-public/101_ObjectCategories.tar.gz"
+ dataset_name = "101_ObjectCategories"
+ data_folder = "data"
+ if not os.path.isdir(data_folder):
+ os.makedirs(data_folder)
+ tar_path = mx.gluon.utils.download(url, path=data_folder)
+ if (not os.path.isdir(os.path.join(data_folder, "101_ObjectCategories")) or
+ not os.path.isdir(os.path.join(data_folder, "101_ObjectCategories_test"))):
+ tar = tarfile.open(tar_path, "r:gz")
+ tar.extractall(data_folder)
+ tar.close()
+ print('Data extracted')
+ training_path = os.path.join(data_folder, dataset_name)
+ testing_path = os.path.join(data_folder, "{}_test".format(dataset_name))
+ return training_path, testing_path
+
+def get_caltech101_iterator(batch_size, num_workers, dtype):
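+    """Return (train, test) DataLoaderIter objects over the Caltech-101 image folders,
+    resized and center-cropped to 224x224 and transposed to CHW."""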
+ def transform(image, label):
+ # resize the shorter edge to 224, the longer edge will be greater or equal to 224
+ resized = mx.image.resize_short(image, 224)
+ # center and crop an area of size (224,224)
+ cropped, crop_info = mx.image.center_crop(resized, (224, 224))
+ # transpose the channels to be (3,224,224)
+ transposed = mx.nd.transpose(cropped, (2, 0, 1))
+ return transposed, label
+
+ training_path, testing_path = get_caltech101_data()
+ dataset_train = ImageFolderDataset(root=training_path, transform=transform)
+ dataset_test = ImageFolderDataset(root=testing_path, transform=transform)
+
+ train_data = DataLoader(dataset_train, batch_size, shuffle=True, num_workers=num_workers)
+ test_data = DataLoader(dataset_test, batch_size, shuffle=False, num_workers=num_workers)
+ return DataLoaderIter(train_data), DataLoaderIter(test_data)
+
+class DummyIter(mx.io.DataIter):
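+    """DataIter that yields the same all-zero batch a fixed number of times per
+    epoch; backs the 'dummy' dataset option for benchmarking without real data."""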
+ def __init__(self, batch_size, data_shape, batches = 100):
+ super(DummyIter, self).__init__(batch_size)
+ self.data_shape = (batch_size,) + data_shape
+ self.label_shape = (batch_size,)
+ self.provide_data = [('data', self.data_shape)]
+ self.provide_label = [('softmax_label', self.label_shape)]
+ self.batch = mx.io.DataBatch(data=[mx.nd.zeros(self.data_shape)],
+ label=[mx.nd.zeros(self.label_shape)])
+ self._batches = 0
+ self.batches = batches
+
+ def next(self):
+ if self._batches < self.batches:
+ self._batches += 1
+ return self.batch
+ else:
+ self._batches = 0
+ raise StopIteration
+
+def dummy_iterator(batch_size, data_shape):
+ return DummyIter(batch_size, data_shape), DummyIter(batch_size, data_shape)
+
+class ImagePairIter(mx.io.DataIter):
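+    """DataIter yielding (input, target) pairs built from the same image (luminance
+    channel only) with separate augmentations; not wired to any --dataset option in this script."""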
+ def __init__(self, path, data_shape, label_shape, batch_size=64, flag=0, input_aug=None, target_aug=None):
+ super(ImagePairIter, self).__init__(batch_size)
+ self.data_shape = (batch_size,) + data_shape
+ self.label_shape = (batch_size,) + label_shape
+ self.input_aug = input_aug
+ self.target_aug = target_aug
+ self.provide_data = [('data', self.data_shape)]
+ self.provide_label = [('label', self.label_shape)]
+ is_image_file = lambda fn: any(fn.endswith(ext) for ext in [".png", ".jpg", ".jpeg"])
+ self.filenames = [os.path.join(path, x) for x in os.listdir(path) if is_image_file(x)]
+ self.count = 0
+ self.flag = flag
+ random.shuffle(self.filenames)
+
+ def next(self):
+ from PIL import Image
+ if self.count + self.batch_size <= len(self.filenames):
+ data = []
+ label = []
+ for i in range(self.batch_size):
+ fn = self.filenames[self.count]
+ self.count += 1
+ image = Image.open(fn).convert('YCbCr').split()[0]
+ if image.size[0] > image.size[1]:
+ image = image.transpose(Image.TRANSPOSE)
+ image = mx.nd.expand_dims(mx.nd.array(image), axis=2)
+ target = image.copy()
+ for aug in self.input_aug:
+ image = aug(image)
+ for aug in self.target_aug:
+ target = aug(target)
+ data.append(image)
+ label.append(target)
+
+ data = mx.nd.concat(*[mx.nd.expand_dims(d, axis=0) for d in data], dim=0)
+ label = mx.nd.concat(*[mx.nd.expand_dims(d, axis=0) for d in label], dim=0)
+ data = [mx.nd.transpose(data, axes=(0, 3, 1, 2)).astype('float32')/255]
+ label = [mx.nd.transpose(label, axes=(0, 3, 1, 2)).astype('float32')/255]
+
+ return mx.io.DataBatch(data=data, label=label)
+ else:
+ raise StopIteration
+
+ def reset(self):
+ self.count = 0
+ random.shuffle(self.filenames)
+
+def get_model(model, ctx, opt):
+ """Model initialization."""
+ kwargs = {'ctx': ctx, 'pretrained': opt.use_pretrained, 'classes': classes}
+ if model.startswith('resnet'):
+ kwargs['thumbnail'] = opt.use_thumbnail
+ elif model.startswith('vgg'):
+ kwargs['batch_norm'] = opt.batch_norm
+
+ net = models.get_model(model, **kwargs)
+ if opt.resume:
+ net.load_parameters(opt.resume)
+ elif not opt.use_pretrained:
+ if model in ['alexnet']:
+ net.initialize(mx.init.Normal())
+ else:
+ net.initialize(mx.init.Xavier(magnitude=2))
+ net.cast(opt.dtype)
+ return net
+
+net = get_model(opt.model, context, opt)
+
+def get_data_iters(dataset, batch_size, opt):
+ """get dataset iterators"""
+ if dataset == 'mnist':
+ train_data, val_data = get_mnist_iterator(batch_size, (1, 28, 28),
+ num_parts=kv.num_workers, part_index=kv.rank)
+ elif dataset == 'cifar10':
+ train_data, val_data = get_cifar10_iterator(batch_size, (3, 32, 32),
+ num_parts=kv.num_workers, part_index=kv.rank)
+ elif dataset == 'imagenet':
+ shape_dim = 299 if model_name == 'inceptionv3' else 224
+
+ if not opt.data_dir:
+            raise ValueError('Dir containing raw images in train/val is required for imagenet. '
+                             'Please specify "--data-dir"')
+
+ train_data, val_data = get_imagenet_iterator(opt.data_dir, batch_size,
+ opt.num_workers, shape_dim, opt.dtype)
+ elif dataset == 'caltech101':
+ train_data, val_data = get_caltech101_iterator(batch_size, opt.num_workers, opt.dtype)
+ elif dataset == 'dummy':
+ shape_dim = 299 if model_name == 'inceptionv3' else 224
+ train_data, val_data = dummy_iterator(batch_size, (3, shape_dim, shape_dim))
+ return train_data, val_data
+
+def test(ctx, val_data):
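+    """Evaluate net on the validation iterator across ctx and return the metric names and values."""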
+ metric.reset()
+ val_data.reset()
+ for batch in val_data:
+ data = gluon.utils.split_and_load(batch.data[0].astype(opt.dtype, copy=False),
+ ctx_list=ctx, batch_axis=0)
+ label = gluon.utils.split_and_load(batch.label[0].astype(opt.dtype, copy=False),
+ ctx_list=ctx, batch_axis=0)
+ outputs = [net(X) for X in data]
+ metric.update(label, outputs)
+ return metric.get()
+
+def update_learning_rate(lr, trainer, epoch, ratio, steps):
+ """Set the learning rate to the initial value decayed by ratio every N epochs."""
+ new_lr = lr * (ratio ** int(np.sum(np.array(steps) < epoch)))
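+    # e.g. with lr=0.1, ratio=0.1 and steps=[30, 60, 90], epoch 45 has passed one
+    # step, so new_lr = 0.1 * 0.1**1 = 0.01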
+ trainer.set_learning_rate(new_lr)
+ return trainer
+
+def save_checkpoint(epoch, top1, best_acc):
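+    """Save parameters every --save-frequency epochs and whenever validation top-1
+    accuracy improves on the best seen so far."""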
+ if opt.save_frequency and (epoch + 1) % opt.save_frequency == 0:
+ fname = os.path.join(opt.prefix, '%s_%d_acc_%.4f.params' % (opt.model, epoch, top1))
+ net.save_parameters(fname)
+ logger.info('[Epoch %d] Saving checkpoint to %s with Accuracy: %.4f', epoch, fname, top1)
+ if top1 > best_acc[0]:
+ best_acc[0] = top1
+ fname = os.path.join(opt.prefix, '%s_best.params' % (opt.model))
+ net.save_parameters(fname)
+ logger.info('[Epoch %d] Saving checkpoint to %s with Accuracy: %.4f', epoch, fname, top1)
+
+def train(opt, ctx):
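+    """Imperative/hybrid training loop: SGD with step learning-rate decay, per-batch
+    throughput logging, validation after every epoch and periodic checkpointing."""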
+ if isinstance(ctx, mx.Context):
+ ctx = [ctx]
+
+ train_data, val_data = get_data_iters(dataset, batch_size, opt)
+ net.collect_params().reset_ctx(ctx)
+ trainer = gluon.Trainer(net.collect_params(), 'sgd',
+ optimizer_params={'learning_rate': opt.lr,
+ 'wd': opt.wd,
+ 'momentum': opt.momentum,
+ 'multi_precision': True},
+ kvstore=kv)
+ loss = gluon.loss.SoftmaxCrossEntropyLoss()
+
+ total_time = 0
+ num_epochs = 0
+ best_acc = [0]
+ for epoch in range(opt.start_epoch, opt.epochs):
+ trainer = update_learning_rate(opt.lr, trainer, epoch, opt.lr_factor, lr_steps)
+ tic = time.time()
+ train_data.reset()
+ metric.reset()
+ btic = time.time()
+ for i, batch in enumerate(train_data):
+ data = gluon.utils.split_and_load(batch.data[0].astype(opt.dtype), ctx_list=ctx, batch_axis=0)
+ label = gluon.utils.split_and_load(batch.label[0].astype(opt.dtype), ctx_list=ctx, batch_axis=0)
+ outputs = []
+ Ls = []
+ with ag.record():
+ for x, y in zip(data, label):
+ z = net(x)
+ L = loss(z, y)
+ # store the loss and do backward after we have done forward
+ # on all GPUs for better speed on multiple GPUs.
+ Ls.append(L)
+ outputs.append(z)
+ ag.backward(Ls)
+ trainer.step(batch.data[0].shape[0])
+ metric.update(label, outputs)
+ if opt.log_interval and not (i+1)%opt.log_interval:
+ name, acc = metric.get()
+ logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f, %s=%f'%(
+ epoch, i, batch_size/(time.time()-btic), name[0], acc[0], name[1], acc[1]))
+ btic = time.time()
+
+ epoch_time = time.time()-tic
+
+        # The first epoch is usually much slower than subsequent epochs,
+        # so don't factor it into the average
+        if num_epochs > 0:
+            total_time = total_time + epoch_time
+        num_epochs = num_epochs + 1
+
+        name, acc = metric.get()
+        logger.info('[Epoch %d] training: %s=%f, %s=%f'%(epoch, name[0], acc[0], name[1], acc[1]))
+        logger.info('[Epoch %d] time cost: %f'%(epoch, epoch_time))
+        name, val_acc = test(ctx, val_data)
+        logger.info('[Epoch %d] validation: %s=%f, %s=%f'%(epoch, name[0], val_acc[0], name[1], val_acc[1]))
+
+        # save the model if it meets the requirements
+        save_checkpoint(epoch, val_acc[0], best_acc)
+    if num_epochs > 1:
+        print('Average epoch time: {}'.format(float(total_time)/(num_epochs - 1)))
+
+def main():
+ if opt.builtin_profiler > 0:
+ profiler.set_config(profile_all=True, aggregate_stats=True)
+ profiler.set_state('run')
+ if opt.mode == 'symbolic':
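+        # symbolic mode: trace the gluon network into a Symbol graph and train it
+        # with the Module API instead of the gluon loop in train()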
+ data = mx.sym.var('data')
+ if opt.dtype == 'float16':
+ data = mx.sym.Cast(data=data, dtype=np.float16)
+ out = net(data)
+ if opt.dtype == 'float16':
+ out = mx.sym.Cast(data=out, dtype=np.float32)
+ softmax = mx.sym.SoftmaxOutput(out, name='softmax')
+ mod = mx.mod.Module(softmax, context=context)
+ train_data, val_data = get_data_iters(dataset, batch_size, opt)
+ mod.fit(train_data,
+ eval_data=val_data,
+ num_epoch=opt.epochs,
+ kvstore=kv,
+ batch_end_callback = mx.callback.Speedometer(batch_size, max(1, opt.log_interval)),
+ epoch_end_callback = mx.callback.do_checkpoint('image-classifier-%s'% opt.model),
+ optimizer = 'sgd',
+ optimizer_params = {'learning_rate': opt.lr, 'wd': opt.wd, 'momentum': opt.momentum, 'multi_precision': True},
+ initializer = mx.init.Xavier(magnitude=2))
+ mod.save_parameters('image-classifier-%s-%d-final.params'%(opt.model, opt.epochs))
+ else:
+ if opt.mode == 'hybrid':
+ net.hybridize()
+ train(opt, context)
+ if opt.builtin_profiler > 0:
+ profiler.set_state('stop')
+ print(profiler.dumps())
+
+if __name__ == '__main__':
+ if opt.profile:
+ import hotshot, hotshot.stats
+ prof = hotshot.Profile('image-classifier-%s-%s.prof'%(opt.model, opt.mode))
+ prof.runcall(main)
+ prof.close()
+ stats = hotshot.stats.load('image-classifier-%s-%s.prof'%(opt.model, opt.mode))
+ stats.strip_dirs()
+ stats.sort_stats('cumtime', 'calls')
+ stats.print_stats()
+ else:
+ main()
diff --git a/dev-support/mini-submarine/submarine/run_submarine_mxnet_cifar10_tony.sh b/dev-support/mini-submarine/submarine/run_submarine_mxnet_cifar10_tony.sh
new file mode 100755
index 0000000..ba5eedb
--- /dev/null
+++ b/dev-support/mini-submarine/submarine/run_submarine_mxnet_cifar10_tony.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+while [ $# -gt 0 ]; do
+ case "$1" in
+ --debug*)
+ DEBUG=$1
+ if [ -n "$2" ]; then
+ DEBUG_PORT=$2
+ shift
+ fi
+ shift
+ ;;
+ *)
+ break
+ ;;
+ esac
+done
+
+if [ "$DEBUG" ]; then
+ if [ -z "$DEBUG_PORT" ]; then
+ DEBUG_PORT=8000
+ fi
+ JAVA_CMD="java -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=${DEBUG_PORT}"
+else
+ JAVA_CMD="java"
+fi
+
+SUBMARINE_VERSION=0.4.0-SNAPSHOT
+HADOOP_VERSION=2.9
+SUBMARINE_PATH=/opt/submarine-current
+HADOOP_CONF_PATH=/usr/local/hadoop/etc/hadoop
+
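+# Submit the MXNet CIFAR-10 job to YARN via the Submarine CLI (TonY runtime):
+# one parameter server, two workers and one scheduler, all launching the same
+# training script with --kvstore dist_sync for synchronous distributed training.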
+${JAVA_CMD} -cp ${SUBMARINE_PATH}/submarine-all-${SUBMARINE_VERSION}-hadoop-${HADOOP_VERSION}.jar:${HADOOP_CONF_PATH} \
+ org.apache.submarine.client.cli.Cli job run \
+ --name mx-job-001 \
+ --framework mxnet \
+ --input_path "" \
+ --num_ps 1 \
+ --ps_resources memory=1G,vcores=1 \
+ --ps_launch_cmd "myvenv.zip/venv/bin/python image_classification.py --dataset cifar10 --model vgg11 --epochs 1 --kvstore dist_sync" \
+ --num_workers 2 \
+ --worker_resources memory=2G,vcores=1 \
+ --worker_launch_cmd "myvenv.zip/venv/bin/python image_classification.py --dataset cifar10 --model vgg11 --epochs 1 --kvstore dist_sync" \
+ --num_schedulers 1 \
+ --scheduler_resources memory=1G,vcores=1 \
+ --scheduler_launch_cmd "myvenv.zip/venv/bin/python image_classification.py --dataset cifar10 --model vgg11 --epochs 1 --kvstore dist_sync" \
+ --insecure \
+ --verbose \
+ --conf tony.containers.resources=/home/yarn/submarine/myvenv.zip#archive,/home/yarn/submarine/image_classification.py,${SUBMARINE_PATH}/submarine-all-${SUBMARINE_VERSION}-hadoop-${HADOOP_VERSION}.jar