You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by na...@apache.org on 2017/09/28 22:30:01 UTC

systemml git commit: [MINOR] bug fixes & feature added in perf test & spark-submit python scripts

Repository: systemml
Updated Branches:
  refs/heads/master 0cb2f7f68 -> a725b2d2e


[MINOR] bug fixes & feature added in perf test & spark-submit python scripts


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/a725b2d2
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/a725b2d2
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/a725b2d2

Branch: refs/heads/master
Commit: a725b2d2ebf6dcb56f4edb68376c3849c8991b27
Parents: 0cb2f7f
Author: Nakul Jindal <na...@gmail.com>
Authored: Thu Sep 28 15:28:17 2017 -0700
Committer: Nakul Jindal <na...@gmail.com>
Committed: Thu Sep 28 15:28:17 2017 -0700

----------------------------------------------------------------------
 bin/systemml-spark-submit.py            | 30 ++++++++-----
 scripts/perftest/python/datagen.py      |  2 +-
 scripts/perftest/python/predict.py      |  2 +-
 scripts/perftest/python/run_perftest.py | 19 ++++++---
 scripts/perftest/python/train.py        |  2 +-
 scripts/perftest/python/utils_misc.py   | 63 +++++++++++++++-------------
 6 files changed, 70 insertions(+), 48 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/a725b2d2/bin/systemml-spark-submit.py
----------------------------------------------------------------------
diff --git a/bin/systemml-spark-submit.py b/bin/systemml-spark-submit.py
index b6426b3..b4da801 100755
--- a/bin/systemml-spark-submit.py
+++ b/bin/systemml-spark-submit.py
@@ -92,25 +92,35 @@ def spark_submit_entry(master, driver_memory, num_executors, executor_memory,
         ml_options.append(stats)
     if gpu is not None:
         ml_options.append('-gpu')
-        ml_options.append(gpu)
+        if gpu is not 'no_option':
+            ml_options.append(gpu)
 
     if len(ml_options) < 1:
         ml_options = ''
 
     # stats, explain, target_jars
     cmd_spark = [spark_path, '--class', 'org.apache.sysml.api.DMLScript',
-                 '--master', master, '--driver-memory', driver_memory,
-                 '--num-executors', num_executors, '--executor-memory', executor_memory,
-                 '--executor-cores', executor_cores, '--conf', default_conf,
+                 '--master', master,
+                 '--driver-memory', driver_memory,
+                 '--conf', default_conf,
                  '--jars', cuda_jars, systemml_jars]
 
+    if num_executors is not None:
+        cmd_spark = cmd_spark + ['--num-executors', num_executors]
+
+    if executor_memory is not None:
+        cmd_spark = cmd_spark + ['--executor-memory', executor_memory]
+
+    if executor_cores is not None:
+        cmd_spark = cmd_spark + ['--executor-cores', executor_cores]
+
     cmd_system_ml = ['-config', default_config,
                      '-exec', 'hybrid_spark', '-f', script_file, ' '.join(ml_options)]
 
     cmd = cmd_spark + cmd_system_ml
 
     # Debug
-    # print(' '.join(cmd))
+    print(' '.join(cmd))
     return_code = os.system(' '.join(cmd))
     return return_code
 
@@ -120,10 +130,10 @@ if __name__ == '__main__':
                                       description='System-ML Spark Submit Script')
     # SPARK-SUBMIT Options
     cparser.add_argument('--master', default='local[*]', help='local, yarn-client, yarn-cluster', metavar='')
-    cparser.add_argument('--driver-memory', default='5G', help='Memory for driver (e.g. 512M)', metavar='')
-    cparser.add_argument('--num-executors', default='2', help='Number of executors to launch', metavar='')
-    cparser.add_argument('--executor-memory', default='2G', help='Memory per executor', metavar='')
-    cparser.add_argument('--executor-cores', default='1', help='Number of cores', metavar='')
+    cparser.add_argument('--driver-memory', default='8G', help='Memory for driver (e.g. 512M, 1G)', metavar='')
+    cparser.add_argument('--num-executors', nargs=1, help='Number of executors to launch', metavar='')
+    cparser.add_argument('--executor-memory', nargs=1, help='Memory per executor', metavar='')
+    cparser.add_argument('--executor-cores', nargs=1, help='Number of executor cores', metavar='')
     cparser.add_argument('--conf', help='Spark configuration file', nargs='+', metavar='')
 
     # SYSTEM-ML Options
@@ -138,7 +148,7 @@ if __name__ == '__main__':
                                    metavar='')
     cparser.add_argument('-gpu', help='uses CUDA instructions when reasonable, '
                                       'set <force> option to skip conservative memory estimates '
-                                      'and use GPU wherever possible', nargs='?')
+                                      'and use GPU wherever possible', nargs='?', const='no_option')
     cparser.add_argument('-f', required=True, help='specifies dml/pydml file to execute; '
                                                    'path can be local/hdfs/gpfs', metavar='')
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/a725b2d2/scripts/perftest/python/datagen.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/datagen.py b/scripts/perftest/python/datagen.py
index 9b9edf1..6794187 100755
--- a/scripts/perftest/python/datagen.py
+++ b/scripts/perftest/python/datagen.py
@@ -25,7 +25,7 @@ from os.path import join
 from utils_misc import split_rowcol, config_writer, mat_type_check
 
 # This file contains configuration settings for data generation
-DATA_FORMAT = 'csv'
+DATA_FORMAT = 'binary'
 
 MATRIX_TYPE_DICT = {'dense': '0.9',
                     'sparse': '0.01'}

http://git-wip-us.apache.org/repos/asf/systemml/blob/a725b2d2/scripts/perftest/python/predict.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/predict.py b/scripts/perftest/python/predict.py
index 21eed6a..67467b1 100755
--- a/scripts/perftest/python/predict.py
+++ b/scripts/perftest/python/predict.py
@@ -26,7 +26,7 @@ from utils_misc import config_writer, mat_type_check
 from utils_fs import relevant_folders
 
 # Contains configuration setting for predicting
-DATA_FORMAT = 'csv'
+DATA_FORMAT = 'binary'
 
 
 def m_svm_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir):

http://git-wip-us.apache.org/repos/asf/systemml/blob/a725b2d2/scripts/perftest/python/run_perftest.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/run_perftest.py b/scripts/perftest/python/run_perftest.py
index dffb7a2..f0b272a 100755
--- a/scripts/perftest/python/run_perftest.py
+++ b/scripts/perftest/python/run_perftest.py
@@ -141,7 +141,7 @@ def algorithm_workflow(algo, exec_type, config_path, dml_file_name, action_mode,
     return exit_flag_success
 
 
-def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, config_dir, mode, temp_dir):
+def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, config_dir, mode, temp_dir, file_system_type):
     """
     This function is the entry point for performance testing
 
@@ -168,6 +168,9 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, config_dir, mo
 
     temp_dir: String
     Location to store all output files created during perf test
+
+    file_system_type: String
+
     """
     # algos to run is a list of tuples with
     # [(m-svm, binomial), (m-svm, multinomial)...]
@@ -275,6 +278,7 @@ if __name__ == '__main__':
     mat_type = ['dense', 'sparse', 'all']
     workload = ['data-gen', 'train', 'predict']
     execution_mode = ['hybrid_spark', 'singlenode']
+    file_system_type = ['hdfs', 'local']
     # Default Arguments
     default_mat_shape = ['10k_100']
 
@@ -308,7 +312,6 @@ if __name__ == '__main__':
                            'spark.driver.extraJavaOptions=\"-Xms20g -Xmn2g\"'
 
 
-
     # Argparse Module
     cparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                       description='SystemML Performance Test Script')
@@ -335,8 +338,12 @@ if __name__ == '__main__':
     cparser.add_argument('--mode', default=workload,
                          help='space separated list of types of workloads to run (available: data-gen, train, predict)',
                          metavar='', choices=workload, nargs='+')
-    # Change this to temp-dir
-    cparser.add_argument('--temp-dir', help='define the file system to work on', metavar='')
+    cparser.add_argument('--temp-dir', help='the path on the file system to place the working temporary directory at',
+                         metavar='')
+    cparser.add_argument('--file-system-type', choices=file_system_type, metavar='',
+                         help='file system for temp directory, '
+                              'supported types are \'hdfs\' for hybrid_spark and \'local\' for standalone;'
+                              'default for hybrid_spark is \'hdfs\' and for standalone is \'local\'')
 
     # Configuration Options
     cparser.add_argument('-stats', help='Monitor and report caching/recompilation statistics, '
@@ -347,7 +354,7 @@ if __name__ == '__main__':
     cparser.add_argument('-config', help='System-ML configuration file (e.g SystemML-config.xml)', metavar='')
     cparser.add_argument('-gpu', help='uses CUDA instructions when reasonable, '
                                       'set <force> option to skip conservative memory estimates '
-                                      'and use GPU wherever possible', nargs='?')
+                                      'and use GPU wherever possible', nargs='?', const='no_option')
     # Spark Configuration Option
     cparser.add_argument('--master', help='local, yarn-client, yarn-cluster', metavar='')
     cparser.add_argument('--driver-memory', help='Memory for driver (e.g. 512M)', metavar='')
@@ -371,7 +378,7 @@ if __name__ == '__main__':
     perftest_args_dict, systemml_args_dict, backend_args_dict = split_config_args(all_arg_dict)
 
     # temp_dir hdfs / local path check
-    perftest_args_dict['temp_dir'] = get_default_dir(args.temp_dir, args.exec_type, default_config_dir)
+    perftest_args_dict['temp_dir'] = get_default_dir(args.file_system_type, args.temp_dir, args.exec_type, default_config_dir)
 
     # default_mat_type validity
     if len(args.mat_type) > 2:

http://git-wip-us.apache.org/repos/asf/systemml/blob/a725b2d2/scripts/perftest/python/train.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/train.py b/scripts/perftest/python/train.py
index 4717ff7..a95950d 100755
--- a/scripts/perftest/python/train.py
+++ b/scripts/perftest/python/train.py
@@ -27,7 +27,7 @@ from functools import reduce
 from utils_fs import relevant_folders
 
 # Contains configuration setting for training
-DATA_FORMAT = 'csv'
+DATA_FORMAT = 'binary'
 
 
 def binomial_m_svm_train(save_folder_name, datagen_dir, train_dir, config_dir):

http://git-wip-us.apache.org/repos/asf/systemml/blob/a725b2d2/scripts/perftest/python/utils_misc.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/utils_misc.py b/scripts/perftest/python/utils_misc.py
index 92dbc73..87b870b 100755
--- a/scripts/perftest/python/utils_misc.py
+++ b/scripts/perftest/python/utils_misc.py
@@ -54,30 +54,28 @@ def split_config_args(args):
     perftest_args_dict['filename'] = args['filename']
     perftest_args_dict['mode'] = args['mode']
     perftest_args_dict['temp_dir'] = args['temp_dir']
+    perftest_args_dict['file_system_type'] = args['file_system_type']
 
     systemml_args_dict = {}
 
-    if 'stats' in args.keys():
-        if args['stats'] is not None:
-            systemml_args_dict['-stats'] = args['stats']
-        else:
-            systemml_args_dict['-stats'] = ''
+    if args['stats'] is not None:
+        systemml_args_dict['-stats'] = args['stats']
+    else:
+        systemml_args_dict['-stats'] = ''
 
-    if 'explain' in args.keys():
-        if args['explain'] is not None:
-            systemml_args_dict['-explain'] = args['explain']
-        else:
-            systemml_args_dict['-explain'] = ''
+    if args['explain'] is not None:
+        systemml_args_dict['-explain'] = args['explain']
+    else:
+        systemml_args_dict['-explain'] = ''
 
-    if 'config' in args.keys():
-        if args['config'] is not None:
-            systemml_args_dict['-config'] = args['config']
+    if args['config'] is not None:
+        systemml_args_dict['-config'] = args['config']
 
-    if 'gpu' in args.keys():
-        if args['gpu'] is not None:
-            systemml_args_dict['-gpu'] = args['gpu']
-        else:
+    if args['gpu'] is not None:
+        if args['gpu'] == 'no_option':
             systemml_args_dict['-gpu'] = ''
+        else:
+            systemml_args_dict['-gpu'] = args['gpu']
 
     backend_args_dict = {}
     exec_type = args['exec_type']
@@ -373,8 +371,9 @@ def mat_type_check(current_family, matrix_types, dense_algos):
     return current_type
 
 
-def get_default_dir(temp_dir, exec_mode, config_dir):
+def get_default_dir(file_system_type, temp_dir, exec_mode, config_dir):
     """
+    file_system_type: String
     temp_dir: String
     exec_mode: String
     config_dir: String
@@ -390,17 +389,23 @@ def get_default_dir(temp_dir, exec_mode, config_dir):
             return temp_dir
 
     if exec_mode == 'hybrid_spark':
-        cmd = ['hdfs', 'getconf', '-confKey', 'fs.default.name']
-        hdfs_base = subprocess_exec(' '.join(cmd), extract='hdfs_base')
+        if file_system_type == 'hdfs':
+            cmd = ['hdfs', 'getconf', '-confKey', 'fs.default.name']
+            hdfs_base = subprocess_exec(' '.join(cmd), extract='hdfs_base')
 
-        if temp_dir is None:
-            hdfs_home = join(hdfs_base, 'user', getpass.getuser())
-            check_hdfs_path(hdfs_home)
-            return hdfs_home
+            if temp_dir is None:
+                hdfs_home = join(hdfs_base, 'user', getpass.getuser())
+                check_hdfs_path(hdfs_home)
+                return hdfs_home
 
-        if temp_dir is not None:
-            if temp_dir.startswith('hdfs'):
+            if temp_dir is not None:
+                if temp_dir.startswith('hdfs'):
+                    return temp_dir
+                else:
+                    hdfs_home = join(hdfs_base, 'user', getpass.getuser(), temp_dir)
+                    return hdfs_home
+        else:
+            if temp_dir is None:
+                return config_dir
+            if temp_dir is not None:
                 return temp_dir
-            else:
-                hdfs_home = join(hdfs_base, 'user', getpass.getuser(), temp_dir)
-                return hdfs_home