You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by na...@apache.org on 2017/08/01 20:47:05 UTC
[2/2] systemml git commit: [SYSTEMML-1451] phase 2 work
[SYSTEMML-1451] phase 2 work
Completed these tasks as part of Phase 2 of Google Summer of Code '17
- Decouple systemml-spark-submit.py
- Decouple systemml-standalone.py
- Refactor perf test suite to accept args like debug, stats, config, etc.
- Add HDFS support
- Google Docs support
- Compare SystemML with previous versions
- Pylint, Comment
- Extra arguments configuration Test
- Windows Test
- Doc update
- systemml standalone comments
- systemml spark submit comments
Closes #575
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/e94374af
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/e94374af
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/e94374af
Branch: refs/heads/master
Commit: e94374afb2e6be5dc81524f9c7a5de09b9f4ba26
Parents: a2db1ad
Author: krishnakalyan3 <kr...@gmail.com>
Authored: Tue Aug 1 13:46:30 2017 -0700
Committer: Nakul Jindal <na...@gmail.com>
Committed: Tue Aug 1 13:46:30 2017 -0700
----------------------------------------------------------------------
bin/systemml-spark-submit.py | 278 +++++++--------
bin/systemml-standalone.py | 256 +++++---------
bin/utils.py | 113 ++++++
docs/python-performance-test.md | 35 +-
scripts/perftest/python/datagen.py | 141 ++++----
scripts/perftest/python/google_docs/stats.py | 113 ++++++
scripts/perftest/python/google_docs/update.py | 110 ++++++
scripts/perftest/python/predict.py | 156 ++++-----
scripts/perftest/python/run_perftest.py | 135 ++++---
scripts/perftest/python/train.py | 257 +++++++-------
scripts/perftest/python/utils.py | 390 ---------------------
scripts/perftest/python/utils_exec.py | 137 ++++++++
scripts/perftest/python/utils_fs.py | 162 +++++++++
scripts/perftest/python/utils_misc.py | 347 ++++++++++++++++++
14 files changed, 1580 insertions(+), 1050 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/bin/systemml-spark-submit.py
----------------------------------------------------------------------
diff --git a/bin/systemml-spark-submit.py b/bin/systemml-spark-submit.py
index 30974ec..b6426b3 100755
--- a/bin/systemml-spark-submit.py
+++ b/bin/systemml-spark-submit.py
@@ -21,167 +21,131 @@
# -------------------------------------------------------------
import os
-import sys
-from os.path import join, exists, abspath
-from os import environ
import glob
-import argparse
-import shutil
+from os.path import join
import platform
-
-if environ.get('SPARK_HOME') is None:
- print('SPARK_HOME not set')
- sys.exit(1)
-else:
- spark_home = environ.get('SPARK_HOME')
+import argparse
+from utils import get_env_systemml_home, get_env_spark_home, find_dml_file, log4j_path, config_path
+
+
+def default_jars(systemml_home):
+ """
+ return: String
+ Location of systemml and jcuda jars
+ """
+ build_dir = join(systemml_home, 'target')
+ lib_dir = join(build_dir, 'lib')
+ systemml_jar = build_dir + os.sep + "SystemML.jar"
+ jcuda_jars = glob.glob(lib_dir + os.sep + "jcu*.jar")
+ target_jars = ','.join(jcuda_jars)
+ return target_jars, systemml_jar
+
+
+def spark_submit_entry(master, driver_memory, num_executors, executor_memory,
+ executor_cores, conf,
+ nvargs, args, config, explain, debug, stats, gpu, f):
+ """
+ This function is responsible for the execution of arguments via
+ subprocess call in hybrid_spark mode
+ """
+
+ spark_home = get_env_spark_home()
+ systemml_home = get_env_systemml_home()
spark_path = join(spark_home, 'bin', 'spark-submit')
+ script_file = find_dml_file(systemml_home, f)
+ # Jars
+ cuda_jars, systemml_jars = default_jars(systemml_home)
-# error help print
-def print_usage_and_exit():
- print('Usage: ./systemml-spark-submit.py -f <dml-filename> [arguments]')
- sys.exit(1)
-
-cparser = argparse.ArgumentParser(description='System-ML Spark Submit Script')
-
-# SPARK-SUBMIT Options
-cparser.add_argument('--master', default='local[*]', help='local, yarn-client, yarn-cluster', metavar='')
-cparser.add_argument('--driver-memory', default='5G', help='Memory for driver (e.g. 512M)', metavar='')
-cparser.add_argument('--num-executors', default='2', help='Number of executors to launch', metavar='')
-cparser.add_argument('--executor-memory', default='2G', help='Memory per executor', metavar='')
-cparser.add_argument('--executor-cores', default='1', help='Number of cores', metavar='')
-cparser.add_argument('--conf', help='Spark configuration file', nargs='+', metavar='')
-
-# SYSTEM-ML Options
-cparser.add_argument('-nvargs', help='List of attributeName-attributeValue pairs', nargs='+', metavar='')
-cparser.add_argument('-args', help='List of positional argument values', metavar='', nargs='+')
-cparser.add_argument('-config', help='System-ML configuration file (e.g SystemML-config.xml)', metavar='')
-cparser.add_argument('-exec', default='hybrid_spark', help='System-ML backend (e.g spark, spark-hybrid)', metavar='')
-cparser.add_argument('-explain', help='explains plan levels can be hops, runtime, '
- 'recompile_hops, recompile_runtime', nargs='?', const='runtime', metavar='')
-cparser.add_argument('-debug', help='runs in debug mode', action='store_true')
-cparser.add_argument('-stats', help='Monitor and report caching/recompilation statistics, '
- 'heavy hitter <count> is 10 unless overridden', nargs='?', const='10', metavar='')
-cparser.add_argument('-gpu', help='uses CUDA instructions when reasonable, '
- 'set <force> option to skip conservative memory estimates '
- 'and use GPU wherever possible', nargs='?')
-cparser.add_argument('-f', required=True, help='specifies dml/pydml file to execute; '
- 'path can be local/hdfs/gpfs', metavar='')
-
-args = cparser.parse_args()
-
-# Optional arguments
-ml_options = []
-if args.nvargs is not None:
- ml_options.append('-nvargs')
- ml_options.append(' '.join(args.nvargs))
-if args.args is not None:
- ml_options.append('-args')
- ml_options.append(' '.join(args.args))
-if args.debug is not False:
- ml_options.append('-debug')
-if args.explain is not None:
- ml_options.append('-explain')
- ml_options.append(args.explain)
-if args.gpu is not None:
- ml_options.append('-gpu')
- ml_options.append(args.gpu)
-if args.stats is not None:
- ml_options.append('-stats')
- ml_options.append(args.stats)
-
-# Assign script file to name received from argparse module
-script_file = args.f
-
-# find the systemML root path which contains the bin folder, the script folder and the target folder
-# tolerate path with spaces
-script_dir = os.path.dirname(os.path.realpath(__file__))
-project_root_dir = os.path.dirname(script_dir)
-user_dir = os.getcwd()
-
-scripts_dir = join(project_root_dir, 'scripts')
-build_dir = join(project_root_dir, 'target')
-lib_dir = join(build_dir, 'lib')
-
-systemml_jar = build_dir + os.sep + "SystemML.jar"
-jcuda_jars = glob.glob(lib_dir + os.sep + "jcu*.jar")
-target_jars = ','.join(jcuda_jars) # Include all JCuda Jars
-
-log4j_properties_path = join(project_root_dir, 'conf', 'log4j.properties.template')
-
-build_err_msg = 'You must build the project before running this script.'
-build_dir_err_msg = 'Could not find target directory ' + build_dir + '. ' + build_err_msg
-
-# check if the project had been built and the jar files exist
-if not (exists(build_dir)):
- print(build_dir_err_msg)
- sys.exit(1)
-
-print('================================================================================')
-
-# if the present working directory is the project root or bin folder, then use the temp folder as user.dir
-if user_dir == project_root_dir or user_dir == join(project_root_dir, 'bin'):
- user_dir = join(project_root_dir, 'temp')
- print('Output dir: ' + user_dir)
-
-# if the SystemML-config.xml does not exist, create it from the template
-systemml_config_path = join(project_root_dir, 'conf', 'SystemML-config.xml')
-systemml_template_config_path = join(project_root_dir, 'conf', 'SystemML-config.xml.template')
-if not (exists(systemml_config_path)):
- shutil.copyfile(systemml_template_config_path, systemml_config_path)
- print('... created ' + systemml_config_path)
-
-# if SystemML-config.xml is provided as arguments
-if args.config is None:
- systemml_config_path_arg = systemml_config_path
-else:
- systemml_config_path_arg = args.config
-
-
-# from http://stackoverflow.com/questions/1724693/find-a-file-in-python
-def find_file(name, path):
- for root, dirs, files in os.walk(path):
- if name in files:
- return join(root, name)
- return None
-
-# if the script file path was omitted, try to complete the script path
-if not (exists(script_file)):
- script_file_name = abspath(script_file)
- script_file_found = find_file(script_file, scripts_dir)
- if script_file_found is None:
- print('Could not find DML script: ' + script_file)
- print_usage_and_exit()
+ # Log4j
+ log4j = log4j_path(systemml_home)
+ log4j_properties_path = 'spark.driver.extraJavaOptions=-Dlog4j.configuration=file:{}'.format(log4j)
+ if conf is None:
+ default_conf = log4j_properties_path
else:
- script_file = script_file_found
- print('DML Script:' + script_file)
-
-default_conf = 'spark.driver.extraJavaOptions=-Dlog4j.configuration=file:{}'.format(log4j_properties_path)
-
-# Backslash problem in windows.
-if platform.system() == 'Windows':
- default_conf = default_conf.replace('\\', '//')
-
-if args.conf is not None:
- conf = ' --conf '.join(args.conf + [default_conf])
-else:
- conf = default_conf
+ default_conf = ' --conf '.join(conf + [log4j_properties_path])
-cmd_spark = [spark_path, '--class', 'org.apache.sysml.api.DMLScript',
- '--master', args.master, '--driver-memory', args.driver_memory,
- '--num-executors', args.num_executors, '--executor-memory', args.executor_memory,
- '--executor-cores', args.executor_cores, '--conf', conf, '--jars', target_jars,
- systemml_jar]
-
-cmd_system_ml = ['-config', systemml_config_path_arg,
- '-exec', vars(args)['exec'], '-f', script_file, ' '.join(ml_options)]
-
-cmd = cmd_spark + cmd_system_ml
-
-return_code = os.system(' '.join(cmd))
-# For debugging
-# print(' '.join(cmd))
-
-if return_code != 0:
- print('Failed to run SystemML. Exit code :' + str(return_code))
- print(' '.join(cmd))
+ # Config XML
+ if config is None:
+ default_config = config_path(systemml_home)
+ else:
+ default_config = ' -config '.join([config] + [config_path(systemml_home)])
+
+ if platform.system() == 'Windows':
+ default_conf = default_conf.replace('\\', '//')
+
+ # optional arguments
+ ml_options = []
+ if nvargs is not None:
+ ml_options.append('-nvargs')
+ ml_options.append(' '.join(nvargs))
+ if args is not None:
+ ml_options.append('-args')
+ ml_options.append(' '.join(args))
+ if explain is not None:
+ ml_options.append('-explain')
+ ml_options.append(explain)
+ if debug is not False:
+ ml_options.append('-debug')
+ if stats is not None:
+ ml_options.append('-stats')
+ ml_options.append(stats)
+ if gpu is not None:
+ ml_options.append('-gpu')
+ ml_options.append(gpu)
+
+ if len(ml_options) < 1:
+ ml_options = ''
+
+ # stats, explain, target_jars
+ cmd_spark = [spark_path, '--class', 'org.apache.sysml.api.DMLScript',
+ '--master', master, '--driver-memory', driver_memory,
+ '--num-executors', num_executors, '--executor-memory', executor_memory,
+ '--executor-cores', executor_cores, '--conf', default_conf,
+ '--jars', cuda_jars, systemml_jars]
+
+ cmd_system_ml = ['-config', default_config,
+ '-exec', 'hybrid_spark', '-f', script_file, ' '.join(ml_options)]
+
+ cmd = cmd_spark + cmd_system_ml
+
+ # Debug
+ # print(' '.join(cmd))
+ return_code = os.system(' '.join(cmd))
+ return return_code
+
+
+if __name__ == '__main__':
+ cparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ description='System-ML Spark Submit Script')
+ # SPARK-SUBMIT Options
+ cparser.add_argument('--master', default='local[*]', help='local, yarn-client, yarn-cluster', metavar='')
+ cparser.add_argument('--driver-memory', default='5G', help='Memory for driver (e.g. 512M)', metavar='')
+ cparser.add_argument('--num-executors', default='2', help='Number of executors to launch', metavar='')
+ cparser.add_argument('--executor-memory', default='2G', help='Memory per executor', metavar='')
+ cparser.add_argument('--executor-cores', default='1', help='Number of cores', metavar='')
+ cparser.add_argument('--conf', help='Spark configuration file', nargs='+', metavar='')
+
+ # SYSTEM-ML Options
+ cparser.add_argument('-nvargs', help='List of attributeName-attributeValue pairs', nargs='+', metavar='')
+ cparser.add_argument('-args', help='List of positional argument values', metavar='', nargs='+')
+ cparser.add_argument('-config', help='System-ML configuration file (e.g SystemML-config.xml)', metavar='')
+ cparser.add_argument('-explain', help='explains plan levels can be hops, runtime, '
+ 'recompile_hops, recompile_runtime', nargs='?', const='runtime', metavar='')
+ cparser.add_argument('-debug', help='runs in debug mode', action='store_true')
+ cparser.add_argument('-stats', help='Monitor and report caching/recompilation statistics, '
+ 'heavy hitter <count> is 10 unless overridden', nargs='?', const='10',
+ metavar='')
+ cparser.add_argument('-gpu', help='uses CUDA instructions when reasonable, '
+ 'set <force> option to skip conservative memory estimates '
+ 'and use GPU wherever possible', nargs='?')
+ cparser.add_argument('-f', required=True, help='specifies dml/pydml file to execute; '
+ 'path can be local/hdfs/gpfs', metavar='')
+
+ args = cparser.parse_args()
+ arg_dict = vars(args)
+
+ return_code = spark_submit_entry(**arg_dict)
+
+ if return_code != 0:
+ print('Failed to run SystemML. Exit code :' + str(return_code))
http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/bin/systemml-standalone.py
----------------------------------------------------------------------
diff --git a/bin/systemml-standalone.py b/bin/systemml-standalone.py
index a0ee8db..4000e75 100755
--- a/bin/systemml-standalone.py
+++ b/bin/systemml-standalone.py
@@ -21,180 +21,106 @@
#-------------------------------------------------------------
import os
-import shutil
-import sys
-from os.path import join, exists
+from os.path import join
+import argparse
+import platform
+from utils import get_env_systemml_home, find_dml_file, log4j_path, config_path
-# error help print
-def print_usage_and_exit():
- this_script = sys.argv[0]
- print('Usage: ' + this_script + ' <dml-filename> [arguments]')
- sys.exit(1)
+def default_classpath(systemml_home):
+ """
+ Classpath information required for execution
+ return: String
+ Classpath location of build, library and hadoop directories
+ """
+ build_lib = join(systemml_home, 'target', '*')
+ lib_lib = join(systemml_home, 'target', 'lib', '*')
+ hadoop_lib = join(systemml_home, 'target', 'lib', 'hadoop', '*')
-# from http://stackoverflow.com/questions/1724693/find-a-file-in-python
-def find_file(name, path):
- for root, dirs, files in os.walk(path):
- if name in files:
- return join(root, name)
- return None
+ return build_lib, lib_lib, hadoop_lib
-if len(sys.argv) < 2:
- print('Wrong usage')
- print_usage_and_exit()
+#TODO
+# User dir, fix for SYSTEMML_1795
+def standalone_execution_entry(nvargs, args, config, explain, debug, stats, gpu, f):
+ """
+ This function is responsible for the execution of arguments via
+ subprocess call in singlenode mode
+ """
+ systemml_home = get_env_systemml_home()
+ script_file = find_dml_file(systemml_home, f)
-# find the systemML root path which contains the bin folder, the script folder and the target folder
-# tolerate path with spaces
-script_dir = os.path.dirname(os.path.realpath(__file__))
-project_root_dir = os.path.dirname(script_dir)
-user_dir = os.getcwd()
-
-scripts_dir = join(project_root_dir, 'scripts')
-build_dir = join(project_root_dir, 'target')
-lib_dir = join(build_dir, 'lib')
-dml_script_class = join(build_dir, 'classes', 'org', 'apache', 'sysml', 'api', 'DMLScript.class')
-hadoop_home = join(lib_dir, 'hadoop')
-
-
-build_err_msg = 'You must build the project before running this script.'
-build_dir_err_msg = 'Could not find target directory ' + build_dir + '. ' + build_err_msg
-
-lib_dir_err_msg = 'Could not find required libraries.' + build_err_msg
-dml_script_err_msg = 'Could not find ' + dml_script_class + '. ' + build_err_msg
-
-# check if the project had been built and the jar files exist
-if not(exists(build_dir)):
- print(build_dir_err_msg)
- sys.exit(1)
-if not(exists(lib_dir)):
- print(lib_dir_err_msg)
- sys.exit(1)
-if not(exists(dml_script_class)):
- print(dml_script_err_msg)
- sys.exit(1)
-
-print('================================================================================')
-
-
-# if the present working directory is the project root or bin folder, then use the temp folder as user.dir
-if user_dir == project_root_dir or user_dir == join(project_root_dir, 'bin'):
- user_dir = join(project_root_dir, 'temp')
- print('Output dir: ' + user_dir)
-
-# if the SystemML-config.xml does not exist, create it from the template
-systemml_config_path = join(project_root_dir, 'conf', 'SystemML-config.xml')
-systemml_template_config_path = join(project_root_dir, 'conf', 'SystemML-config.xml.template')
-if not(exists(systemml_config_path)):
- shutil.copyfile(systemml_template_config_path, systemml_config_path)
- print('... created ' + systemml_config_path)
-
-# if the log4j.properties do not exist, create them from the template
-log4j_properties_path = join(project_root_dir, 'conf', 'log4j.properties')
-log4j_template_properties_path = join(project_root_dir, 'conf', 'log4j.properties.template')
-if not(exists(log4j_properties_path)):
- shutil.copyfile(log4j_template_properties_path, log4j_properties_path)
- print('... created ' + log4j_properties_path)
+ if platform.system() == 'Windows':
+ default_cp = ';'.join(default_classpath(systemml_home))
+ else:
+ default_cp = ':'.join(default_classpath(systemml_home))
+ java_memory = '-Xmx8g -Xms4g -Xmn1g'
-script_file = sys.argv[1]
+ # Log4j
+ log4j = log4j_path(systemml_home)
+ log4j_properties_path = '-Dlog4j.configuration=file:{}'.format(log4j)
-# if the script file path was omitted, try to complete the script path
-if not(exists(script_file)):
- script_file_name = os.path.abspath(script_file)
- script_file_found = find_file(script_file, scripts_dir)
- if script_file_found is None:
- print('Could not find DML script: ' + script_file)
- print_usage_and_exit()
+ # Config
+ if config is None:
+ default_config = config_path(systemml_home)
else:
- script_file = script_file_found
- print('DML Script:' + script_file)
-
-# add libraries which were generated by the build to the classpath
-systemml_jar = join(build_dir, 'classes')
-
-
-# For the *nix and windows, os.pathsep works reliably
-# however for cygwin, the pathsep is set for *nix, which is ':'
-# but the underlying java, which is a windows program requires ';'
-# also all arguments passed to the JVM need to be converted to windows style
-# if in the cygwin environment
-lib_dir_star = join(lib_dir, '*')
-if sys.platform == 'cygwin':
- classpath_sep = os.pathsep
- classpath_sep = ';'
- lib_dir = os.popen('cygpath -pw ' + lib_dir).read().strip()
- lib_dir_star = '"' + lib_dir + "\*" + '"'
- systemml_jar = '"' + os.popen('cygpath -pw ' + systemml_jar).read().strip() + '"'
- hadoop_home = '"' + os.popen('cygpath -pw ' + hadoop_home).read().strip() + '"'
- log4j_properties_path = '"' + os.popen('cygpath -pw ' + log4j_properties_path).read().strip() + '"'
- user_dir = '"' + os.popen('cygpath -pw ' + user_dir).read().strip() + '"'
- script_file = '"' + os.popen('cygpath -pw ' + script_file).read().strip() + '"'
- systemml_config_path = '"' + os.popen('cygpath -pw ' + systemml_config_path).read().strip() + '"'
- classpath = lib_dir_star + '\\' + classpath_sep + systemml_jar
-else:
- #classpath = '"' + lib_dir_star + '"' + os.pathsep + '"' + systemml_jar + '"'
- classpath = lib_dir_star + os.pathsep + systemml_jar
-
-
-# Set the HADOOP_HOME environment variable
-if 'HADOOP_HOME' not in os.environ:
- os.environ['HADOOP_HOME'] = hadoop_home
-
-
-print('================================================================================')
-
-# Set default Java options
-systemml_default_java_opts = \
- '-Xmx8g -Xms4g -Xmn1g ' + \
- '-cp ' + classpath + ' ' + \
- '-Dlog4j.configuration=file:' + log4j_properties_path + ' ' \
- '-Duser.dir=' + user_dir
-# '-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=8111'
-
-
-# Reads in key-value pairs from the conf/systemml-env.sh file
-def parse_env_file(env_file_path):
- env_vars = {}
- with open(env_file_path) as f:
- for l in f:
- l = l.strip()
- if l and not(l.startswith('#')) and '=' in l:
- k, v = l.split('=', 1)
- k = k.strip()
- v = v.strip()
- if len(v) > 0:
- # strip quotes
- if v[0] == v[len(v) - 1] and v[0] in ['"', "'"]:
- v = v[1:-1]
-
- env_vars[k] = v
- return env_vars
-
-
-# Add any custom Java options set by the user at command line, overriding defaults as necessary.
-if 'SYSTEMML_JAVA_OPTS' in os.environ:
- systemml_java_opts = os.environ['SYSTEMML_JAVA_OPTS']
- systemml_default_java_opts = systemml_default_java_opts + ' ' + systemml_java_opts
- # del os.environ['SYSTEMML_JAVA_OPTS']
-
-# Add any custom Java options set by the user in the environment variables file,
-# overriding defaults as necessary.
-systemml_env_path = join(project_root_dir, 'conf', 'systemml-env.sh')
-if exists(systemml_env_path):
- env_vars = parse_env_file(systemml_env_path)
- os.environ.update(env_vars)
-
-
-# Invoke the jar with options and arguments
-cmd = ['java', systemml_default_java_opts, 'org.apache.sysml.api.DMLScript', '-f', script_file, '-exec singlenode', '-config', systemml_config_path] + sys.argv[2:]
-# For debugging
-# print(' '.join(cmd))
-
-return_code = os.system(' '.join(cmd))
-
-if return_code != 0:
- print('Failed to run SystemML. Exit code :' + str(return_code))
- print(' '.join(cmd))
+ default_config = config
+
+ ml_options = []
+ if nvargs is not None:
+ ml_options.append('-nvargs')
+ ml_options.append(' '.join(nvargs))
+ if args is not None:
+ ml_options.append('-args')
+ ml_options.append(' '.join(args))
+ if explain is not None:
+ ml_options.append('-explain')
+ ml_options.append(explain)
+ if debug is not False:
+ ml_options.append('-debug')
+ if stats is not None:
+ ml_options.append('-stats')
+ ml_options.append(stats)
+ if gpu is not None:
+ ml_options.append('-gpu')
+ ml_options.append(gpu)
+
+ cmd = ['java', java_memory, log4j_properties_path,
+ '-cp', default_cp, 'org.apache.sysml.api.DMLScript',
+ '-f', script_file, '-exec', 'singlenode', '-config', default_config,
+ ' '.join(ml_options)]
+
+ return_code = os.system(' '.join(cmd))
+ return return_code
+
+
+if __name__ == '__main__':
+
+ cparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ description='System-ML Standalone Script')
+
+ # SYSTEM-ML Options
+ cparser.add_argument('-nvargs', help='List of attributeName-attributeValue pairs', nargs='+', metavar='')
+ cparser.add_argument('-args', help='List of positional argument values', metavar='', nargs='+')
+ cparser.add_argument('-config', help='System-ML configuration file (e.g SystemML-config.xml)', metavar='')
+ cparser.add_argument('-explain', help='explains plan levels can be hops, runtime, '
+ 'recompile_hops, recompile_runtime', nargs='?', const='runtime', metavar='')
+ cparser.add_argument('-debug', help='runs in debug mode', action='store_true')
+ cparser.add_argument('-stats', help='Monitor and report caching/recompilation statistics, '
+ 'heavy hitter <count> is 10 unless overridden', nargs='?', const='10',
+ metavar='')
+ cparser.add_argument('-gpu', help='uses CUDA instructions when reasonable, '
+ 'set <force> option to skip conservative memory estimates '
+ 'and use GPU wherever possible', nargs='?')
+ cparser.add_argument('-f', required=True, help='specifies dml/pydml file to execute; '
+ 'path can be local/hdfs/gpfs', metavar='')
+
+ args = cparser.parse_args()
+ arg_dict = vars(args)
+ return_code = standalone_execution_entry(**arg_dict)
+
+ if return_code != 0:
+ print('Failed to run SystemML. Exit code :' + str(return_code))
http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/bin/utils.py
----------------------------------------------------------------------
diff --git a/bin/utils.py b/bin/utils.py
new file mode 100644
index 0000000..6f40881
--- /dev/null
+++ b/bin/utils.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import sys
+import os
+from os.path import join, exists
+from os import environ
+import shutil
+
+
+def get_env_systemml_home():
+ """
+ Env variable error check and path location
+
+ return: String
+ Location of SYSTEMML_HOME
+ """
+ systemml_home = os.environ.get('SYSTEMML_HOME')
+ if systemml_home is None:
+ print('SYSTEMML_HOME not found')
+ sys.exit()
+
+ return systemml_home
+
+
+def get_env_spark_home():
+ """
+ Env variable error check and path location
+
+ return: String
+ Location of SPARK_HOME
+ """
+ spark_home = environ.get('SPARK_HOME')
+ if spark_home is None:
+ print('SPARK_HOME not found')
+ sys.exit()
+
+ return spark_home
+
+
+def find_file(name, path):
+ """
+ Responsible for finding a specific file recursively given a location
+ """
+ for root, dirs, files in os.walk(path):
+ if name in files:
+ return join(root, name)
+
+
+def find_dml_file(systemml_home, script_file):
+ """
+ Find the location of DML script being executed
+
+ return: String
+ Location of the dml script
+ """
+ scripts_dir = join(systemml_home, 'scripts')
+ if not (exists(script_file)):
+ script_file = find_file(script_file, scripts_dir)
+ if script_file is None:
+ print('Could not find DML script: ' + script_file)
+ sys.exit()
+
+ return script_file
+
+
+def log4j_path(systemml_home):
+ """
+ Create log4j.properties from the template if it does not exist
+
+ return: String
+ Location of log4j.properties path
+ """
+ log4j_properties_path = join(systemml_home, 'conf', 'log4j.properties')
+ log4j_template_properties_path = join(systemml_home, 'conf', 'log4j.properties.template')
+ if not (exists(log4j_properties_path)):
+ shutil.copyfile(log4j_template_properties_path, log4j_properties_path)
+ print('... created ' + log4j_properties_path)
+ return log4j_properties_path
+
+
+def config_path(systemml_home):
+ """
+ Create SystemML-config.xml from the template if it does not exist
+
+ return: String
+ Location of SystemML-config.xml
+ """
+ systemml_config_path = join(systemml_home, 'conf', 'SystemML-config.xml')
+ systemml_template_config_path = join(systemml_home, 'conf', 'SystemML-config.xml.template')
+ if not (exists(systemml_config_path)):
+ shutil.copyfile(systemml_template_config_path, systemml_config_path)
+ print('... created ' + systemml_config_path)
+ return systemml_config_path
http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/docs/python-performance-test.md
----------------------------------------------------------------------
diff --git a/docs/python-performance-test.md b/docs/python-performance-test.md
index c265bc6..02d3e34 100644
--- a/docs/python-performance-test.md
+++ b/docs/python-performance-test.md
@@ -11,10 +11,13 @@ Our performance tests suit contains `7` families namely `binomial`, `multinomial
On a very high level use construct a string with arguments required to run each operation. Once this string is constructed we use the subprocess module to execute this string and extract time from the standard out.
-We also use `json` module write our configurations to a json file. This ensure that our current operation is easy to debug.
+We also use the `json` module to write our configurations to a json file. This ensures that our operations are easy to debug.
+We have `7` files in the performance test suite:
-We have `5` files in performance test suit `run_perftest.py`, `datagen.py`, `train.py`, `predict.py` and `utils.py`.
+- Entry File `run_perftest.py`
+- Supporting Files `datagen.py`, `train.py`, `predict.py`
+- Utility Files `utils_exec.py`, `utils_fs.py`, `utils_misc.py`
`datagen.py`, `train.py` and `predict.py` generate a dictionary. Our key is the name of algorithm being processed and values is a list with path(s) where all the data required is present. We define this dictionary as a configuration packet.
@@ -28,7 +31,7 @@ In `train.py` script we have functions required to generate training output. We
The file `predict.py` contains all functions for all algorithms in the performance test that contain predict script. We return the required configuration packet as a result of this script, that contains key as the algorithm to run and values with location to read predict json files from.
-In the file `utils.py` we have all the helper functions required in our performance test. These functions do operations like write `json` files, extract time from std out etc.
+The `utils_*.py` files contain all the helper functions required by our performance test. These functions perform operations such as writing `json` files and extracting time from standard out.
### Adding New Algorithms
While adding a new algorithm we need know if it has to be part of the any pre existing family. If this algorithm depends on a new data generation script we would need to create a new family. Steps below to take below to add a new algorithm.
@@ -75,7 +78,7 @@ Default setting for our performance test below:
- Matrix size to 10,000 rows and 100 columns.
- Execution mode `singlenode`.
- Operation modes `data-gen`, `train` and `predict` in sequence.
-- Matrix type set to `all`. Which will generate `dense` or / and `sparse` matrices for all relevant algorithms.
+- Matrix type set to `all`, which will generate `dense` and `sparse` matrices for all relevant algorithms.
### Examples
Some examples of SystemML performance test with arguments shown below:
@@ -104,6 +107,9 @@ Run performance test for the algorithms `m-svm` with `multinomial` family. Run o
`
Run performance test for all algorithms under the family `regression2` and log with filename `new_log`.
+`./scripts/perftest/python/run_perftest.py --family binomial clustering multinomial regression1 regression2 stats1 stats2 --config-dir /Users/krishna/open-source/systemml/scripts/perftest/temp3 --temp-dir hdfs://localhost:9000/temp3`
+Run performance test for all algorithms using HDFS.
+
### Operational Notes
All performance test depend mainly on two scripts for execution `systemml-standalone.py` and `systemml-spark-submit.py`. Incase we need to change standalone or spark parameters we need to manually change these parameters in their respective scripts.
@@ -117,13 +123,26 @@ multinomial|data-gen|0|dense|10k_100| 0.33
MultiLogReg|train|0|10k_100|dense|6.956
MultiLogReg|predict|0|10k_100|dense|4.780
-These logs can be found in `temp` folder (`$SYSTEMML_HOME/scripts/perftest/temp`) in-case not overridden by `--temp-dir`. This `temp` folders also contain the data generated during our performance test.
+These logs and config `json` files can be found in the `temp` folder (`$SYSTEMML_HOME/scripts/perftest/temp`) unless overridden by `--config-dir`.
+
+`--temp-dir` by default points to the local file system. We can change this to point to an HDFS path with `--temp-dir hdfs://localhost:9000/temp`, where all files generated during execution will be saved.
+
+Every time a script executes in `data-gen` mode successfully, we write a `_SUCCESS` file. If this file exists, we ensure that a re-run of the same script is not possible. Support for configuration options like `-stats`, `-explain`, `--conf` has also been added.
+
+Results obtained by our performance tests can be automatically uploaded to google docs.
+
+`./update.py --file ../temp/singlenode.out --exec-mode singlenode --auth client_json.json --tag 1.0`
+
+In the example above `--tag` can be a major/minor systemml version and `--auth` points to the `json` key required by `google docs`.
+
+Currently we only support computing the time difference between algorithms in different versions. This can be obtained by running the script below:
+`./stats.py --auth client_json.json --exec-mode singlenode --tags 1.0 2.0`
-Every time a script executes in `data-gen` mode successfully, we write a `_SUCCESS` file. If this file exists we ensures that re-run of the same script is not possible as data already exists.
+Note: Please install `gspread` (`pip install gspread`, see `https://github.com/burnash/gspread`) to use the google docs client.
### Troubleshooting
We can debug the performance test by making changes in the following locations based on
-- Please see `utils.py` function `exec_dml_and_parse_time`. In uncommenting the debug print statement in the function `exec_dml_and_parse_time`. This allows us to inspect the subprocess string being executed.
+- Please see `utils_exec.py` function `subprocess_exec`.
- Please see `run_perftest.py`. Changing the verbosity level to `0` allows us to log more information while the script runs.
-- Eyeballing the json files generated and making sure the arguments are correct.
+- Eyeballing the json files generated and making sure the configuration arguments are correct.
http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/scripts/perftest/python/datagen.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/datagen.py b/scripts/perftest/python/datagen.py
index 88a71f0..72a0627 100755
--- a/scripts/perftest/python/datagen.py
+++ b/scripts/perftest/python/datagen.py
@@ -22,7 +22,7 @@
import itertools
from os.path import join
-from utils import split_rowcol, config_writer, mat_type_check
+from utils_misc import split_rowcol, config_writer, mat_type_check
# This file contains configuration settings for data generation
DATA_FORMAT = 'csv'
@@ -33,42 +33,44 @@ MATRIX_TYPE_DICT = {'dense': '0.9',
FAMILY_NO_MATRIX_TYPE = ['clustering', 'stats1', 'stats2']
-def multinomial_datagen(matrix_dim, matrix_type, datagen_dir):
+def multinomial_datagen(matrix_dim, matrix_type, datagen_dir, config_dir):
+ path_name = '.'.join(['multinomial', matrix_type, str(matrix_dim)])
+ datagen_write = join(datagen_dir, path_name)
+ save_path = join(config_dir, path_name)
row, col = split_rowcol(matrix_dim)
- path_name = '.'.join(['multinomial', matrix_type, str(matrix_dim)])
- full_path = join(datagen_dir, path_name)
numSamples = row
numFeatures = col
sparsity = MATRIX_TYPE_DICT[matrix_type]
num_categories = '150'
intercept = '0'
- X = join(full_path, 'X.data')
- Y = join(full_path, 'Y.data')
+ X = join(datagen_write, 'X.data')
+ Y = join(datagen_write, 'Y.data')
fmt = DATA_FORMAT
config = [numSamples, numFeatures, sparsity, num_categories, intercept,
X, Y, fmt, '1']
- config_writer(full_path + '.json', config)
+ config_writer(save_path + '.json', config)
- return full_path
+ return save_path
-def binomial_datagen(matrix_dim, matrix_type, datagen_dir):
+def binomial_datagen(matrix_dim, matrix_type, datagen_dir, config_dir):
+ path_name = '.'.join(['binomial', matrix_type, str(matrix_dim)])
+ datagen_write = join(datagen_dir, path_name)
+ save_path = join(config_dir, path_name)
row, col = split_rowcol(matrix_dim)
- path_name = '.'.join(['binomial', matrix_type, str(matrix_dim)])
- full_path = join(datagen_dir, path_name)
numSamples = row
numFeatures = col
maxFeatureValue = '5'
maxWeight = '5'
- loc_weights = join(full_path, 'weight.data')
- loc_data = join(full_path, 'X.data')
- loc_labels = join(full_path, 'Y.data')
+ loc_weights = join(datagen_write, 'weight.data')
+ loc_data = join(datagen_write, 'X.data')
+ loc_labels = join(datagen_write, 'Y.data')
noise = '1'
intercept = '0'
sparsity = MATRIX_TYPE_DICT[matrix_type]
@@ -77,24 +79,25 @@ def binomial_datagen(matrix_dim, matrix_type, datagen_dir):
config = [numSamples, numFeatures, maxFeatureValue, maxWeight, loc_weights, loc_data,
loc_labels, noise, intercept, sparsity, fmt, tranform_labels]
- config_writer(full_path + '.json', config)
+ config_writer(save_path + '.json', config)
- return full_path
+ return save_path
-def regression1_datagen(matrix_dim, matrix_type, datagen_dir):
+def regression1_datagen(matrix_dim, matrix_type, datagen_dir, config_dir):
+ path_name = '.'.join(['regression1', matrix_type, str(matrix_dim)])
+ datagen_write = join(datagen_dir, path_name)
+ save_path = join(config_dir, path_name)
row, col = split_rowcol(matrix_dim)
- path_name = '.'.join(['regression1', matrix_type, str(matrix_dim)])
- full_path = join(datagen_dir, path_name)
numSamples = row
numFeatures = col
maxFeatureValue = '5'
maxWeight = '5'
- loc_weights = join(full_path, 'weight.data')
- loc_data = join(full_path, 'X.data')
- loc_labels = join(full_path, 'Y.data')
+ loc_weights = join(datagen_write, 'weight.data')
+ loc_data = join(datagen_write, 'X.data')
+ loc_labels = join(datagen_write, 'Y.data')
noise = '1'
intercept = '0'
sparsity = MATRIX_TYPE_DICT[matrix_type]
@@ -103,24 +106,25 @@ def regression1_datagen(matrix_dim, matrix_type, datagen_dir):
config = [numSamples, numFeatures, maxFeatureValue, maxWeight, loc_weights, loc_data,
loc_labels, noise, intercept, sparsity, fmt, tranform_labels]
- config_writer(full_path + '.json', config)
+ config_writer(save_path + '.json', config)
- return full_path
+ return save_path
-def regression2_datagen(matrix_dim, matrix_type, datagen_dir):
+def regression2_datagen(matrix_dim, matrix_type, datagen_dir, config_dir):
+ path_name = '.'.join(['regression2', matrix_type, str(matrix_dim)])
+ datagen_write = join(datagen_dir, path_name)
+ save_path = join(config_dir, path_name)
row, col = split_rowcol(matrix_dim)
- path_name = '.'.join(['regression2', matrix_type, str(matrix_dim)])
- full_path = join(datagen_dir, path_name)
numSamples = row
numFeatures = col
maxFeatureValue = '5'
maxWeight = '5'
- loc_weights = join(full_path, 'weight.data')
- loc_data = join(full_path, 'X.data')
- loc_labels = join(full_path, 'Y.data')
+ loc_weights = join(datagen_write, 'weight.data')
+ loc_data = join(datagen_write, 'X.data')
+ loc_labels = join(datagen_write, 'Y.data')
noise = '1'
intercept = '0'
sparsity = MATRIX_TYPE_DICT[matrix_type]
@@ -129,21 +133,22 @@ def regression2_datagen(matrix_dim, matrix_type, datagen_dir):
config = [numSamples, numFeatures, maxFeatureValue, maxWeight, loc_weights, loc_data,
loc_labels, noise, intercept, sparsity, fmt, tranform_labels]
- config_writer(full_path + '.json', config)
+ config_writer(save_path + '.json', config)
- return full_path
+ return save_path
-def clustering_datagen(matrix_dim, matrix_type, datagen_dir):
+def clustering_datagen(matrix_dim, matrix_type, datagen_dir, config_dir):
- row, col = split_rowcol(matrix_dim)
path_name = '.'.join(['clustering', matrix_type, str(matrix_dim)])
+ datagen_write = join(datagen_dir, path_name)
+ save_path = join(config_dir, path_name)
+ row, col = split_rowcol(matrix_dim)
- full_path = join(datagen_dir, path_name)
- X = join(full_path, 'X.data')
- Y = join(full_path, 'Y.data')
- YbyC = join(full_path, 'YbyC.data')
- C = join(full_path, 'C.data')
+ X = join(datagen_write, 'X.data')
+ Y = join(datagen_write, 'Y.data')
+ YbyC = join(datagen_write, 'YbyC.data')
+ C = join(datagen_write, 'C.data')
nc = '50'
dc = '10.0'
dr = '1.0'
@@ -153,22 +158,24 @@ def clustering_datagen(matrix_dim, matrix_type, datagen_dir):
config = dict(nr=row, nf=col, nc=nc, dc=dc, dr=dr, fbf=fbf, cbf=cbf, X=X, C=C, Y=Y,
YbyC=YbyC, fmt=DATA_FORMAT)
- config_writer(full_path + '.json', config)
- return full_path
+ config_writer(save_path + '.json', config)
+ return save_path
-def stats1_datagen(matrix_dim, matrix_type, datagen_dir):
+def stats1_datagen(matrix_dim, matrix_type, datagen_dir, config_dir):
- row, col = split_rowcol(matrix_dim)
path_name = '.'.join(['stats1', matrix_type, str(matrix_dim)])
- full_path = join(datagen_dir, path_name)
-
- DATA = join(full_path, 'X.data')
- TYPES = join(full_path, 'types')
- TYPES1 = join(full_path, 'set1.types')
- TYPES2 = join(full_path, 'set2.types')
- INDEX1 = join(full_path, 'set1.indices')
- INDEX2 = join(full_path, 'set2.indices')
+ datagen_write = join(datagen_dir, path_name)
+ save_path = join(config_dir, path_name)
+
+ row, col = split_rowcol(matrix_dim)
+
+ DATA = join(datagen_write, 'X.data')
+ TYPES = join(datagen_write, 'types')
+ TYPES1 = join(datagen_write, 'set1.types')
+ TYPES2 = join(datagen_write, 'set2.types')
+ INDEX1 = join(datagen_write, 'set1.indices')
+ INDEX2 = join(datagen_write, 'set2.indices')
MAXDOMAIN = '1100'
SETSIZE = '20'
LABELSETSIZE = '10'
@@ -184,30 +191,31 @@ def stats1_datagen(matrix_dim, matrix_type, datagen_dir):
LABELSETSIZE=LABELSETSIZE, TYPES1=TYPES1, TYPES2=TYPES2, INDEX1=INDEX1,
INDEX2=INDEX2, fmt=DATA_FORMAT)
- config_writer(full_path + '.json', config)
+ config_writer(save_path + '.json', config)
- return full_path
+ return save_path
-def stats2_datagen(matrix_dim, matrix_type, datagen_dir):
+def stats2_datagen(matrix_dim, matrix_type, datagen_dir, config_dir):
- row, col = split_rowcol(matrix_dim)
path_name = '.'.join(['stats2', matrix_type, str(matrix_dim)])
- full_path = join(datagen_dir, path_name)
+ datagen_write = join(datagen_dir, path_name)
+ save_path = join(config_dir, path_name)
+ row, col = split_rowcol(matrix_dim)
- D = join(full_path, 'X.data')
- Xcid = join(full_path, 'Xcid.data')
- Ycid = join(full_path, 'Ycid.data')
- A = join(full_path, 'A.data')
+ D = join(datagen_write, 'X.data')
+ Xcid = join(datagen_write, 'Xcid.data')
+ Ycid = join(datagen_write, 'Ycid.data')
+ A = join(datagen_write, 'A.data')
config = dict(nr=row, nf=col, D=D, Xcid=Xcid, Ycid=Ycid,
A=A, fmt=DATA_FORMAT)
- config_writer(full_path + '.json', config)
- return full_path
+ config_writer(save_path + '.json', config)
+ return save_path
-def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir, dense_algos):
+def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir, dense_algos, config_dir):
"""
This function has two responsibilities. Generate the configuration files for
datagen algorithms and return a dictionary that will be used for execution.
@@ -228,13 +236,14 @@ def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir,
dense_algos: List
Algorithms that support only dense matrix type
+ config_dir: String
+ Location to store the configuration json files
+
return: Dictionary {string: list}
This dictionary contains algorithms to be executed as keys and the path of configuration
json files to be executed list of values.
"""
-
config_bundle = {}
-
distinct_families = set(map(lambda x: x[1], algo_payload))
# Cross Product of all configurations
@@ -249,7 +258,7 @@ def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir,
config_packets[current_family] = []
for size, type in configs:
family_func = current_family.lower() + '_datagen'
- conf_path = globals()[family_func](size, type, datagen_dir)
+ conf_path = globals()[family_func](size, type, datagen_dir, config_dir)
config_packets[current_family].append(conf_path)
return config_packets
http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/scripts/perftest/python/google_docs/stats.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/google_docs/stats.py b/scripts/perftest/python/google_docs/stats.py
new file mode 100755
index 0000000..3b89abe
--- /dev/null
+++ b/scripts/perftest/python/google_docs/stats.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import argparse
+from functools import reduce
+import pprint
+from oauth2client.service_account import ServiceAccountCredentials
+import gspread
+
+# Get time difference between different runs
+
+
+def auth(path, sheet_name):
+ """
+ Responsible for authorization
+ """
+ scope = ['https://spreadsheets.google.com/feeds']
+ creds = ServiceAccountCredentials.from_json_keyfile_name(path, scope)
+ gc = gspread.authorize(creds)
+ sheet = gc.open("Perf").worksheet(sheet_name)
+ return sheet
+
+
+def get_data(sheet, tag):
+ """
+ Get time and algorithm from the sheet
+ """
+ time = sheet.find('time_{}'.format(tag))
+ algo = sheet.find('algo_{}'.format(tag))
+
+ time_col = sheet.col_values(time.col)
+ time_col = list(filter(lambda x: len(x) > 0, time_col))
+
+ algo_col = sheet.col_values(algo.col)
+ algo_col = list(filter(lambda x: len(x) > 0, algo_col))
+ return algo_col, time_col
+
+
+def get_data_dict(data_col):
+ """
+ Return data as a dictionary with the algorithm as key and a list of time values as value
+ """
+ data_dict = {}
+ all_algo = []
+ for algo, _ in data_col:
+ all_algo.append(algo)
+
+ flatten_algo = reduce(lambda x, y: x+y, all_algo)
+
+ # remove the header
+ filter_data = list(filter(lambda x: not x.startswith('algo_'), flatten_algo))
+ distict_algos = set(filter_data)
+
+ for algo_dist in distict_algos:
+ for algo, time in data_col:
+ for k, v in zip(algo, time):
+ if algo_dist == k:
+ if algo_dist not in data_dict:
+ data_dict[k] = [v]
+ else:
+ data_dict[k].append(v)
+ return data_dict
+
+# Example Usage
+# ./stats.py --auth client_json.json --exec-mode singlenode --tags 1.0 2.0
+if __name__ == '__main__':
+ execution_mode = ['hybrid_spark', 'singlenode']
+
+ cparser = argparse.ArgumentParser(description='System-ML Statistics Script')
+ cparser.add_argument('--auth', help='Location to read auth file',
+ required=True, metavar='')
+ cparser.add_argument('--exec-mode', help='Execution mode', choices=execution_mode,
+ required=True, metavar='')
+ cparser.add_argument('--tags', help='Tagging header value',
+ required=True, nargs='+')
+
+ args = cparser.parse_args()
+ arg_dict = vars(args)
+ sheet = auth(args.auth, args.exec_mode)
+ all_data = sheet.get_all_records()
+
+ data_col = []
+ for tag in args.tags:
+ algo_col, time_col = get_data(sheet, tag)
+ data_col.append((algo_col, time_col))
+
+ data_dict = get_data_dict(data_col)
+
+ delta_algo = {}
+ for k, v in data_dict.items():
+ delta = float(v[0]) - float(v[1])
+ delta_algo[k] = delta
+
+ pprint.pprint(delta_algo, width=1)
http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/scripts/perftest/python/google_docs/update.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/google_docs/update.py b/scripts/perftest/python/google_docs/update.py
new file mode 100755
index 0000000..c2fed38
--- /dev/null
+++ b/scripts/perftest/python/google_docs/update.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import argparse
+import gspread
+from oauth2client.service_account import ServiceAccountCredentials
+import pandas as pd
+
+# Update data to google sheets
+
+
+def parse_data(file_path):
+ """
+ Skip reading 1st row : Header
+ Skip reading last row : Footer
+ """
+ csv_file = pd.read_csv(file_path, sep=',', skiprows=1, skipfooter=1, engine='python')
+ algo = csv_file['INFO:root:algorithm'].apply(lambda x: x.split(':')[-1])
+ key = algo + '_'+ csv_file['run_type'] + '_' + csv_file['intercept'] + '_' + \
+ csv_file['matrix_type'] + '_' + csv_file['data_shape']
+ return key, csv_file['time_sec']
+
+
+def auth(path, sheet_name):
+ """
+ Responsible for authorization
+ """
+ scope = ['https://spreadsheets.google.com/feeds']
+ creds = ServiceAccountCredentials.from_json_keyfile_name(path, scope)
+ gc = gspread.authorize(creds)
+ sheet = gc.open("Perf").worksheet(sheet_name)
+ return sheet
+
+
+def insert_pair(algo, time, start_col, tag):
+ """
+ Wrapper function that calls insert_values to insert algo and time
+ """
+ insert_values(sheet, algo, start_col, 'algo_{}'.format(tag))
+ insert_values(sheet, time, start_col + 1, 'time_{}'.format(tag))
+ print('Writing Complete')
+
+
+def insert_values(sheet, key, col_num, header):
+ """
+ Insert data to google sheets based on the arguments
+ """
+ # Col Name
+ sheet.update_cell(1, col_num, header)
+ for id, val in enumerate(key):
+ sheet.update_cell(id + 2, col_num, val)
+
+
+def get_dim(sheet):
+ """
+ Get the dimensions of data
+ """
+ try:
+ col_count = sheet.get_all_records()
+ except:
+ col_count = [[]]
+ row = len(col_count)
+ col = len(col_count[0])
+ return row, col
+
+
+# Example Usage
+# ./update.py --file ../temp/test.out --exec-mode singlenode --auth client_json.json --tag 3.0
+if __name__ == '__main__':
+ execution_mode = ['hybrid_spark', 'singlenode']
+
+ cparser = argparse.ArgumentParser(description='System-ML Update / Stat Script')
+ cparser.add_argument('--file', help='Location of the current perf test outputs',
+ required=True, metavar='')
+ cparser.add_argument('--exec-mode', help='Backend Type', choices=execution_mode,
+ required=True, metavar='')
+ cparser.add_argument('--auth', help='Location to read auth file',
+ required=True, metavar='')
+ cparser.add_argument('--tag', help='Tagging header value',
+ required=True, metavar='')
+
+ args = cparser.parse_args()
+ arg_dict = vars(args)
+
+ # Authenticate and get sheet dimensions
+ sheet = auth(args.auth, args.exec_mode)
+ row, col = get_dim(sheet)
+
+ # Read data from file and write to google docs
+ algo, time = parse_data(args.file)
+ insert_pair(algo, time, col + 1, args.tag)
http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/scripts/perftest/python/predict.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/predict.py b/scripts/perftest/python/predict.py
index 92d3af4..d2e44e6 100755
--- a/scripts/perftest/python/predict.py
+++ b/scripts/perftest/python/predict.py
@@ -22,47 +22,42 @@
import sys
from os.path import join
-from utils import config_writer, relevant_folders, mat_type_check
+from utils_misc import config_writer, mat_type_check
+from utils_fs import relevant_folders
# Contains configuration setting for predicting
DATA_FORMAT = 'csv'
-def m_svm_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+def m_svm_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir):
+ save_path = join(config_dir, save_folder_name)
X = join(datagen_dir, 'X_test.data')
Y = join(datagen_dir, 'Y_test.data')
-
- icpt = save_file_name.split('.')[-1]
+ icpt = save_folder_name.split('.')[-1]
model = join(train_dir, 'model.data')
- fmt = DATA_FORMAT
-
- config = dict(X=X, Y=Y, icpt=icpt, model=model, fmt=fmt)
-
- full_path_predict = join(predict_dir, save_file_name)
- config_writer(full_path_predict + '.json', config)
+ config = dict(X=X, Y=Y, icpt=icpt, model=model, fmt=DATA_FORMAT)
+ config_writer(save_path + '.json', config)
- return full_path_predict
+ return save_path
-def l2_svm_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+def l2_svm_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir):
+ save_path = join(config_dir, save_folder_name)
X = join(datagen_dir, 'X_test.data')
Y = join(datagen_dir, 'Y_test.data')
-
- icpt = save_file_name.split('.')[-1]
+ icpt = save_folder_name.split('.')[-1]
model = join(train_dir, 'model.data')
- fmt = DATA_FORMAT
-
- config = dict(X=X, Y=Y, icpt=icpt, model=model, fmt=fmt)
+ config = dict(X=X, Y=Y, icpt=icpt, model=model, fmt=DATA_FORMAT)
+ config_writer(save_path + '.json', config)
- full_path_predict = join(predict_dir, save_file_name)
- config_writer(full_path_predict + '.json', config)
+ return save_path
- return full_path_predict
+def multilogreg_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir):
+ save_path = join(config_dir, save_folder_name)
-def multilogreg_predict(save_file_name, datagen_dir, train_dir, predict_dir):
X = join(datagen_dir, 'X_test.data')
Y = join(datagen_dir, 'Y_test.data')
B = join(train_dir, 'B.data')
@@ -70,156 +65,147 @@ def multilogreg_predict(save_file_name, datagen_dir, train_dir, predict_dir):
dfam = '3'
vpow = '-1'
link = '2'
- fmt = DATA_FORMAT
- config = dict(dfam=dfam, vpow=vpow, link=link, fmt=fmt, X=X, B=B, Y=Y, M=M)
+ config = dict(dfam=dfam, vpow=vpow, link=link, fmt=DATA_FORMAT, X=X, B=B, Y=Y, M=M)
- full_path_predict = join(predict_dir, save_file_name)
- config_writer(full_path_predict + '.json', config)
+ config_writer(save_path + '.json', config)
- return full_path_predict
+ return save_path
-def naive_bayes_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+def naive_bayes_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir):
+ save_path = join(config_dir, save_folder_name)
X = join(datagen_dir, 'X_test.data')
Y = join(datagen_dir, 'Y_test.data')
-
prior = join(train_dir, 'prior')
conditionals = join(train_dir, 'conditionals')
- fmt = DATA_FORMAT
probabilities = join(train_dir, 'probabilities')
- config = dict(X=X, Y=Y, prior=prior, conditionals=conditionals, fmt=fmt, probabilities=probabilities)
+ config = dict(X=X, Y=Y, prior=prior, conditionals=conditionals, fmt=DATA_FORMAT,
+ probabilities=probabilities)
+ config_writer(save_path + '.json', config)
- full_path_predict = join(predict_dir, save_file_name)
- config_writer(full_path_predict + '.json', config)
+ return save_path
- return full_path_predict
+def kmeans_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir):
-def kmeans_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+ save_path = join(config_dir, save_folder_name)
+ predict_write = join(predict_dir, save_folder_name)
X = join(datagen_dir, 'X_test.data')
C = join(datagen_dir, 'C.data')
- full_path_predict = join(predict_dir, save_file_name)
- prY = join(full_path_predict, 'prY.data')
+ prY = join(predict_write, 'prY.data')
config = dict(X=X, C=C, prY=prY)
- config_writer(full_path_predict + '.json', config)
+ config_writer(save_path + '.json', config)
- return full_path_predict
+ return save_path
-def linearregcg_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+def linearregcg_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir):
+
+ save_path = join(config_dir, save_folder_name)
+ predict_write = join(predict_dir, save_folder_name)
dfam = '1'
link = '1'
vpow = '0.0'
lpow = '1.0'
-
X = join(datagen_dir, 'X_test.data')
B = join(train_dir, 'B.data')
Y = join(datagen_dir, 'Y_test.data')
-
- full_path_predict = join(predict_dir, save_file_name)
- M = join(full_path_predict, 'M.data')
- O = join(full_path_predict, 'O.data')
-
+ M = join(predict_write, 'M.data')
+ O = join(predict_write, 'O.data')
config = dict(dfam=dfam, link=link, vpow=vpow, lpow=lpow, fmt=DATA_FORMAT, X=X,
B=B, Y=Y, M=M, O=O)
- config_writer(full_path_predict + '.json', config)
+ config_writer(save_path + '.json', config)
- return full_path_predict
+ return save_path
-def linearregds_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+def linearregds_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir):
+ save_path = join(config_dir, save_folder_name)
+ predict_write = join(predict_dir, save_folder_name)
dfam = '1'
link = '1'
vpow = '0.0'
lpow = '1.0'
-
X = join(datagen_dir, 'X_test.data')
B = join(train_dir, 'B.data')
Y = join(datagen_dir, 'Y_test.data')
-
- full_path_predict = join(predict_dir, save_file_name)
- M = join(full_path_predict, 'M.data')
- O = join(full_path_predict, 'O.data')
-
+ M = join(predict_write, 'M.data')
+ O = join(predict_write, 'O.data')
config = dict(dfam=dfam, link=link, vpow=vpow, lpow=lpow, fmt=DATA_FORMAT, X=X,
B=B, Y=Y, M=M, O=O)
- config_writer(full_path_predict + '.json', config)
+ config_writer(save_path + '.json', config)
- return full_path_predict
+ return save_path
-def glm_poisson_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+def glm_poisson_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir):
+ save_path = join(config_dir, save_folder_name)
+ predict_write = join(predict_dir, save_folder_name)
dfam = '1'
link = '1'
vpow = '1'
lpow = '1.0'
-
X = join(datagen_dir, 'X_test.data')
B = join(train_dir, 'B.data')
Y = join(datagen_dir, 'Y_test.data')
-
- full_path_predict = join(predict_dir, save_file_name)
- M = join(full_path_predict, 'M.data')
- O = join(full_path_predict, 'O.data')
+ M = join(predict_write, 'M.data')
+ O = join(predict_write, 'O.data')
config = dict(dfam=dfam, link=link, vpow=vpow, lpow=lpow, fmt=DATA_FORMAT, X=X,
B=B, Y=Y, M=M, O=O)
- config_writer(full_path_predict + '.json', config)
+ config_writer(save_path + '.json', config)
- return full_path_predict
+ return save_path
-def glm_binomial_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+def glm_binomial_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir):
+ save_path = join(config_dir, save_folder_name)
+ predict_write = join(predict_dir, save_folder_name)
dfam = '2'
link = '3'
-
X = join(datagen_dir, 'X_test.data')
B = join(train_dir, 'B.data')
Y = join(datagen_dir, 'Y_test.data')
-
- full_path_predict = join(predict_dir, save_file_name)
- M = join(full_path_predict, 'M.data')
- O = join(full_path_predict, 'O.data')
+ M = join(predict_write, 'M.data')
+ O = join(predict_write, 'O.data')
config = dict(dfam=dfam, link=link, fmt=DATA_FORMAT, X=X,
B=B, Y=Y, M=M, O=O)
- config_writer(full_path_predict + '.json', config)
+ config_writer(save_path + '.json', config)
- return full_path_predict
+ return save_path
-def glm_gamma_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+def glm_gamma_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir):
+ save_path = join(config_dir, save_folder_name)
+ predict_write = join(predict_dir, save_folder_name)
dfam = '1'
link = '1'
vpow = '2'
lpow = '0'
-
X = join(datagen_dir, 'X_test.data')
B = join(train_dir, 'B.data')
Y = join(datagen_dir, 'Y_test.data')
-
- full_path_predict = join(predict_dir, save_file_name)
- M = join(full_path_predict, 'M.data')
- O = join(full_path_predict, 'O.data')
-
+ M = join(predict_write, 'M.data')
+ O = join(predict_write, 'O.data')
config = dict(dfam=dfam, link=link, vpow=vpow, lpow=lpow, fmt=DATA_FORMAT, X=X,
B=B, Y=Y, M=M, O=O)
- config_writer(full_path_predict + '.json', config)
+ config_writer(save_path + '.json', config)
- return full_path_predict
+ return save_path
-def config_packets_predict(algo_payload, matrix_type, matrix_shape, datagen_dir, train_dir, predict_dir, dense_algos):
+def config_packets_predict(algo_payload, matrix_type, matrix_shape, datagen_dir, train_dir, predict_dir, dense_algos, config_dir):
"""
This function has two responsibilities. Generate the configuration files for
prediction algorithms and return a dictionary that will be used for execution.
@@ -246,6 +232,9 @@ def config_packets_predict(algo_payload, matrix_type, matrix_shape, datagen_dir,
dense_algos: List
Algorithms that support only dense matrix type
+ config_dir: String
+ Location to store the configuration json file
+
return: Dictionary {string: list}
This dictionary contains algorithms to be executed as keys and the path of configuration
json files to be executed list of values.
@@ -267,6 +256,7 @@ def config_packets_predict(algo_payload, matrix_type, matrix_shape, datagen_dir,
for current_train_folder in train_folders:
current_data_gen_dir = relevant_folders(datagen_dir, current_algo, current_family,
current_matrix_type, matrix_shape, 'data-gen')
+
if len(current_data_gen_dir) == 0:
print('data-gen folders not present for {}'.format(current_family))
sys.exit()
@@ -276,7 +266,7 @@ def config_packets_predict(algo_payload, matrix_type, matrix_shape, datagen_dir,
# current_data_gen_dir has index 0 as we would expect one datagen for each algorithm
conf_path = globals()[algo_func](save_name, current_data_gen_dir[0],
- current_train_folder, predict_dir)
+ current_train_folder, predict_dir, config_dir)
config_bundle[current_algo].append(conf_path)
http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/scripts/perftest/python/run_perftest.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/run_perftest.py b/scripts/perftest/python/run_perftest.py
index 3360285..a15d7e6 100755
--- a/scripts/perftest/python/run_perftest.py
+++ b/scripts/perftest/python/run_perftest.py
@@ -31,9 +31,10 @@ from datetime import datetime
from datagen import config_packets_datagen
from train import config_packets_train
from predict import config_packets_predict
-from utils import get_families, config_reader, create_dir, get_existence, \
- exec_dml_and_parse_time, exec_test_data, check_predict, get_folder_metrics
-
+from utils_misc import get_families, config_reader, \
+ exec_dml_and_parse_time, exec_test_data, check_predict, get_folder_metrics, args_dict_split, \
+ get_config_args
+from utils_fs import create_dir_local, write_success, check_SUCCESS_file_exists
# A packet is a dictionary
# with key as the algorithm
@@ -83,32 +84,35 @@ ML_PREDICT = {'Kmeans': 'Kmeans-predict',
DENSE_TYPE_ALGOS = ['clustering', 'stats1', 'stats2']
+sup_args_dict = {}
+
# Responsible for execution and metric logging
-def algorithm_workflow(algo, exec_type, config_path, dml_file_name, action_mode):
+def algorithm_workflow(algo, exec_type, config_path, dml_file_name, action_mode, current_dir):
"""
This function is responsible for overall workflow. This does the following actions
Check if the input is key value argument or list of positional args
Execution and time
Logging Metrics
-
- algo : String
+ algo: String
Input algorithm specified
- exec_type : String
+ exec_type: String
Contains the execution type singlenode / hybrid_spark
- config_path : String
+ config_path: String
Path to read the json file from
- dml_file_name : String
+ dml_file_name: String
DML file name to be used while processing the arguments given
- action_mode : String
+ action_mode: String
Type of action data-gen, train ...
- """
+ current_dir: String
+ Current location of hdfs / local temp being processed
+ """
config_data = config_reader(config_path + '.json')
if isinstance(config_data, dict):
@@ -122,26 +126,24 @@ def algorithm_workflow(algo, exec_type, config_path, dml_file_name, action_mode)
config_file_name = config_path.split('/')[-1]
mat_type, mat_shape, intercept = get_folder_metrics(config_file_name, action_mode)
- exit_flag_success = get_existence(config_path, action_mode)
+ temp_cwd = join(current_dir, config_file_name)
+
+ # temp_dir_exist
+ exit_flag_success = check_SUCCESS_file_exists(temp_cwd)
if exit_flag_success:
- print('data already exists {}'.format(config_path))
time = 'data_exists'
else:
- time = exec_dml_and_parse_time(exec_type, dml_file_name, config_file_name, args)
-
- # Write a _SUCCESS file only if time is found and in data-gen action_mode
- if len(time.split('.')) == 2 and action_mode == 'data-gen':
- full_path = join(config_path, '_SUCCESS')
- open(full_path, 'w').close()
+ time = exec_dml_and_parse_time(exec_type, dml_file_name, args, spark_args_dict, sup_args_dict)
+ write_success(time, temp_cwd)
print('{},{},{},{},{},{}'.format(algo, action_mode, intercept, mat_type, mat_shape, time))
current_metrics = [algo, action_mode, intercept, mat_type, mat_shape, time]
logging.info(','.join(current_metrics))
+ return exit_flag_success
-# Perf test entry point
-def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode):
+def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, config_dir, mode, temp_dir):
"""
This function is the entry point for performance testing
@@ -160,13 +162,15 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode
mat_shape: List
Dimensions of the input matrix with rows and columns
- temp_dir: String
- Location to store all files created during perf test
+ config_dir: String
+ Location to store all configuration
mode: List
Type of workload to run. data-gen, train ...
- """
+ temp_dir: String
+ Location to store all output files created during perf test
+ """
# algos to run is a list of tuples with
# [(m-svm, binomial), (m-svm, multinomial)...]
# Basic block for execution of scripts
@@ -202,48 +206,64 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode
algos_to_run.append((current_algo, current_family))
if 'data-gen' in mode:
+ # Create config directories
+ data_gen_config_dir = join(config_dir, 'data-gen')
+ create_dir_local(data_gen_config_dir)
+
+ # Create output path
data_gen_dir = join(temp_dir, 'data-gen')
- create_dir(data_gen_dir)
conf_packet = config_packets_datagen(algos_to_run, mat_type, mat_shape, data_gen_dir,
- DENSE_TYPE_ALGOS)
+ DENSE_TYPE_ALGOS, data_gen_config_dir)
+
for family_name, config_folders in conf_packet.items():
for config in config_folders:
file_name = ML_GENDATA[family_name]
- algorithm_workflow(family_name, exec_type, config, file_name, 'data-gen')
+ success_file = algorithm_workflow(family_name, exec_type, config, file_name, 'data-gen', data_gen_dir)
# Statistic families do not need to be split
if family_name not in ['stats1', 'stats2']:
- exec_test_data(exec_type, config)
+ if not success_file:
+ exec_test_data(exec_type, spark_args_dict, sup_args_dict, data_gen_dir, config)
if 'train' in mode:
+ # Create config directories
+ train_config_dir = join(config_dir, 'train')
+ create_dir_local(train_config_dir)
+
+ # Create output path
data_gen_dir = join(temp_dir, 'data-gen')
train_dir = join(temp_dir, 'train')
- create_dir(train_dir)
+
conf_packet = config_packets_train(algos_to_run, mat_type, mat_shape, data_gen_dir,
- train_dir, DENSE_TYPE_ALGOS)
+ train_dir, DENSE_TYPE_ALGOS, train_config_dir)
for algo_name, config_files in conf_packet.items():
for config in config_files:
file_name = ML_TRAIN[algo_name]
- algorithm_workflow(algo_name, exec_type, config, file_name, 'train')
+ algorithm_workflow(algo_name, exec_type, config, file_name, 'train', train_dir)
if 'predict' in mode:
+ # Create config directories
+ predict_config_dir = join(config_dir, 'predict')
+ create_dir_local(predict_config_dir)
+
+ # Create output path
data_gen_dir = join(temp_dir, 'data-gen')
train_dir = join(temp_dir, 'train')
predict_dir = join(temp_dir, 'predict')
- create_dir(predict_dir)
- algos_to_run_perdict = list(filter(lambda algo: check_predict(algo[0], ML_PREDICT), algos_to_run))
- if len(algos_to_run_perdict) < 1:
+
+ algos_to_run = list(filter(lambda algo: check_predict(algo[0], ML_PREDICT), algos_to_run))
+ if len(algos_to_run) < 1:
# No algorithms with predict found
pass
- conf_packet = config_packets_predict(algos_to_run_perdict, mat_type, mat_shape, data_gen_dir,
- train_dir, predict_dir, DENSE_TYPE_ALGOS)
-
+ conf_packet = config_packets_predict(algos_to_run, mat_type, mat_shape, data_gen_dir,
+ train_dir, predict_dir, DENSE_TYPE_ALGOS,
+ predict_config_dir)
for algo_name, config_files in conf_packet.items():
for config in config_files:
file_name = ML_PREDICT[algo_name]
- algorithm_workflow(algo_name, exec_type, config, file_name, 'predict')
+ algorithm_workflow(algo_name, exec_type, config, file_name, 'predict', predict_dir)
-if __name__ == '__main__':
+if __name__ == '__main__':
# sys ml env set and error handling
systemml_home = os.environ.get('SYSTEMML_HOME')
if systemml_home is None:
@@ -259,7 +279,6 @@ if __name__ == '__main__':
# Default temp directory, contains everything generated in perftest
default_temp_dir = join(systemml_home, 'scripts', 'perftest', 'temp')
- create_dir(default_temp_dir)
# Initialize time
start_time = time.time()
@@ -274,7 +293,8 @@ if __name__ == '__main__':
all_families = ML_ALGO.keys()
# Argparse Module
- cparser = argparse.ArgumentParser(description='SystemML Performance Test Script')
+ cparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+ description='SystemML Performance Test Script')
cparser.add_argument('--family', help='space separated list of classes of algorithms '
'(available : ' + ', '.join(sorted(all_families)) + ')',
metavar='', choices=all_families, nargs='+')
@@ -290,17 +310,43 @@ if __name__ == '__main__':
nargs='+')
cparser.add_argument('--mat-shape', default=default_mat_shape, help='space separated list of shapes of matrices '
'to generate (e.g 10k_1k, 20M_4k)', metavar='', nargs='+')
- cparser.add_argument('--temp-dir', default=default_temp_dir, help='temporary directory '
+
+ cparser.add_argument('--config-dir', default=default_temp_dir, help='temporary directory '
'where generated, training and prediction data is put', metavar='')
cparser.add_argument('--filename', default='perf_test', help='name of the output file for the perf'
' metrics', metavar='')
cparser.add_argument('--mode', default=workload,
help='space separated list of types of workloads to run (available: data-gen, train, predict)',
metavar='', choices=workload, nargs='+')
+ # Change this to temp-dir
+ cparser.add_argument('--temp-dir', default=default_temp_dir,
+ help='define the file system to work on', metavar='')
+
+ # Configuration Options
+ cparser.add_argument('-stats', help='Monitor and report caching/recompilation statistics, '
+ 'heavy hitter <count> is 10 unless overridden', nargs='?', const='10',
+ metavar='')
+ cparser.add_argument('-explain', help='explains plan levels can be hops, runtime, '
+ 'recompile_hops, recompile_runtime', nargs='?', const='runtime', metavar='')
+ cparser.add_argument('-config', help='System-ML configuration file (e.g SystemML-config.xml)', metavar='')
+
+ # Spark Configuration Option
+ cparser.add_argument('--master', help='local, yarn-client, yarn-cluster', metavar='')
+ cparser.add_argument('--driver-memory', help='Memory for driver (e.g. 512M)', metavar='')
+ cparser.add_argument('--num-executors', help='Number of executors to launch', metavar='')
+ cparser.add_argument('--executor-memory', help='Memory per executor', metavar='')
+ cparser.add_argument('--executor-cores', help='Number of cores', metavar='')
+ cparser.add_argument('--conf', help='Spark configuration file', nargs='+', metavar='')
# Args is a namespace
args = cparser.parse_args()
- arg_dict = vars(args)
+ all_arg_dict = vars(args)
+ arg_dict, config_dict, spark_dict = args_dict_split(all_arg_dict)
+
+ create_dir_local(args.config_dir)
+
+ # Global variables
+ sup_args_dict, spark_args_dict = get_config_args(config_dict, spark_dict, args.exec_type)
# Debug arguments
# print(arg_dict)
@@ -344,13 +390,12 @@ if __name__ == '__main__':
# Set level to 0 -> debug mode
# Set level to 20 -> Plain metrics
log_filename = args.filename + '_' + args.exec_type + '.out'
- logging.basicConfig(filename=join(default_temp_dir, log_filename), level=20)
+ logging.basicConfig(filename=join(args.config_dir, log_filename), level=20)
logging.info('New performance test started at {}'.format(time_now))
logging.info('algorithm,run_type,intercept,matrix_type,data_shape,time_sec')
# Remove filename item from dictionary as its already used to create the log above
del arg_dict['filename']
-
perf_test_entry(**arg_dict)
total_time = (time.time() - start_time)