You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by na...@apache.org on 2017/07/02 07:01:09 UTC
systemml git commit: [SYSTEMML-1451][GSoC Phase 1] Single script to
run perf tests
Repository: systemml
Updated Branches:
refs/heads/master 31952e47d -> e7cfcadc9
[SYSTEMML-1451][GSoC Phase 1] Single script to run perf tests
- Single entry point to run perf tests in any combination of algorithms,
families, matrix shapes & densities
- Reports time taken by a single perf test by parsing the output and
grep-ing for the time
- Detects tests that did not run and reports in the generated log
- Robust error handling and reporting, informative help message
Closes #537
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/e7cfcadc
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/e7cfcadc
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/e7cfcadc
Branch: refs/heads/master
Commit: e7cfcadc9b0e72637c67c8d6a6dcc62f62ba5177
Parents: 31952e4
Author: krishnakalyan3 <kr...@gmail.com>
Authored: Sun Jul 2 00:00:49 2017 -0700
Committer: Nakul Jindal <na...@gmail.com>
Committed: Sun Jul 2 00:00:49 2017 -0700
----------------------------------------------------------------------
scripts/perftest/python/datagen.py | 252 ++++++++++++++++
scripts/perftest/python/predict.py | 285 +++++++++++++++++++
scripts/perftest/python/run_perftest.py | 339 ++++++++++++++++++++++
scripts/perftest/python/train.py | 411 +++++++++++++++++++++++++++
scripts/perftest/python/utils.py | 296 +++++++++++++++++++
5 files changed, 1583 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/e7cfcadc/scripts/perftest/python/datagen.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/datagen.py b/scripts/perftest/python/datagen.py
new file mode 100755
index 0000000..d9c49e9
--- /dev/null
+++ b/scripts/perftest/python/datagen.py
@@ -0,0 +1,252 @@
+#!/usr/bin/env python3
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+import itertools
+from os.path import join
+from utils import split_rowcol, config_writer
+
# This file contains configuration settings for data generation
DATA_FORMAT = 'csv'

# Matrix density used for each matrix type when generating data.
MATRIX_TYPE_DICT = {'dense': '0.9',
                    'sparse': '0.01'}

# Families whose datagen configs are always generated dense
# (see config_packets_datagen below).
FAMILY_NO_MATRIX_TYPE = ['clustering', 'stats1', 'stats2']
+
+
def multinomial_datagen(matrix_dim, matrix_type, datagen_dir):
    """Write the datagen config for the 'multinomial' family.

    Creates <datagen_dir>/multinomial.<matrix_type>.<matrix_dim>.json and
    returns the matching base path (without the '.json' suffix).
    """
    rows, cols = split_rowcol(matrix_dim)
    base = join(datagen_dir, '.'.join(['multinomial', matrix_type, str(matrix_dim)]))

    # Positional arguments for the datagen script; order is significant.
    config = [rows,                            # numSamples
              cols,                            # numFeatures
              MATRIX_TYPE_DICT[matrix_type],   # sparsity
              '150',                           # number of categories
              '0',                             # intercept
              join(base, 'X.data'),
              join(base, 'Y.data'),
              DATA_FORMAT,
              '1']
    config_writer(base + '.json', config)

    return base
+
+
def binomial_datagen(matrix_dim, matrix_type, datagen_dir):
    """Write the datagen config for the 'binomial' family; return its base path."""
    rows, cols = split_rowcol(matrix_dim)
    base = join(datagen_dir, '.'.join(['binomial', matrix_type, str(matrix_dim)]))

    # Positional arguments for the datagen script; order is significant.
    config = [rows,                            # numSamples
              cols,                            # numFeatures
              '5',                             # maxFeatureValue
              '5',                             # maxWeight
              join(base, 'weight.data'),
              join(base, 'X.data'),
              join(base, 'Y.data'),
              '1',                             # noise
              '0',                             # intercept
              MATRIX_TYPE_DICT[matrix_type],   # sparsity
              DATA_FORMAT,
              '1']                             # transform labels
    config_writer(base + '.json', config)

    return base
+
+
def regression1_datagen(matrix_dim, matrix_type, datagen_dir):
    """Write the datagen config for the 'regression1' family; return its base path.

    Uses the same generator settings as binomial_datagen (shared datagen script).
    """
    rows, cols = split_rowcol(matrix_dim)
    base = join(datagen_dir, '.'.join(['regression1', matrix_type, str(matrix_dim)]))

    # Positional arguments for the datagen script; order is significant.
    config = [rows,                            # numSamples
              cols,                            # numFeatures
              '5',                             # maxFeatureValue
              '5',                             # maxWeight
              join(base, 'weight.data'),
              join(base, 'X.data'),
              join(base, 'Y.data'),
              '1',                             # noise
              '0',                             # intercept
              MATRIX_TYPE_DICT[matrix_type],   # sparsity
              DATA_FORMAT,
              '1']                             # transform labels
    config_writer(base + '.json', config)

    return base
+
+
def regression2_datagen(matrix_dim, matrix_type, datagen_dir):
    """Write the datagen config for the 'regression2' family; return its base path.

    Uses the same generator settings as binomial_datagen (shared datagen script).
    """
    rows, cols = split_rowcol(matrix_dim)
    base = join(datagen_dir, '.'.join(['regression2', matrix_type, str(matrix_dim)]))

    # Positional arguments for the datagen script; order is significant.
    config = [rows,                            # numSamples
              cols,                            # numFeatures
              '5',                             # maxFeatureValue
              '5',                             # maxWeight
              join(base, 'weight.data'),
              join(base, 'X.data'),
              join(base, 'Y.data'),
              '1',                             # noise
              '0',                             # intercept
              MATRIX_TYPE_DICT[matrix_type],   # sparsity
              DATA_FORMAT,
              '1']                             # transform labels
    config_writer(base + '.json', config)

    return base
+
+
def clustering_datagen(matrix_dim, matrix_type, datagen_dir):
    """Write the datagen config for the 'clustering' family; return its base path."""
    rows, cols = split_rowcol(matrix_dim)
    base = join(datagen_dir, '.'.join(['clustering', matrix_type, str(matrix_dim)]))

    # Named arguments for the Kmeans datagen script.
    config = {'nr': rows, 'nf': cols,
              'nc': '50',              # number of clusters
              'dc': '10.0',
              'dr': '1.0',
              'fbf': '100.0',
              'cbf': '100.0',
              'X': join(base, 'X.data'),
              'C': join(base, 'C.data'),
              'Y': join(base, 'Y.data'),
              'YbyC': join(base, 'YbyC.data'),
              'fmt': DATA_FORMAT}
    config_writer(base + '.json', config)

    return base
+
+
def stats1_datagen(matrix_dim, matrix_type, datagen_dir):
    """Write the datagen config for the 'stats1' family; return its base path."""
    rows, cols = split_rowcol(matrix_dim)
    base = join(datagen_dir, '.'.join(['stats1', matrix_type, str(matrix_dim)]))

    # NC should be less than C and more than num0, where num0 = NC/2,
    # i.e. num0 < NC < C; half the column count satisfies that.
    half_cols = int(int(cols) / 2)

    config = dict(R=rows, C=cols, NC=half_cols,
                  MAXDOMAIN='1100',
                  DATA=join(base, 'X.data'),
                  TYPES=join(base, 'types'),
                  SETSIZE='20',
                  LABELSETSIZE='10',
                  TYPES1=join(base, 'set1.types'),
                  TYPES2=join(base, 'set2.types'),
                  INDEX1=join(base, 'set1.indices'),
                  INDEX2=join(base, 'set2.indices'),
                  fmt=DATA_FORMAT)
    config_writer(base + '.json', config)

    return base
+
+
def stats2_datagen(matrix_dim, matrix_type, datagen_dir):
    """Write the datagen config for the 'stats2' family; return its base path."""
    rows, cols = split_rowcol(matrix_dim)
    base = join(datagen_dir, '.'.join(['stats2', matrix_type, str(matrix_dim)]))

    config = dict(nr=rows, nf=cols,
                  D=join(base, 'X.data'),
                  Xcid=join(base, 'Xcid.data'),
                  Ycid=join(base, 'Ycid.data'),
                  A=join(base, 'A.data'),
                  fmt=DATA_FORMAT)
    config_writer(base + '.json', config)

    return base
+
+
def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir):
    """
    This function has two responsibilities. Generate the configuration files for
    datagen algorithms and return a dictionary that will be used for execution.

    algo_payload : List of tuples
    The first tuple index contains algorithm name and the second index contains
    family type.

    matrix_type: String
    Type of matrix to generate e.g dense or sparse

    matrix_shape: String
    Shape of matrix to generate e.g 100k_10

    return: Dictionary {string: list}
    This dictionary contains algorithms to be executed as keys and the path of configuration
    json files to be executed list of values.
    """

    config_bundle = {}

    distinct_families = set(map(lambda x: x[1], algo_payload))

    # Cross product of all configurations
    for current_family in distinct_families:
        if current_family in FAMILY_NO_MATRIX_TYPE:
            # These families are always generated dense, regardless of matrix_type.
            config_bundle[current_family] = list(itertools.product(matrix_shape, ['dense']))
        else:
            # e.g. clustering : [(10k_1, dense), (10k_2, dense), ...]
            config_bundle[current_family] = list(itertools.product(matrix_shape, matrix_type))

    config_packets = {}
    for current_family, configs in config_bundle.items():
        config_packets[current_family] = []
        # FIX: loop variable renamed from `type`, which shadowed the builtin.
        for size, mat_type in configs:
            # Dispatch to the family-specific <family>_datagen function above.
            family_func = current_family.lower() + '_datagen'
            conf_path = globals()[family_func](size, mat_type, datagen_dir)
            config_packets[current_family].append(conf_path)

    return config_packets
http://git-wip-us.apache.org/repos/asf/systemml/blob/e7cfcadc/scripts/perftest/python/predict.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/predict.py b/scripts/perftest/python/predict.py
new file mode 100755
index 0000000..bc034da
--- /dev/null
+++ b/scripts/perftest/python/predict.py
@@ -0,0 +1,285 @@
+#!/usr/bin/env python3
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+import sys
+import os
+from os.path import join
+import glob
+from utils import create_dir, config_writer
+
+# Contains configuration setting for predicting
+DATA_FORMAT = 'csv'
+
+
def m_svm_predict(save_file_name, datagen_dir, train_dir, predict_dir):
    """Write the m-svm predict config and return its base path (no '.json')."""
    # The trailing component of the save name encodes the intercept setting.
    intercept = save_file_name.split('.')[-1]

    config = dict(X=join(datagen_dir, 'X_test.data'),
                  Y=join(datagen_dir, 'Y_test.data'),
                  icpt=intercept,
                  model=join(train_dir, 'model.data'),
                  fmt=DATA_FORMAT)

    base = join(predict_dir, save_file_name)
    config_writer(base + '.json', config)

    return base
+
+
def l2_svm_predict(save_file_name, datagen_dir, train_dir, predict_dir):
    """Write the l2-svm predict config and return its base path (no '.json')."""
    # The trailing component of the save name encodes the intercept setting.
    intercept = save_file_name.split('.')[-1]

    config = dict(X=join(datagen_dir, 'X_test.data'),
                  Y=join(datagen_dir, 'Y_test.data'),
                  icpt=intercept,
                  model=join(train_dir, 'model.data'),
                  fmt=DATA_FORMAT)

    base = join(predict_dir, save_file_name)
    config_writer(base + '.json', config)

    return base
+
+
def multilogreg_predict(save_file_name, datagen_dir, train_dir, predict_dir):
    """Write the MultiLogReg (GLM-predict) config and return its base path."""
    config = dict(dfam='3',     # distribution family
                  vpow='-1',
                  link='2',
                  fmt=DATA_FORMAT,
                  X=join(datagen_dir, 'X_test.data'),
                  B=join(train_dir, 'B.data'),
                  Y=join(datagen_dir, 'Y_test.data'),
                  M=join(train_dir, 'M.data'))

    base = join(predict_dir, save_file_name)
    config_writer(base + '.json', config)

    return base
+
+
def naive_bayes_predict(save_file_name, datagen_dir, train_dir, predict_dir):
    """Write the naive-bayes predict config and return its base path."""
    config = dict(X=join(datagen_dir, 'X_test.data'),
                  Y=join(datagen_dir, 'Y_test.data'),
                  prior=join(train_dir, 'prior'),
                  conditionals=join(train_dir, 'conditionals'),
                  fmt=DATA_FORMAT,
                  probabilities=join(train_dir, 'probabilities'))

    base = join(predict_dir, save_file_name)
    config_writer(base + '.json', config)

    return base
+
+
def kmeans_predict(save_file_name, datagen_dir, train_dir, predict_dir):
    """Write the Kmeans predict config and return its base path."""
    base = join(predict_dir, save_file_name)

    config = dict(X=join(datagen_dir, 'X_test.data'),
                  C=join(datagen_dir, 'C.data'),
                  prY=join(base, 'prY.data'))
    config_writer(base + '.json', config)

    return base
+
+
def linearregcg_predict(save_file_name, datagen_dir, train_dir, predict_dir):
    """Write the LinearRegCG (GLM-predict) config and return its base path."""
    base = join(predict_dir, save_file_name)

    config = dict(dfam='1',
                  link='1',
                  vpow='0.0',
                  lpow='1.0',
                  fmt=DATA_FORMAT,
                  X=join(datagen_dir, 'X_test.data'),
                  B=join(train_dir, 'B.data'),
                  Y=join(datagen_dir, 'Y_test.data'),
                  M=join(base, 'M.data'),
                  O=join(base, 'O.data'))
    config_writer(base + '.json', config)

    return base
+
+
def linearregds_predict(save_file_name, datagen_dir, train_dir, predict_dir):
    """Write the LinearRegDS (GLM-predict) config and return its base path."""
    base = join(predict_dir, save_file_name)

    config = dict(dfam='1',
                  link='1',
                  vpow='0.0',
                  lpow='1.0',
                  fmt=DATA_FORMAT,
                  X=join(datagen_dir, 'X_test.data'),
                  B=join(train_dir, 'B.data'),
                  Y=join(datagen_dir, 'Y_test.data'),
                  M=join(base, 'M.data'),
                  O=join(base, 'O.data'))
    config_writer(base + '.json', config)

    return base
+
+
def glm_poisson_predict(save_file_name, datagen_dir, train_dir, predict_dir):
    """Write the GLM_poisson (GLM-predict) config and return its base path."""
    base = join(predict_dir, save_file_name)

    config = dict(dfam='1',
                  link='1',
                  vpow='1',
                  lpow='1.0',
                  fmt=DATA_FORMAT,
                  X=join(datagen_dir, 'X_test.data'),
                  B=join(train_dir, 'B.data'),
                  Y=join(datagen_dir, 'Y_test.data'),
                  M=join(base, 'M.data'),
                  O=join(base, 'O.data'))
    config_writer(base + '.json', config)

    return base
+
+
def glm_binomial_predict(save_file_name, datagen_dir, train_dir, predict_dir):
    """Write the GLM_binomial (GLM-predict) config and return its base path."""
    base = join(predict_dir, save_file_name)

    # No vpow/lpow for the binomial family.
    config = dict(dfam='2',
                  link='3',
                  fmt=DATA_FORMAT,
                  X=join(datagen_dir, 'X_test.data'),
                  B=join(train_dir, 'B.data'),
                  Y=join(datagen_dir, 'Y_test.data'),
                  M=join(base, 'M.data'),
                  O=join(base, 'O.data'))
    config_writer(base + '.json', config)

    return base
+
+
def glm_gamma_predict(save_file_name, datagen_dir, train_dir, predict_dir):
    """Write the GLM_gamma (GLM-predict) config and return its base path."""
    base = join(predict_dir, save_file_name)

    config = dict(dfam='1',
                  link='1',
                  vpow='2',
                  lpow='0',
                  fmt=DATA_FORMAT,
                  X=join(datagen_dir, 'X_test.data'),
                  B=join(train_dir, 'B.data'),
                  Y=join(datagen_dir, 'Y_test.data'),
                  M=join(base, 'M.data'),
                  O=join(base, 'O.data'))
    config_writer(base + '.json', config)

    return base
+
+
def config_packets_predict(algo_payload, datagen_dir, train_dir, predict_dir):
    """
    This function has two responsibilities. Generate the configuration files for
    prediction algorithms and return a dictionary that will be used for execution.

    algo_payload : List of tuples
    The first tuple index contains algorithm name and the second index contains
    family type.

    datagen_dir: String
    Path of the data generation directory

    train_dir: String
    Path of the training directory

    predict_dir: String
    Path of the prediction directory

    return: Dictionary {string: list}
    This dictionary contains algorithms to be executed as keys and the path of configuration
    json files to be executed list of values.
    """

    algo_payload_distinct = set(map(lambda x: x[0], algo_payload))

    # Seed an empty list per algorithm (family entry of the tuple is unused here).
    config_bundle = {}
    for algo_name, _ in algo_payload:
        config_bundle[algo_name] = []

    for current_algo in algo_payload_distinct:
        # Get all train folders related to the algorithm
        train_path = join(train_dir, current_algo)
        train_subdir = glob.glob(train_path + "*")
        train_folders = list(filter(lambda x: os.path.isdir(x), train_subdir))

        if len(train_folders) == 0:
            print('training folders not present for {}'.format(current_algo))
            sys.exit()

        for current_train_folder in train_folders:
            save_name = current_train_folder.split('/')[-1]
            # Get all datagen folders: strip algo prefix and intercept suffix
            # from the train folder name to recover the datagen folder name.
            data_gen_folder_name = '.'.join(save_name.split('.')[1:-1])
            data_gen_path = join(datagen_dir, data_gen_folder_name)
            data_gen_subdir = glob.glob(data_gen_path + "*")
            data_gen_folder = list(filter(lambda x: os.path.isdir(x), data_gen_subdir))

            if len(data_gen_folder) == 0:
                # BUG FIX: previously formatted with `current_family`, which is
                # undefined in this scope and raised a NameError on this path.
                print('data-gen folders not present for {}'.format(current_algo))
                sys.exit()

            # Ideally we will have more than one datagen directory to be found
            current_data_gen_dir = list(data_gen_folder)[0]

            # Dispatch to the matching <algo>_predict function above.
            algo_func = '_'.join([current_algo.lower().replace('-', '_')] + ['predict'])
            conf_path = globals()[algo_func](save_name, current_data_gen_dir,
                                             current_train_folder, predict_dir)

            config_bundle[current_algo].append(conf_path)

    return config_bundle
http://git-wip-us.apache.org/repos/asf/systemml/blob/e7cfcadc/scripts/perftest/python/run_perftest.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/run_perftest.py b/scripts/perftest/python/run_perftest.py
new file mode 100755
index 0000000..1421c2c
--- /dev/null
+++ b/scripts/perftest/python/run_perftest.py
@@ -0,0 +1,339 @@
+#!/usr/bin/env python3
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import sys
+import time
+import argparse
+from functools import reduce
+import os
+from os.path import join
+from utils import get_families, config_reader, create_dir, get_existence, \
+ exec_dml_and_parse_time, exec_test_data, check_predict, get_folder_metrics
+import logging
+from datetime import datetime
+from datagen import config_packets_datagen
+from train import config_packets_train
+from predict import config_packets_predict
+
+# A packet is a dictionary
+# with key as the algorithm
+# value as the list with configuration json files
+
+
# Family -> algorithms belonging to that family.
ML_ALGO = {'binomial': ['MultiLogReg', 'l2-svm', 'm-svm'],
           'clustering': ['Kmeans'],
           'multinomial': ['naive-bayes', 'MultiLogReg', 'm-svm'],
           'regression1': ['LinearRegDS', 'LinearRegCG'],
           'regression2': ['GLM_poisson', 'GLM_gamma', 'GLM_binomial'],
           'stats1': ['Univar-Stats', 'bivar-stats'],
           'stats2': ['stratstats']}

# Family -> data generation DML script name.
ML_GENDATA = {'binomial': 'genRandData4LogisticRegression',
              'clustering': 'genRandData4Kmeans',
              'multinomial': 'genRandData4Multinomial',
              'regression1': 'genRandData4LogisticRegression',
              'regression2': 'genRandData4LogisticRegression',
              'stats1': 'genRandData4DescriptiveStats',
              'stats2': 'genRandData4StratStats'}

# Algorithm -> training DML script name.
ML_TRAIN = {'GLM_poisson': 'GLM',
            'GLM_gamma': 'GLM',
            'GLM_binomial': 'GLM',
            'LinearRegCG': 'LinearRegCG',
            'LinearRegDS': 'LinearRegDS',
            'stratstats': 'stratstats',
            'Univar-Stats': 'Univar-Stats',
            'bivar-stats': 'bivar-stats',
            'Kmeans': 'Kmeans',
            'm-svm': 'm-svm',
            'l2-svm': 'l2-svm',
            'MultiLogReg': 'MultiLogReg',
            'naive-bayes': 'naive-bayes'}

# Algorithm -> prediction DML script name; algorithms not listed here are
# filtered out of the predict workload (see check_predict usage below).
ML_PREDICT = {'Kmeans': 'Kmeans-predict',
              'LinearRegCG': 'GLM-predict',
              'LinearRegDS': 'GLM-predict',
              'm-svm': 'm-svm-predict',
              'l2-svm': 'l2-svm-predict',
              'MultiLogReg': 'GLM-predict',
              'naive-bayes': 'naive-bayes-predict',
              'GLM_poisson': 'GLM-predict',
              'GLM_gamma': 'GLM-predict',
              'GLM_binomial': 'GLM-predict'}
+
+
# Responsible for execution and metric logging
def algorithm_workflow(algo, exec_type, config_path, file_name, action_mode):
    """
    This function is responsible for overall workflow. This does the following actions
    Check if the input is key value argument or list of positional args
    Execution and time
    Logging Metrics

    algo : String
    Input algorithm specified

    exec_type : String
    Contains the execution type singlenode / hybrid_spark

    config_path : String
    Path to read the json file from

    file_name : String
    DML file name to be used while processing the arguments give

    action_mode : String
    Type of action data-gen, train ...
    """

    config_data = config_reader(config_path + '.json')

    # A dict config maps to '-nvargs' (key=value pairs); a list config maps
    # to positional '-args'.
    if isinstance(config_data, dict):
        dict_args = ' '.join([str(key) + '=' + str(val) for key, val in config_data.items()])
        args = {'-nvargs': dict_args}

    if isinstance(config_data, list):
        list_args = ' '.join(config_data)
        args = {'-args': list_args}

    folder_name = config_path.split('/')[-1]
    mat_type, mat_shape, intercept = get_folder_metrics(folder_name, action_mode)

    exit_flag_success = get_existence(config_path, action_mode)

    if exit_flag_success:
        print('data already exists {}'.format(config_path))
        # FIX: renamed local from `time`, which shadowed the imported time module.
        time_sec = 'data_exists'
    else:
        time_sec = exec_dml_and_parse_time(exec_type, file_name, args)

    # Write a _SUCCESS file only if time is found (i.e. looks like 'X.Y')
    # and we are in the data-gen action_mode
    if len(time_sec.split('.')) == 2 and action_mode == 'data-gen':
        full_path = join(config_path, '_SUCCESS')
        open(full_path, 'w').close()

    print('{},{},{},{},{},{}'.format(algo, action_mode, intercept, mat_type, mat_shape, time_sec))
    current_metrics = [algo, action_mode, intercept, mat_type, mat_shape, time_sec]
    logging.info(','.join(current_metrics))
+
+
# Perf test entry point
def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode):
    """
    This function is the entry point for performance testing

    family: List
    A family may contain one or more algorithm based on data generation script used

    algo: List
    Input algorithms

    exec_type: String
    Contains the execution type singlenode / hybrid_spark

    mat_type: List
    Type of matrix to generate dense or sparse

    mat_shape: List
    Dimensions of the input matrix with rows and columns

    temp_dir: String
    Location to store all files created during perf test

    mode: List
    Type of workload to run. data-gen, train ...
    """

    # algos to run is a list of tuples with
    # [(m-svm, binomial), (m-svm, multinomial)...]
    # Basic block for execution of scripts
    algos_to_run = []

    # Sections below build algos_to_run in our performance test
    # Handles algorithms like m-svm and MultiLogReg which have multiple
    # data generation scripts (dual datagen)
    # --family is taken into consideration only when there are multiple datagen for an algo

    if family is not None and algo is not None:
        for current_algo in algo:
            family_list = get_families(current_algo, ML_ALGO)
            if len(family_list) == 1:
                algos_to_run.append((current_algo, family_list[0]))
            else:
                intersection = set(family).intersection(family_list)
                for valid_family in intersection:
                    algos_to_run.append((current_algo, valid_family))

    # When the user inputs just algorithms to run
    elif algo is not None:
        for current_algo in algo:
            family_list = get_families(current_algo, ML_ALGO)
            for f in family_list:
                algos_to_run.append((current_algo, f))

    # When the user just specifies only families to run
    elif family is not None:
        for current_family in family:
            algos = ML_ALGO[current_family]
            for current_algo in algos:
                algos_to_run.append((current_algo, current_family))

    if 'data-gen' in mode:
        data_gen_dir = join(temp_dir, 'data-gen')
        create_dir(data_gen_dir)
        conf_packet = config_packets_datagen(algos_to_run, mat_type, mat_shape, data_gen_dir)
        for family_name, config_folders in conf_packet.items():
            for config in config_folders:
                file_name = ML_GENDATA[family_name]
                algorithm_workflow(family_name, exec_type, config, file_name, 'data-gen')

                # Statistic family do not require to be split
                if family_name not in ['stats1', 'stats2']:
                    exec_test_data(exec_type, config)

    if 'train' in mode:
        data_gen_dir = join(temp_dir, 'data-gen')
        train_dir = join(temp_dir, 'train')
        create_dir(train_dir)
        conf_packet = config_packets_train(algos_to_run, data_gen_dir, train_dir)
        for algo_name, config_files in conf_packet.items():
            for config in config_files:
                file_name = ML_TRAIN[algo_name]
                algorithm_workflow(algo_name, exec_type, config, file_name, 'train')

    if 'predict' in mode:
        data_gen_dir = join(temp_dir, 'data-gen')
        train_dir = join(temp_dir, 'train')
        predict_dir = join(temp_dir, 'predict')
        create_dir(predict_dir)
        # Keep only algorithms that have a predict script.
        # FIX: renamed misspelled local ('perdict'), renamed the lambda parameter
        # that shadowed the `algo` argument, and removed the dead
        # `if len(...) < 0: pass` check (len() can never be negative).
        algos_to_run_predict = list(filter(lambda a: check_predict(a[0], ML_PREDICT), algos_to_run))
        conf_packet = config_packets_predict(algos_to_run_predict, data_gen_dir, train_dir, predict_dir)
        for algo_name, config_files in conf_packet.items():
            for config in config_files:
                file_name = ML_PREDICT[algo_name]
                algorithm_workflow(algo_name, exec_type, config, file_name, 'predict')
+
if __name__ == '__main__':

    # sys ml env set and error handling
    systemml_home = os.environ.get('SYSTEMML_HOME')
    if systemml_home is None:
        print('SYSTEMML_HOME not found')
        sys.exit()

    # Default Arguments
    default_mat_type = ['dense', 'sparse']
    default_workload = ['data-gen', 'train', 'predict']
    default_mat_shape = ['10k_100']
    default_execution_mode = ['hybrid_spark', 'singlenode']

    # Default temp directory, contains everything generated in perftest
    default_temp_dir = join(systemml_home, 'scripts', 'perftest', 'temp')
    create_dir(default_temp_dir)

    # Initialize time
    start_time = time.time()

    # Default Date Time
    time_now = str(datetime.now())

    # Remove duplicates algorithms and used as default inputs
    all_algos = set(reduce(lambda x, y: x + y, ML_ALGO.values()))

    # Argparse Module
    cparser = argparse.ArgumentParser(description='SystemML Performance Test Script')
    cparser.add_argument('--family', help='specify class of algorithms (e.g regression, binomial)',
                         metavar='', choices=ML_ALGO.keys(), nargs='+')
    cparser.add_argument('--algo', help='specify the type of algorithm to run (Overrides --family)', metavar='',
                         choices=all_algos, nargs='+')

    # FIX: help text said 'spark-hybrid' but the accepted choice is 'hybrid_spark'.
    cparser.add_argument('--exec-type', default='singlenode', help='System-ML backend '
                         '(e.g singlenode, hybrid_spark)', metavar='',
                         choices=default_execution_mode)
    cparser.add_argument('--mat-type', default=default_mat_type, help='type of matrix to generate '
                         '(e.g dense or sparse)', metavar='', choices=default_mat_type,
                         nargs='+')
    cparser.add_argument('--mat-shape', default=default_mat_shape, help='shape of matrix '
                         'to generate (e.g 10k_1k)', metavar='', nargs='+')
    cparser.add_argument('--temp-dir', default=default_temp_dir, help='specify temporary directory',
                         metavar='')
    cparser.add_argument('--filename', default='perf_test', help='specify output file for the perf'
                         ' metics', metavar='')
    cparser.add_argument('--mode', default=default_workload,
                         help='specify type of workload to run (e.g data-gen, train, predict)',
                         metavar='', choices=default_workload, nargs='+')

    # Args is a namespace
    args = cparser.parse_args()
    arg_dict = vars(args)

    # Check for validity of input arguments
    if args.family is not None:
        for fam in args.family:
            if fam not in ML_ALGO.keys():
                print('{} family not present in the performance test suit'.format(fam))
                sys.exit()

    if args.algo is not None:
        for algo in args.algo:
            if algo not in all_algos:
                print('{} algorithm not present in the performance test suit'.format(args.algo))
                sys.exit()

        # This section checks the validity of dual datagen algorithms like m-svm.
        # BUG FIX: this loop previously ran unconditionally and raised a
        # TypeError (iterating None) when only --family was supplied; it is now
        # guarded by the `args.algo is not None` check above.
        algo_families = {}
        for current_algo in args.algo:
            algo_families[current_algo] = get_families(current_algo, ML_ALGO)

            if len(algo_families[current_algo]) > 1:
                if args.family is None:
                    print('family should be present for {}'.format(current_algo))
                    sys.exit()

                valid_families = set(algo_families[current_algo])
                input_families = set(args.family)
                common_families = input_families.intersection(valid_families)
                if len(common_families) == 0:
                    print('Please specify a valid family for {} and the '
                          'valid families are {}'.format(current_algo, ' '.join(valid_families)))
                    sys.exit()

    # Set level to 0 -> debug mode
    # Set level to 20 -> Plain metrics
    log_filename = args.filename + '_' + args.exec_type + '.out'
    logging.basicConfig(filename=join(default_temp_dir, log_filename), level=20)
    logging.info('New performance test started at {}'.format(time_now))
    logging.info('algorithm,run_type,intercept,matrix_type,data_shape,time_sec')

    # Remove filename item from dictionary as its already used to create the log above
    del arg_dict['filename']

    perf_test_entry(**arg_dict)

    total_time = (time.time() - start_time)
    logging.info('Performance tests complete {0:.3f} secs \n'.format(total_time))
http://git-wip-us.apache.org/repos/asf/systemml/blob/e7cfcadc/scripts/perftest/python/train.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/train.py b/scripts/perftest/python/train.py
new file mode 100755
index 0000000..1ab2880
--- /dev/null
+++ b/scripts/perftest/python/train.py
@@ -0,0 +1,411 @@
+#!/usr/bin/env python3
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+import sys
+import glob
+import os
+from os.path import join
+from utils import config_writer
+from functools import reduce
+
+# Contains configuration setting for training
+DATA_FORMAT = 'csv'
+
+
def binomial_m_svm_train(save_folder_name, datagen_dir, train_dir):
    """
    Write per-intercept training configuration files for binomial m-svm.

    save_folder_name: String
    Base name for this algorithm / datagen combination

    datagen_dir: String
    Directory holding the generated X.data / Y.data

    train_dir: String
    Directory that receives the config json files and training output

    return: List
    Output folder paths, one per intercept value
    """

    data_folders = []
    for i in [0, 1]:
        icpt = str(i)
        reg = '0.01'
        tol = '0.0001'
        # Passed as a string for consistency with the sibling train
        # functions (it was a bare int here, making the json non-uniform).
        maxiter = '20'
        X = join(datagen_dir, 'X.data')
        Y = join(datagen_dir, 'Y.data')

        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
        data_folders.append(full_path_train)

        model = join(full_path_train, 'model.data')
        Log = join(full_path_train, 'Log.data')

        config = dict(X=X, Y=Y, icpt=icpt, classes=2, reg=reg, tol=tol, maxiter=maxiter, model=model,
                      Log=Log, fmt=DATA_FORMAT)
        config_writer(full_path_train + '.json', config)

    return data_folders
+
+
def binomial_l2_svm_train(save_folder_name, datagen_dir, train_dir):
    """
    Write per-intercept training configuration files for binomial l2-svm.

    save_folder_name: String
    Base name for this algorithm / datagen combination

    datagen_dir: String
    Directory holding the generated X.data / Y.data

    train_dir: String
    Directory that receives the config json files and training output

    return: List
    Output folder paths, one per intercept value
    """

    # X / Y come from the datagen folder and are the same for both runs
    x_path = join(datagen_dir, 'X.data')
    y_path = join(datagen_dir, 'Y.data')

    out_folders = []
    for intercept in (0, 1):
        target = join(train_dir, save_folder_name + '.' + str(intercept))
        out_folders.append(target)

        config = dict(X=x_path, Y=y_path, icpt=str(intercept), reg='0.01',
                      tol='0.0001', maxiter='100',
                      model=join(target, 'model.data'),
                      Log=join(target, 'Log.data'), fmt=DATA_FORMAT)
        config_writer(target + '.json', config)

    return out_folders
+
+
def binomial_multilogreg_train(save_folder_name, datagen_dir, train_dir):
    """
    Write per-intercept training configuration files for binomial multilogreg.

    save_folder_name: String
    Base name for this algorithm / datagen combination

    datagen_dir: String
    Directory holding the generated X.data / Y.data

    train_dir: String
    Directory that receives the config json files and training output

    return: List
    Output folder paths, one per intercept value (0, 1, 2)
    """

    x_path = join(datagen_dir, 'X.data')
    y_path = join(datagen_dir, 'Y.data')

    out_folders = []
    for intercept in (0, 1, 2):
        target = join(train_dir, save_folder_name + '.' + str(intercept))
        out_folders.append(target)

        config = dict(X=x_path, Y=y_path, icpt=str(intercept), reg='0.01',
                      tol='0.0001', moi='100', mii='5',
                      B=join(target, 'B.data'))
        config_writer(target + '.json', config)

    return out_folders
+
+
def multinomial_m_svm_train(save_folder_name, datagen_dir, train_dir):
    """
    Write per-intercept training configuration files for multinomial m-svm.

    save_folder_name: String
    Base name for this algorithm / datagen combination

    datagen_dir: String
    Directory holding the generated X.data / Y.data

    train_dir: String
    Directory that receives the config json files and training output

    return: List
    Output folder paths, one per intercept value
    """

    x_path = join(datagen_dir, 'X.data')
    y_path = join(datagen_dir, 'Y.data')

    out_folders = []
    for intercept in (0, 1):
        target = join(train_dir, save_folder_name + '.' + str(intercept))

        # 150-class problem, matching the multinomial datagen settings
        config = dict(X=x_path, Y=y_path, icpt=str(intercept), classes=150,
                      reg='0.01', tol='0.0001', maxiter='20',
                      model=join(target, 'model.data'),
                      Log=join(target, 'Log.data'), fmt=DATA_FORMAT)
        config_writer(target + '.json', config)
        out_folders.append(target)

    return out_folders
+
+
def clustering_kmeans_train(save_folder_name, datagen_dir, train_dir):
    """
    Write the training configuration file for k-means clustering.

    save_folder_name: String
    Base name for this algorithm / datagen combination

    datagen_dir: String
    Directory holding the generated X.data

    train_dir: String
    Directory that receives the config json file and training output

    return: List
    Single-element list with the training output folder path
    """

    target = join(train_dir, save_folder_name)
    config = dict(X=join(datagen_dir, 'X.data'), k='50', maxi='50',
                  tol='0.0001', C=join(target, 'C.data'))
    config_writer(target + '.json', config)
    return [target]
+
+
def stats1_univar_stats_train(save_folder_name, datagen_dir, train_dir):
    """
    Write the configuration file for univariate statistics.

    save_folder_name: String
    Base name for this algorithm / datagen combination

    datagen_dir: String
    Directory holding the generated X.data and types file

    train_dir: String
    Directory that receives the config json file and output

    return: List
    Single-element list with the output folder path
    """

    target = join(train_dir, save_folder_name)
    config = dict(X=join(datagen_dir, 'X.data'),
                  TYPES=join(datagen_dir, 'types'),
                  STATS=join(target, 'STATS.data'))
    config_writer(target + '.json', config)
    return [target]
+
+
def stats1_bivar_stats_train(save_folder_name, datagen_dir, train_dir):
    """
    Write the configuration file for bivariate statistics.

    save_folder_name: String
    Base name for this algorithm / datagen combination

    datagen_dir: String
    Directory holding X.data plus the two index and types files

    train_dir: String
    Directory that receives the config json file and output

    return: List
    Single-element list with the output folder path
    """

    target = join(train_dir, save_folder_name)
    # OUTDIR is the folder itself; the DML script writes several files into it
    config = dict(X=join(datagen_dir, 'X.data'),
                  index1=join(datagen_dir, 'set1.indices'),
                  index2=join(datagen_dir, 'set2.indices'),
                  types1=join(datagen_dir, 'set1.types'),
                  types2=join(datagen_dir, 'set2.types'),
                  OUTDIR=target)
    config_writer(target + '.json', config)
    return [target]
+
+
def stats2_stratstats_train(save_folder_name, datagen_dir, train_dir):
    """
    Write the configuration file for stratified statistics.

    save_folder_name: String
    Base name for this algorithm / datagen combination

    datagen_dir: String
    Directory holding X.data and the Xcid / Ycid column-id files

    train_dir: String
    Directory that receives the config json file and output

    return: List
    Single-element list with the output folder path
    """

    target = join(train_dir, save_folder_name)
    config = dict(X=join(datagen_dir, 'X.data'),
                  Xcid=join(datagen_dir, 'Xcid.data'),
                  Ycid=join(datagen_dir, 'Ycid.data'),
                  O=join(target, 'O.data'),
                  fmt=DATA_FORMAT)
    config_writer(target + '.json', config)
    return [target]
+
+
def multinomial_naive_bayes_train(save_folder_name, datagen_dir, train_dir):
    """
    Write the training configuration file for multinomial naive bayes.

    save_folder_name: String
    Base name for this algorithm / datagen combination

    datagen_dir: String
    Directory holding the generated X.data / Y.data

    train_dir: String
    Directory that receives the config json file and training output

    return: List
    Single-element list with the training output folder path
    """

    target = join(train_dir, save_folder_name)

    # 150-class problem, matching the multinomial datagen settings
    config = dict(X=join(datagen_dir, 'X.data'),
                  Y=join(datagen_dir, 'Y.data'),
                  classes='150',
                  prior=join(target, 'prior'),
                  conditionals=join(target, 'conditionals'),
                  accuracy=join(target, 'accuracy'),
                  fmt=DATA_FORMAT,
                  probabilities=join(target, 'probabilities'))
    config_writer(target + '.json', config)
    return [target]
+
+
def multinomial_multilogreg_train(save_folder_name, datagen_dir, train_dir):
    """
    Write per-intercept training configuration files for multinomial multilogreg.

    save_folder_name: String
    Base name for this algorithm / datagen combination

    datagen_dir: String
    Directory holding the generated X.data / Y.data

    train_dir: String
    Directory that receives the config json files and training output

    return: List
    Output folder paths, one per intercept value (0, 1, 2)
    """

    x_path = join(datagen_dir, 'X.data')
    y_path = join(datagen_dir, 'Y.data')

    out_folders = []
    for intercept in (0, 1, 2):
        target = join(train_dir, save_folder_name + '.' + str(intercept))
        out_folders.append(target)

        config = dict(X=x_path, Y=y_path, B=join(target, 'B.data'),
                      icpt=str(intercept), reg='0.01', tol='0.0001',
                      moi='100', mii='0', fmt=DATA_FORMAT)
        config_writer(target + '.json', config)

    return out_folders
+
+
def regression1_linearregds_train(save_folder_name, datagen_dir, train_dir):
    """
    Write per-intercept training configuration files for linear regression (direct solve).

    save_folder_name: String
    Base name for this algorithm / datagen combination

    datagen_dir: String
    Directory holding the generated X.data / Y.data

    train_dir: String
    Directory that receives the config json files and training output

    return: List
    Output folder paths, one per intercept value (0, 1, 2)
    """

    x_path = join(datagen_dir, 'X.data')
    y_path = join(datagen_dir, 'Y.data')

    out_folders = []
    for intercept in (0, 1, 2):
        target = join(train_dir, save_folder_name + '.' + str(intercept))
        out_folders.append(target)

        config = dict(X=x_path, Y=y_path, B=join(target, 'B.data'),
                      icpt=str(intercept), fmt=DATA_FORMAT, reg='0.01')
        config_writer(target + '.json', config)

    return out_folders
+
+
def regression1_linearregcg_train(save_folder_name, datagen_dir, train_dir):
    """
    Write per-intercept training configuration files for linear regression (conjugate gradient).

    save_folder_name: String
    Base name for this algorithm / datagen combination

    datagen_dir: String
    Directory holding the generated X.data / Y.data

    train_dir: String
    Directory that receives the config json files and training output

    return: List
    Output folder paths, one per intercept value (0, 1, 2)
    """

    x_path = join(datagen_dir, 'X.data')
    y_path = join(datagen_dir, 'Y.data')

    out_folders = []
    for intercept in (0, 1, 2):
        target = join(train_dir, save_folder_name + '.' + str(intercept))
        out_folders.append(target)

        config = dict(X=x_path, Y=y_path, B=join(target, 'B.data'),
                      icpt=str(intercept), fmt=DATA_FORMAT, maxi='20',
                      tol='0.0001', reg='0.01')
        config_writer(target + '.json', config)

    return out_folders
+
+
def regression2_glm_gamma_train(save_folder_name, datagen_dir, train_dir):
    """
    Write per-intercept training configuration files for GLM with gamma distribution.

    save_folder_name: String
    Base name for this algorithm / datagen combination

    datagen_dir: String
    Directory holding the generated X.data / Y.data

    train_dir: String
    Directory that receives the config json files and training output

    return: List
    Output folder paths, one per intercept value (0, 1, 2)
    """

    data_folders = []

    for i in [0, 1, 2]:
        X = join(datagen_dir, 'X.data')
        Y = join(datagen_dir, 'Y.data')

        # Bug fix: append the intercept suffix (as every other multi-icpt
        # train function does); previously all three iterations wrote to
        # the same path and overwrote each other's config json.
        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
        data_folders.append(full_path_train)

        B = join(full_path_train, 'B.data')
        icpt = str(i)
        fmt = DATA_FORMAT
        moi = '200'
        mii = '5'
        dfam = '1'   # power distribution family
        vpow = '2.0' # variance power 2.0 -> gamma
        link = '1'   # power link
        lpow = '0.0' # link power 0.0 -> log
        tol = '0.0001'
        reg = '0.01'
        # Bug fix: the variance-power key was misspelled 'vpov'; GLM.dml
        # expects 'vpow'.
        config = dict(X=X, Y=Y, B=B, icpt=icpt, fmt=fmt, moi=moi, mii=mii, dfam=dfam,
                      vpow=vpow, link=link, lpow=lpow, tol=tol, reg=reg)

        config_writer(full_path_train + '.json', config)

    return data_folders
+
+
def regression2_glm_binomial_train(save_folder_name, datagen_dir, train_dir):
    """
    Write per-intercept training configuration files for GLM with binomial distribution.

    save_folder_name: String
    Base name for this algorithm / datagen combination

    datagen_dir: String
    Directory holding the generated X.data / Y.data

    train_dir: String
    Directory that receives the config json files and training output

    return: List
    Output folder paths, one per intercept value (0, 1, 2)
    """

    data_folders = []

    for i in [0, 1, 2]:
        X = join(datagen_dir, 'X.data')
        Y = join(datagen_dir, 'Y.data')

        # Bug fix: append the intercept suffix (as every other multi-icpt
        # train function does); previously all three iterations wrote to
        # the same path and overwrote each other's config json.
        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
        data_folders.append(full_path_train)

        B = join(full_path_train, 'B.data')
        icpt = str(i)
        fmt = DATA_FORMAT
        moi = '200'
        mii = '5'
        dfam = '2'   # binomial distribution family
        link = '3'   # probit link
        yneg = '2'   # label value treated as the negative class
        tol = '0.0001'
        reg = '0.01'
        config = dict(X=X, Y=Y, B=B, icpt=icpt, fmt=fmt, moi=moi, mii=mii,
                      dfam=dfam, link=link, yneg=yneg, tol=tol, reg=reg)

        config_writer(full_path_train + '.json', config)

    return data_folders
+
+
def regression2_glm_poisson_train(save_folder_name, datagen_dir, train_dir):
    """
    Write per-intercept training configuration files for GLM with poisson distribution.

    save_folder_name: String
    Base name for this algorithm / datagen combination

    datagen_dir: String
    Directory holding the generated X.data / Y.data

    train_dir: String
    Directory that receives the config json files and training output

    return: List
    Output folder paths, one per intercept value (0, 1, 2)
    """

    data_folders = []

    for i in [0, 1, 2]:
        X = join(datagen_dir, 'X.data')
        Y = join(datagen_dir, 'Y.data')

        # Bug fix: append the intercept suffix (as every other multi-icpt
        # train function does); previously all three iterations wrote to
        # the same path and overwrote each other's config json.
        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
        data_folders.append(full_path_train)

        B = join(full_path_train, 'B.data')
        icpt = str(i)
        fmt = DATA_FORMAT
        moi = '200'
        mii = '5'
        dfam = '1'  # power distribution family
        vpow = '1'  # variance power 1 -> poisson (key was misspelled 'vpov')
        link = '1'  # power link
        lpow = '0'  # link power 0 -> log
        tol = '0.0001'
        reg = '0.01'
        config = dict(X=X, Y=Y, B=B, icpt=icpt, fmt=fmt, moi=moi, mii=mii,
                      dfam=dfam, vpow=vpow, link=link, lpow=lpow, tol=tol, reg=reg)
        config_writer(full_path_train + '.json', config)

    return data_folders
+
+
def config_packets_train(algo_payload, datagen_dir, train_dir):
    """
    This function has two responsibilities. Generate the configuration files for
    input training algorithms and return a dictionary that will be used for execution.

    algo_payload : List of tuples
    The first tuple index contains algorithm name and the second index contains
    family type.

    datagen_dir: String
    Path of the data generation directory

    train_dir: String
    Path of the training directory

    return: {string: list}
    This dictionary contains algorithms to be executed as keys and the path of configuration
    json files to be executed list of values.

    """

    # One bucket of config-path lists per requested algorithm
    config_bundle = {algo: [] for algo, _family in algo_payload}

    for current_algo, current_family in algo_payload:
        data_gen_path = join(datagen_dir, current_family)
        data_gen_subdir = glob.glob(data_gen_path + "*")

        # Keep only real datagen output folders (glob may also match files)
        data_gen_folders = [path for path in data_gen_subdir if os.path.isdir(path)]
        if len(data_gen_folders) == 0:
            print('datagen folders not present for {}'.format(current_family))
            sys.exit()

        for current_folder in data_gen_folders:
            file_path_last = current_folder.split('/')[-1]
            save_name = '.'.join([current_algo] + [file_path_last])
            # Dispatch to the module-level '<family>_<algo>_train' helper
            algo_func = '_'.join([current_family] + [current_algo.lower().replace('-', '_')]
                                 + ['train'])
            conf_path = globals()[algo_func](save_name, current_folder, train_dir)
            config_bundle[current_algo].append(conf_path)

    config_packets = {}

    # Flatten each algorithm's list-of-lists into a single list of config paths.
    # The [] initializer keeps reduce from raising on an (unexpected) empty list.
    for algo_name, path_lists in config_bundle.items():
        config_packets[algo_name] = reduce(lambda acc, paths: acc + paths, path_lists, [])

    return config_packets
http://git-wip-us.apache.org/repos/asf/systemml/blob/e7cfcadc/scripts/perftest/python/utils.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/utils.py b/scripts/perftest/python/utils.py
new file mode 100755
index 0000000..7ff3b54
--- /dev/null
+++ b/scripts/perftest/python/utils.py
@@ -0,0 +1,296 @@
+#!/usr/bin/env python3
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+from os.path import join
+import os
+import json
+import subprocess
+import shlex
+import re
+import logging
+
+# This file contains all the utility functions required for performance test module
+
+
def get_families(current_algo, ML_ALGO):
    """
    Given current algorithm we get its families.

    current_algo : String
    Input algorithm specified

    ML_ALGO : Dictionary
    key, value dictionary with family as key and algorithms as list of values

    return: List
    List of families containing the algorithm, in dictionary order
    """

    return [family for family, algos in ML_ALGO.items() if current_algo in algos]
+
+
def split_rowcol(matrix_dim):
    """
    Split the input matrix dimensions into row and columns

    matrix_dim: String
    Concatenated '<rows>_<cols>' string, where 'k' and 'M' suffixes stand
    for thousand and million respectively

    return: Tuple
    (row, col) with the suffixes expanded to digits
    """

    # 'M' -> six zeros, 'k' -> three zeros; expand before splitting
    expanded = matrix_dim.replace('M', '0' * 6).replace('k', '0' * 3)
    row, col = expanded.split('_')
    return row, col
+
+
def config_writer(write_path, config_obj):
    """
    Writes the dictionary as an configuration json file to the give path

    write_path: String
    Absolute path of file name to be written

    config_obj: List or Dictionary
    Can be a dictionary or a list based on the object passed
    """

    with open(write_path, 'w') as out_file:
        json.dump(config_obj, out_file, indent=4)
+
+
def config_reader(read_path):
    """
    Read json file given path

    read_path: String
    Path of the json file to load

    return: List or Dictionary
    Reading the json file can give us a list if we have positional args or
    key value for a dictionary
    """

    with open(read_path, 'r') as in_file:
        return json.load(in_file)
+
+
def create_dir(directory):
    """
    Create directory given path if the directory does not exist already

    directory: String
    Input folder path
    """

    # No-op when the path already exists (guard instead of nesting)
    if os.path.exists(directory):
        return
    os.makedirs(directory)
+
+
def get_existence(path, action_mode):
    """
    Check SUCCESS file is present in the input path

    path: String
    Input folder path

    action_mode : String
    Type of action data-gen, train ...

    return: Boolean check if the file _SUCCESS exists
    """

    # Only data-gen reliably produces an output folder; other modes
    # (e.g. some predict algorithms) may not, so checking for _SUCCESS
    # there would wrongly skip work -- report False to continue.
    if action_mode != 'data-gen':
        return False
    return os.path.isfile(join(path, '_SUCCESS'))
+
+
def exec_dml_and_parse_time(exec_type, file_name, args, Time=True):
    """
    This function is responsible of execution of input arguments via python sub process,
    We also extract time obtained from the output of this subprocess

    exec_type: String
    Contains the execution type singlenode / hybrid_spark

    file_name: String
    DML file name to be used while processing the arguments give

    args: Dictionary
    Key values pairs depending on the arg type

    Time: Boolean (default=True)
    Boolean argument used to extract time from raw output logs.

    return: String
    Time taken in seconds, 'failure' on error, or 'not_specified' when
    Time is False
    """

    algorithm = file_name + '.dml'

    # Bug fix: join the "key value" tokens with a space; ''.join ran
    # consecutive pairs together (e.g. "-nvar 5-stats out") whenever the
    # args dictionary held more than one entry.
    args_str = ' '.join(['{} {}'.format(k, v) for k, v in args.items()])

    if exec_type == 'singlenode':
        exec_script = join(os.environ.get('SYSTEMML_HOME'), 'bin', 'systemml-standalone.py')
        cmd_string = ' '.join([exec_script, algorithm, args_str])
    elif exec_type == 'hybrid_spark':
        exec_script = join(os.environ.get('SYSTEMML_HOME'), 'bin', 'systemml-spark-submit.py')
        cmd_string = ' '.join([exec_script, '-f', algorithm, args_str])
    else:
        # Previously an unknown exec_type died with UnboundLocalError
        raise ValueError('unknown exec_type: {}'.format(exec_type))

    # Debug
    # print(cmd_string)

    # Subprocess to execute input arguments
    # proc1_log contains the shell output which is used for time parsing
    proc1 = subprocess.Popen(shlex.split(cmd_string), stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)

    if not Time:
        # Bug fix: still wait for the child; the original returned without
        # reaping it, which can deadlock on a full PIPE and leaves a zombie.
        proc1.communicate()
        return 'not_specified'

    proc1_log = []
    while proc1.poll() is None:
        raw_std_out = proc1.stdout.readline()
        decode_raw = raw_std_out.decode('ascii').strip()
        proc1_log.append(decode_raw)
        logging.log(10, decode_raw)

    _, err1 = proc1.communicate()

    if "Error" in str(err1):
        print('Error Found in {}'.format(file_name))
        return 'failure'
    return parse_time(proc1_log)
+
+
def parse_time(raw_logs):
    """
    Parses raw input list and extracts time

    raw_logs : List
    Each line obtained from the standard output is in the list

    return: String
    Extracted time in seconds or time_not_found
    """

    for entry in raw_logs:
        if not entry.startswith('Total execution time'):
            continue
        # The digit groups of e.g. '12.345 sec' come back as ['12', '345'];
        # re-join them with '.' to rebuild the decimal value.
        digit_groups = re.findall(r'\d+', entry)
        return '.'.join(digit_groups)

    return 'time_not_found'
+
+
def exec_test_data(exec_type, path):
    """
    Creates the test data split from the given input path

    exec_type : String
    Contains the execution type singlenode / hybrid_spark

    path : String
    Location of the input folder to pick X and Y
    """

    split_script = join(os.environ.get('SYSTEMML_HOME'),
                        'scripts', 'perftest', 'extractTestData')

    # Positional args for the DML script: inputs, outputs, format
    x_in = join(path, 'X.data')
    y_in = join(path, 'Y.data')
    x_out = join(path, 'X_test.data')
    y_out = join(path, 'Y_test.data')
    dml_args = {'-args': ' '.join([x_in, y_in, x_out, y_out, 'csv'])}

    # Time=False: we only need the split performed, not its runtime
    exec_dml_and_parse_time(exec_type, split_script, dml_args, False)
+
+
def check_predict(current_algo, ML_PREDICT):
    """
    To check if the current algorithm requires to run the predict

    current_algo: String
    Algorithm being processed

    ML_PREDICT: Dictionary
    Key value pairs of algorithm and predict file to process

    return: Boolean
    True when a predict script is registered for the algorithm
    """

    # Idiomatic membership test replaces the if/else True/False ladder
    return current_algo in ML_PREDICT
+
+
def get_folder_metrics(folder_name, action_mode):
    """
    Gets metrics from folder name

    folder_name: String
    Dot-delimited folder name from which we want to grab details

    action_mode: String
    One of 'data-gen', 'train', 'predict'

    return: List(3)
    A list with mat_type, mat_shape, intercept ('none' for any part that
    cannot be determined)
    """

    split_name = folder_name.split('.')

    # Bug fix: pre-initialise so an unrecognised action_mode (or a short
    # folder name in the data-gen branch) returns 'none' values instead of
    # raising UnboundLocalError / IndexError.
    mat_type = mat_shape = intercept = 'none'

    try:
        if action_mode == 'data-gen':
            mat_type = split_name[1]
            mat_shape = split_name[2]
            # data-gen folder names carry no intercept component
            intercept = 'none'
        elif action_mode in ('train', 'predict'):
            mat_type = split_name[3]
            mat_shape = split_name[2]
            intercept = split_name[4]
    except IndexError:
        # Some folder names omit the intercept suffix
        intercept = 'none'

    return mat_type, mat_shape, intercept
\ No newline at end of file