You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by na...@apache.org on 2017/07/13 21:29:20 UTC
systemml git commit: [MINOR] Performance test bug fixes
Repository: systemml
Updated Branches:
refs/heads/master f046051d4 -> cd1ae5b42
[MINOR] Performance test bug fixes
Closes #565
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/cd1ae5b4
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/cd1ae5b4
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/cd1ae5b4
Branch: refs/heads/master
Commit: cd1ae5b42499b3b97731de8b28a6d1db9cc9e7f3
Parents: f046051
Author: krishnakalyan3 <kr...@gmail.com>
Authored: Thu Jul 13 14:28:56 2017 -0700
Committer: Nakul Jindal <na...@gmail.com>
Committed: Thu Jul 13 14:28:56 2017 -0700
----------------------------------------------------------------------
scripts/perftest/python/datagen.py | 27 ++++---
scripts/perftest/python/predict.py | 48 ++++++------
scripts/perftest/python/run_perftest.py | 53 ++++++++-----
scripts/perftest/python/train.py | 40 +++++-----
scripts/perftest/python/utils.py | 112 +++++++++++++++++++++++----
5 files changed, 192 insertions(+), 88 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/cd1ae5b4/scripts/perftest/python/datagen.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/datagen.py b/scripts/perftest/python/datagen.py
index d9c49e9..88a71f0 100755
--- a/scripts/perftest/python/datagen.py
+++ b/scripts/perftest/python/datagen.py
@@ -22,7 +22,7 @@
import itertools
from os.path import join
-from utils import split_rowcol, config_writer
+from utils import split_rowcol, config_writer, mat_type_check
# This file contains configuration settings for data generation
DATA_FORMAT = 'csv'
@@ -181,8 +181,8 @@ def stats1_datagen(matrix_dim, matrix_type, datagen_dir):
NC = int(int(col)/2)
config = dict(R=row, C=col, NC=NC, MAXDOMAIN=MAXDOMAIN, DATA=DATA, TYPES=TYPES, SETSIZE=SETSIZE,
- LABELSETSIZE=LABELSETSIZE, TYPES1=TYPES1, TYPES2=TYPES2, INDEX1=INDEX1, INDEX2=INDEX2,
- fmt=DATA_FORMAT)
+ LABELSETSIZE=LABELSETSIZE, TYPES1=TYPES1, TYPES2=TYPES2, INDEX1=INDEX1,
+ INDEX2=INDEX2, fmt=DATA_FORMAT)
config_writer(full_path + '.json', config)
@@ -207,7 +207,7 @@ def stats2_datagen(matrix_dim, matrix_type, datagen_dir):
return full_path
-def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir):
+def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir, dense_algos):
"""
This function has two responsibilities. Generate the configuration files for
datagen algorithms and return a dictionary that will be used for execution.
@@ -217,11 +217,17 @@ def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir)
family type.
matrix_type: String
- Type of matrix to generate e.g dense or sparse
+ Type of matrix to generate e.g dense, sparse, all
matrix_shape: String
Shape of matrix to generate e.g 100k_10
+ datagen_dir: String
+ Path of the data generation directory
+
+ dense_algos: List
+ Families that support only dense matrix type
+
return: Dictionary {string: list}
This dictionary contains algorithms to be executed as keys and the path of configuration
json files to be executed list of values.
@@ -233,13 +239,10 @@ def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir)
# Cross Product of all configurations
for current_family in distinct_families:
- if current_family in FAMILY_NO_MATRIX_TYPE:
- config = list(itertools.product(matrix_shape, ['dense']))
- config_bundle[current_family] = config
- else:
- config = list(itertools.product(matrix_shape, matrix_type))
- # clustering : [[10k_1, dense], [10k_2, dense], ...]
- config_bundle[current_family] = config
+ current_matrix_type = mat_type_check(current_family, matrix_type, dense_algos)
+ config = list(itertools.product(matrix_shape, current_matrix_type))
+ # clustering : [[10k_1, dense], [10k_2, dense], ...]
+ config_bundle[current_family] = config
config_packets = {}
for current_family, configs in config_bundle.items():
http://git-wip-us.apache.org/repos/asf/systemml/blob/cd1ae5b4/scripts/perftest/python/predict.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/predict.py b/scripts/perftest/python/predict.py
index bc034da..92d3af4 100755
--- a/scripts/perftest/python/predict.py
+++ b/scripts/perftest/python/predict.py
@@ -21,10 +21,8 @@
#-------------------------------------------------------------
import sys
-import os
from os.path import join
-import glob
-from utils import create_dir, config_writer
+from utils import config_writer, relevant_folders, mat_type_check
# Contains configuration setting for predicting
DATA_FORMAT = 'csv'
@@ -221,7 +219,7 @@ def glm_gamma_predict(save_file_name, datagen_dir, train_dir, predict_dir):
return full_path_predict
-def config_packets_predict(algo_payload, datagen_dir, train_dir, predict_dir):
+def config_packets_predict(algo_payload, matrix_type, matrix_shape, datagen_dir, train_dir, predict_dir, dense_algos):
"""
This function has two responsibilities. Generate the configuration files for
prediction algorithms and return a dictionary that will be used for execution.
@@ -230,6 +228,12 @@ def config_packets_predict(algo_payload, datagen_dir, train_dir, predict_dir):
The first tuple index contains algorithm name and the second index contains
family type.
+ matrix_type: List
+ Types of matrix to generate e.g dense, sparse, all
+
+ matrix_shape: List
+ Shapes of matrix to generate e.g 100k_10
+
datagen_dir: String
Path of the data generation directory
@@ -239,45 +243,39 @@ def config_packets_predict(algo_payload, datagen_dir, train_dir, predict_dir):
predict_dir: String
Path of the prediction directory
+ dense_algos: List
+ Families that support only dense matrix type
+
return: Dictionary {string: list}
This dictionary contains algorithms to be executed as keys and the path of configuration
json files to be executed list of values.
"""
-
- algo_payload_distinct = set(map(lambda x: x[0], algo_payload))
-
config_bundle = {}
- for k, v in algo_payload:
+ for k, _ in algo_payload:
config_bundle[k] = []
- for current_algo in algo_payload_distinct:
- # Get all train folders related to the algorithm
- train_path = join(train_dir, current_algo)
- train_subdir = glob.glob(train_path + "*")
- train_folders = list(filter(lambda x: os.path.isdir(x), train_subdir))
+ for current_algo, current_family in algo_payload:
+ current_matrix_type = mat_type_check(current_family, matrix_type, dense_algos)
+ train_folders = relevant_folders(train_dir, current_algo, current_family,
+ current_matrix_type, matrix_shape, 'train')
if len(train_folders) == 0:
print('training folders not present for {}'.format(current_algo))
sys.exit()
for current_train_folder in train_folders:
- save_name = current_train_folder.split('/')[-1]
- # Get all datagen folders
- data_gen_folder_name = '.'.join(save_name.split('.')[1:-1])
- data_gen_path = join(datagen_dir, data_gen_folder_name)
- data_gen_subdir = glob.glob(data_gen_path + "*")
- data_gen_folder = list(filter(lambda x: os.path.isdir(x), data_gen_subdir))
-
- if len(data_gen_folder) == 0:
+ current_data_gen_dir = relevant_folders(datagen_dir, current_algo, current_family,
+ current_matrix_type, matrix_shape, 'data-gen')
+ if len(current_data_gen_dir) == 0:
print('data-gen folders not present for {}'.format(current_family))
sys.exit()
- # Ideally we will have more than one datagen directory to be found
- current_data_gen_dir = list(data_gen_folder)[0]
-
+ save_name = current_train_folder.split('/')[-1]
algo_func = '_'.join([current_algo.lower().replace('-', '_')] + ['predict'])
- conf_path = globals()[algo_func](save_name, current_data_gen_dir,
+
+ # current_data_gen_dir has index 0 as we would expect one datagen for each algorithm
+ conf_path = globals()[algo_func](save_name, current_data_gen_dir[0],
current_train_folder, predict_dir)
config_bundle[current_algo].append(conf_path)
http://git-wip-us.apache.org/repos/asf/systemml/blob/cd1ae5b4/scripts/perftest/python/run_perftest.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/run_perftest.py b/scripts/perftest/python/run_perftest.py
index b0257d4..3360285 100755
--- a/scripts/perftest/python/run_perftest.py
+++ b/scripts/perftest/python/run_perftest.py
@@ -26,13 +26,14 @@ import argparse
from functools import reduce
import os
from os.path import join
-from utils import get_families, config_reader, create_dir, get_existence, \
- exec_dml_and_parse_time, exec_test_data, check_predict, get_folder_metrics
import logging
from datetime import datetime
from datagen import config_packets_datagen
from train import config_packets_train
from predict import config_packets_predict
+from utils import get_families, config_reader, create_dir, get_existence, \
+ exec_dml_and_parse_time, exec_test_data, check_predict, get_folder_metrics
+
# A packet is a dictionary
# with key as the algorithm
@@ -80,6 +81,8 @@ ML_PREDICT = {'Kmeans': 'Kmeans-predict',
'GLM_gamma': 'GLM-predict',
'GLM_binomial': 'GLM-predict'}
+DENSE_TYPE_ALGOS = ['clustering', 'stats1', 'stats2']
+
# Responsible for execution and metric logging
def algorithm_workflow(algo, exec_type, config_path, dml_file_name, action_mode):
@@ -125,7 +128,7 @@ def algorithm_workflow(algo, exec_type, config_path, dml_file_name, action_mode)
print('data already exists {}'.format(config_path))
time = 'data_exists'
else:
- time = exec_dml_and_parse_time(exec_type, dml_file_name, config_file_name, args)
+ time = exec_dml_and_parse_time(exec_type, dml_file_name, config_file_name, args)
# Write a _SUCCESS file only if time is found and in data-gen action_mode
if len(time.split('.')) == 2 and action_mode == 'data-gen':
@@ -152,7 +155,7 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode
Contains the execution type singlenode / hybrid_spark
mat_type: List
- Type of matrix to generate dense or sparse
+ Type of matrix to generate dense, sparse, all
mat_shape: List
Dimensions of the input matrix with rows and columns
@@ -201,12 +204,12 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode
if 'data-gen' in mode:
data_gen_dir = join(temp_dir, 'data-gen')
create_dir(data_gen_dir)
- conf_packet = config_packets_datagen(algos_to_run, mat_type, mat_shape, data_gen_dir)
+ conf_packet = config_packets_datagen(algos_to_run, mat_type, mat_shape, data_gen_dir,
+ DENSE_TYPE_ALGOS)
for family_name, config_folders in conf_packet.items():
for config in config_folders:
file_name = ML_GENDATA[family_name]
algorithm_workflow(family_name, exec_type, config, file_name, 'data-gen')
-
# Statistic family do not require to be split
if family_name not in ['stats1', 'stats2']:
exec_test_data(exec_type, config)
@@ -215,7 +218,8 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode
data_gen_dir = join(temp_dir, 'data-gen')
train_dir = join(temp_dir, 'train')
create_dir(train_dir)
- conf_packet = config_packets_train(algos_to_run, data_gen_dir, train_dir)
+ conf_packet = config_packets_train(algos_to_run, mat_type, mat_shape, data_gen_dir,
+ train_dir, DENSE_TYPE_ALGOS)
for algo_name, config_files in conf_packet.items():
for config in config_files:
file_name = ML_TRAIN[algo_name]
@@ -227,9 +231,12 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode
predict_dir = join(temp_dir, 'predict')
create_dir(predict_dir)
algos_to_run_perdict = list(filter(lambda algo: check_predict(algo[0], ML_PREDICT), algos_to_run))
- if len(algos_to_run_perdict) < 0:
+ if len(algos_to_run_perdict) < 1:
+ # No algorithms with predict found
pass
- conf_packet = config_packets_predict(algos_to_run_perdict, data_gen_dir, train_dir, predict_dir)
+ conf_packet = config_packets_predict(algos_to_run_perdict, mat_type, mat_shape, data_gen_dir,
+ train_dir, predict_dir, DENSE_TYPE_ALGOS)
+
for algo_name, config_files in conf_packet.items():
for config in config_files:
file_name = ML_PREDICT[algo_name]
@@ -243,11 +250,12 @@ if __name__ == '__main__':
print('SYSTEMML_HOME not found')
sys.exit()
+ # Supported Arguments
+ mat_type = ['dense', 'sparse', 'all']
+ workload = ['data-gen', 'train', 'predict']
+ execution_mode = ['hybrid_spark', 'singlenode']
# Default Arguments
- default_mat_type = ['dense', 'sparse']
- default_workload = ['data-gen', 'train', 'predict']
default_mat_shape = ['10k_100']
- default_execution_mode = ['hybrid_spark', 'singlenode']
# Default temp directory, contains everything generated in perftest
default_temp_dir = join(systemml_home, 'scripts', 'perftest', 'temp')
@@ -274,21 +282,21 @@ if __name__ == '__main__':
'(Overrides --family, available : ' + ', '.join(sorted(all_algos)) + ')', metavar='',
choices=all_algos, nargs='+')
- cparser.add_argument('--exec-type', default='singlenode', help='System-ML backend '
- '(available : singlenode, spark-hybrid)', metavar='',
- choices=default_execution_mode)
- cparser.add_argument('--mat-type', default=default_mat_type, help='space separated list of types of matrix to generate '
- '(available : dense, sparse)', metavar='', choices=default_mat_type,
+ cparser.add_argument('--exec-type', default='hybrid_spark', help='System-ML backend '
+ 'available : ' + ','.join(execution_mode), metavar='',
+ choices=execution_mode)
+ cparser.add_argument('--mat-type', default=['all'], help='space separated list of types of matrix to generate '
+ 'available : ' + ','.join(mat_type), metavar='', choices=mat_type,
nargs='+')
cparser.add_argument('--mat-shape', default=default_mat_shape, help='space separated list of shapes of matrices '
'to generate (e.g 10k_1k, 20M_4k)', metavar='', nargs='+')
cparser.add_argument('--temp-dir', default=default_temp_dir, help='temporary directory '
- 'where generated, training and prediction data is put', metavar='')
+ 'where generated, training and prediction data is put', metavar='')
cparser.add_argument('--filename', default='perf_test', help='name of the output file for the perf'
' metrics', metavar='')
- cparser.add_argument('--mode', default=default_workload,
+ cparser.add_argument('--mode', default=workload,
help='space separated list of types of workloads to run (available: data-gen, train, predict)',
- metavar='', choices=default_workload, nargs='+')
+ metavar='', choices=workload, nargs='+')
# Args is a namespace
args = cparser.parse_args()
@@ -297,6 +305,11 @@ if __name__ == '__main__':
# Debug arguments
# print(arg_dict)
+ # Validate the length of the --mat-type argument
+ if len(args.mat_type) > 2:
+ print('length of --mat-type argument cannot be greater than two')
+ sys.exit()
+
# Check for validity of input arguments
if args.family is not None:
for fam in args.family:
http://git-wip-us.apache.org/repos/asf/systemml/blob/cd1ae5b4/scripts/perftest/python/train.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/train.py b/scripts/perftest/python/train.py
index 1ab2880..627ba03 100755
--- a/scripts/perftest/python/train.py
+++ b/scripts/perftest/python/train.py
@@ -21,10 +21,8 @@
#-------------------------------------------------------------
import sys
-import glob
-import os
from os.path import join
-from utils import config_writer
+from utils import config_writer, relevant_folders, mat_type_check
from functools import reduce
# Contains configuration setting for training
@@ -48,8 +46,8 @@ def binomial_m_svm_train(save_folder_name, datagen_dir, train_dir):
model = join(full_path_train, 'model.data')
Log = join(full_path_train, 'Log.data')
- config = dict(X=X, Y=Y, icpt=icpt, classes=2, reg=reg, tol=tol, maxiter=maxiter, model=model,
- Log=Log, fmt=DATA_FORMAT)
+ config = dict(X=X, Y=Y, icpt=icpt, classes=2, reg=reg, tol=tol, maxiter=maxiter,
+ model=model, Log=Log, fmt=DATA_FORMAT)
config_writer(full_path_train + '.json', config)
return data_folders
@@ -117,8 +115,8 @@ def multinomial_m_svm_train(save_folder_name, datagen_dir, train_dir):
model = join(full_path_train, 'model.data')
Log = join(full_path_train, 'Log.data')
- config = dict(X=X, Y=Y, icpt=icpt, classes=150, reg=reg, tol=tol, maxiter=maxiter, model=model,
- Log=Log, fmt=DATA_FORMAT)
+ config = dict(X=X, Y=Y, icpt=icpt, classes=150, reg=reg, tol=tol, maxiter=maxiter,
+ model=model, Log=Log, fmt=DATA_FORMAT)
config_writer(full_path_train + '.json', config)
data_folders.append(full_path_train)
@@ -358,7 +356,7 @@ def regression2_glm_poisson_train(save_folder_name, datagen_dir, train_dir):
return data_folders
-def config_packets_train(algo_payload, datagen_dir, train_dir):
+def config_packets_train(algo_payload, matrix_type, matrix_shape, datagen_dir, train_dir, dense_algos):
"""
This function has two responsibilities. Generate the configuration files for
input training algorithms and return a dictionary that will be used for execution.
@@ -367,39 +365,45 @@ def config_packets_train(algo_payload, datagen_dir, train_dir):
The first tuple index contains algorithm name and the second index contains
family type.
+ matrix_type: List
+ Types of matrix to generate e.g dense, sparse, all
+
+ matrix_shape: List
+ Shapes of matrix to generate e.g 100k_10
+
datagen_dir: String
Path of the data generation directory
train_dir: String
Path of the training directory
+ dense_algos: List
+ Families that support only dense matrix type
+
return: {string: list}
This dictionary contains algorithms to be executed as keys and the path of configuration
json files to be executed list of values.
-
"""
config_bundle = {}
- for k, v in algo_payload:
+ for k, _ in algo_payload:
config_bundle[k] = []
for current_algo, current_family in algo_payload:
- data_gen_path = join(datagen_dir, current_family)
- data_gen_subdir = glob.glob(data_gen_path + "*")
-
- # Filter for specific data gen
- data_gen_folders = list(filter(lambda x: os.path.isdir(x), data_gen_subdir))
+ current_matrix_type = mat_type_check(current_family, matrix_type, dense_algos)
+ data_gen_folders = relevant_folders(datagen_dir, current_algo, current_family,
+ current_matrix_type, matrix_shape, 'data-gen')
if len(data_gen_folders) == 0:
print('datagen folders not present for {}'.format(current_family))
sys.exit()
- for current_folder in data_gen_folders:
- file_path_last = current_folder.split('/')[-1]
+ for current_datagen_dir in data_gen_folders:
+ file_path_last = current_datagen_dir.split('/')[-1]
save_name = '.'.join([current_algo] + [file_path_last])
algo_func = '_'.join([current_family] + [current_algo.lower().replace('-', '_')]
+ ['train'])
- conf_path = globals()[algo_func](save_name, current_folder, train_dir)
+ conf_path = globals()[algo_func](save_name, current_datagen_dir, train_dir)
config_bundle[current_algo].append(conf_path)
config_packets = {}
http://git-wip-us.apache.org/repos/asf/systemml/blob/cd1ae5b4/scripts/perftest/python/utils.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/utils.py b/scripts/perftest/python/utils.py
index 464d7f6..4bba34f 100755
--- a/scripts/perftest/python/utils.py
+++ b/scripts/perftest/python/utils.py
@@ -27,11 +27,14 @@ import subprocess
import shlex
import re
import logging
+import sys
+import glob
+from functools import reduce
# This file contains all the utility functions required for performance test module
-def get_families(current_algo, ML_ALGO):
+def get_families(current_algo, ml_algo):
"""
Given current algorithm we get its families.
@@ -46,7 +49,7 @@ def get_families(current_algo, ML_ALGO):
"""
family_list = []
- for family, algos in ML_ALGO.items():
+ for family, algos in ml_algo.items():
if current_algo in algos:
family_list.append(family)
return family_list
@@ -138,7 +141,7 @@ def get_existence(path, action_mode):
return exist
-def exec_dml_and_parse_time(exec_type, dml_file_name, execution_output_file, args, Time=True):
+def exec_dml_and_parse_time(exec_type, dml_file_name, execution_output_file, args, time=True):
"""
This function is responsible of execution of input arguments via python sub process,
We also extract time obtained from the output of this subprocess
@@ -181,7 +184,7 @@ def exec_dml_and_parse_time(exec_type, dml_file_name, execution_output_file, arg
proc1 = subprocess.Popen(shlex.split(cmd_string), stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
- if Time:
+ if time:
proc1_log = []
while proc1.poll() is None:
raw_std_out = proc1.stdout.readline()
@@ -189,7 +192,7 @@ def exec_dml_and_parse_time(exec_type, dml_file_name, execution_output_file, arg
proc1_log.append(decode_raw)
logging.log(10, decode_raw)
- out1, err1 = proc1.communicate()
+ _, err1 = proc1.communicate()
if "Error" in str(err1):
print('Error Found in {}'.format(dml_file_name))
@@ -197,9 +200,9 @@ def exec_dml_and_parse_time(exec_type, dml_file_name, execution_output_file, arg
else:
total_time = parse_time(proc1_log)
- with open(execution_output_file, 'w') as f:
+ with open(execution_output_file, 'w') as file:
for row in proc1_log:
- f.write("%s\n" % str(row))
+ file.write("%s\n" % str(row))
else:
total_time = 'not_specified'
@@ -253,20 +256,18 @@ def exec_test_data(exec_type, path):
exec_dml_and_parse_time(exec_type, test_split_script, config_file_name, args, False)
-def check_predict(current_algo, ML_PREDICT):
+def check_predict(current_algo, ml_predict):
"""
To check if the current algorithm requires to run the predict
current_algo: String
Algorithm being processed
- ML_PREDICT: Dictionary
+ ml_predict: Dictionary
Key value pairs of algorithm and predict file to process
"""
- if current_algo in ML_PREDICT.keys():
+ if current_algo in ml_predict.keys():
return True
- else:
- return False
def get_folder_metrics(folder_name, action_mode):
@@ -301,4 +302,89 @@ def get_folder_metrics(folder_name, action_mode):
except IndexError:
intercept = 'none'
- return mat_type, mat_shape, intercept
\ No newline at end of file
+ return mat_type, mat_shape, intercept
+
+
+def mat_type_check(current_family, matrix_types, dense_algos):
+ """
+ Some algorithm families support only certain matrix types. This function gives us the right
+ matrix types for a given family
+
+ current_family: String
+ Current family being processed in this function
+
+ matrix_types: List
+ Type of matrix to generate dense, sparse, all
+
+ dense_algos: List
+ Families that support only dense matrix type
+
+ return: List
+ Return the list of right matrix types supported by the family
+ """
+ current_type = []
+ for current_matrix_type in matrix_types:
+ if current_matrix_type == 'all':
+ if current_family in dense_algos:
+ current_type.append('dense')
+ else:
+ current_type.append('dense')
+ current_type.append('sparse')
+
+ if current_matrix_type == 'sparse':
+ if current_family in dense_algos:
+ sys.exit('{} does not support {} matrix type'.format(current_family,
+ current_matrix_type))
+ else:
+ current_type.append(current_matrix_type)
+
+ if current_matrix_type == 'dense':
+ current_type.append(current_matrix_type)
+
+ return current_type
+
+
+def relevant_folders(path, algo, family, matrix_type, matrix_shape, mode):
+ """
+ Finds the right folder to read the data based on given parameters
+
+ path: String
+ Location of data-gen and training folders
+
+ algo: String
+ Current algorithm being processed by this function
+
+ family: String
+ Current family being processed by this function
+
+ matrix_type: List
+ Type of matrix to generate dense, sparse, all
+
+ matrix_shape: List
+ Dimensions of the input matrix with rows and columns
+
+ mode: String
+ Based on mode and arguments we read the specific folders e.g data-gen folder or train folder
+
+ return: List
+ List of folder locations to read data from
+ """
+ folders = []
+ for current_matrix_type in matrix_type:
+ for current_matrix_shape in matrix_shape:
+ if mode == 'data-gen':
+ data_gen_path = join(path, family)
+ sub_folder_name = '.'.join([current_matrix_type, current_matrix_shape])
+ path_subdir = glob.glob(data_gen_path + '.' + sub_folder_name + "*")
+
+ if mode == 'train':
+ train_path = join(path, algo)
+ sub_folder_name = '.'.join([family, current_matrix_type, current_matrix_shape])
+ path_subdir = glob.glob(train_path + '.' + sub_folder_name + "*")
+
+ path_folders = list(filter(lambda x: os.path.isdir(x), path_subdir))
+ folders.append(path_folders)
+
+ folders_flat = reduce(lambda x, y: x + y, folders)
+
+ return folders_flat