You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by na...@apache.org on 2017/07/13 21:29:20 UTC

systemml git commit: [MINOR] Performance test bug fixes

Repository: systemml
Updated Branches:
  refs/heads/master f046051d4 -> cd1ae5b42


[MINOR] Performance test bug fixes

Closes #565


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/cd1ae5b4
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/cd1ae5b4
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/cd1ae5b4

Branch: refs/heads/master
Commit: cd1ae5b42499b3b97731de8b28a6d1db9cc9e7f3
Parents: f046051
Author: krishnakalyan3 <kr...@gmail.com>
Authored: Thu Jul 13 14:28:56 2017 -0700
Committer: Nakul Jindal <na...@gmail.com>
Committed: Thu Jul 13 14:28:56 2017 -0700

----------------------------------------------------------------------
 scripts/perftest/python/datagen.py      |  27 ++++---
 scripts/perftest/python/predict.py      |  48 ++++++------
 scripts/perftest/python/run_perftest.py |  53 ++++++++-----
 scripts/perftest/python/train.py        |  40 +++++-----
 scripts/perftest/python/utils.py        | 112 +++++++++++++++++++++++----
 5 files changed, 192 insertions(+), 88 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/cd1ae5b4/scripts/perftest/python/datagen.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/datagen.py b/scripts/perftest/python/datagen.py
index d9c49e9..88a71f0 100755
--- a/scripts/perftest/python/datagen.py
+++ b/scripts/perftest/python/datagen.py
@@ -22,7 +22,7 @@
 
 import itertools
 from os.path import join
-from utils import split_rowcol, config_writer
+from utils import split_rowcol, config_writer, mat_type_check
 
 # This file contains configuration settings for data generation
 DATA_FORMAT = 'csv'
@@ -181,8 +181,8 @@ def stats1_datagen(matrix_dim, matrix_type, datagen_dir):
     NC = int(int(col)/2)
 
     config = dict(R=row, C=col, NC=NC, MAXDOMAIN=MAXDOMAIN, DATA=DATA, TYPES=TYPES, SETSIZE=SETSIZE,
-                  LABELSETSIZE=LABELSETSIZE, TYPES1=TYPES1, TYPES2=TYPES2, INDEX1=INDEX1, INDEX2=INDEX2,
-                  fmt=DATA_FORMAT)
+                  LABELSETSIZE=LABELSETSIZE, TYPES1=TYPES1, TYPES2=TYPES2, INDEX1=INDEX1,
+                  INDEX2=INDEX2, fmt=DATA_FORMAT)
 
     config_writer(full_path + '.json', config)
 
@@ -207,7 +207,7 @@ def stats2_datagen(matrix_dim, matrix_type, datagen_dir):
     return full_path
 
 
-def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir):
+def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir, dense_algos):
     """
     This function has two responsibilities. Generate the configuration files for
     datagen algorithms and return a dictionary that will be used for execution.
@@ -217,11 +217,17 @@ def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir)
     family type.
 
     matrix_type: String
-    Type of matrix to generate e.g dense or sparse
+    Type of matrix to generate e.g dense, sparse, all
 
     matrix_shape: String
     Shape of matrix to generate e.g 100k_10
 
+    datagen_dir: String
+    Path of the data generation directory
+
+    dense_algos: List
+    Algorithms that support only dense matrix type
+
     return: Dictionary {string: list}
     This dictionary contains algorithms to be executed as keys and the path of configuration
     json files to be executed list of values.
@@ -233,13 +239,10 @@ def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir)
 
     # Cross Product of all configurations
     for current_family in distinct_families:
-        if current_family in FAMILY_NO_MATRIX_TYPE:
-            config = list(itertools.product(matrix_shape, ['dense']))
-            config_bundle[current_family] = config
-        else:
-            config = list(itertools.product(matrix_shape, matrix_type))
-            # clustering : [[10k_1, dense], [10k_2, dense], ...]
-            config_bundle[current_family] = config
+        current_matrix_type = mat_type_check(current_family, matrix_type, dense_algos)
+        config = list(itertools.product(matrix_shape, current_matrix_type))
+        # clustering : [[10k_1, dense], [10k_2, dense], ...]
+        config_bundle[current_family] = config
 
     config_packets = {}
     for current_family, configs in config_bundle.items():

http://git-wip-us.apache.org/repos/asf/systemml/blob/cd1ae5b4/scripts/perftest/python/predict.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/predict.py b/scripts/perftest/python/predict.py
index bc034da..92d3af4 100755
--- a/scripts/perftest/python/predict.py
+++ b/scripts/perftest/python/predict.py
@@ -21,10 +21,8 @@
 #-------------------------------------------------------------
 
 import sys
-import os
 from os.path import join
-import glob
-from utils import create_dir, config_writer
+from utils import config_writer, relevant_folders, mat_type_check
 
 # Contains configuration setting for predicting
 DATA_FORMAT = 'csv'
@@ -221,7 +219,7 @@ def glm_gamma_predict(save_file_name, datagen_dir, train_dir, predict_dir):
     return full_path_predict
 
 
-def config_packets_predict(algo_payload, datagen_dir, train_dir, predict_dir):
+def config_packets_predict(algo_payload, matrix_type, matrix_shape, datagen_dir, train_dir, predict_dir, dense_algos):
     """
     This function has two responsibilities. Generate the configuration files for
     prediction algorithms and return a dictionary that will be used for execution.
@@ -230,6 +228,12 @@ def config_packets_predict(algo_payload, datagen_dir, train_dir, predict_dir):
     The first tuple index contains algorithm name and the second index contains
     family type.
 
+    matrix_type: String
+    Type of matrix to generate e.g dense, sparse, all
+
+    matrix_shape: String
+    Shape of matrix to generate e.g 100k_10
+
     datagen_dir: String
     Path of the data generation directory
 
@@ -239,45 +243,39 @@ def config_packets_predict(algo_payload, datagen_dir, train_dir, predict_dir):
     predict_dir: String
     Path of the prediction directory
 
+    dense_algos: List
+    Algorithms that support only dense matrix type
+
     return: Dictionary  {string: list}
     This dictionary contains algorithms to be executed as keys and the path of configuration
     json files to be executed list of values.
     """
-
-    algo_payload_distinct = set(map(lambda x: x[0], algo_payload))
-
     config_bundle = {}
 
-    for k, v in algo_payload:
+    for k, _ in algo_payload:
         config_bundle[k] = []
 
-    for current_algo in algo_payload_distinct:
-        # Get all train folders related to the algorithm
-        train_path = join(train_dir, current_algo)
-        train_subdir = glob.glob(train_path + "*")
-        train_folders = list(filter(lambda x: os.path.isdir(x), train_subdir))
+    for current_algo, current_family in algo_payload:
+        current_matrix_type = mat_type_check(current_family, matrix_type, dense_algos)
+        train_folders = relevant_folders(train_dir, current_algo, current_family,
+                                         current_matrix_type, matrix_shape, 'train')
 
         if len(train_folders) == 0:
             print('training folders not present for {}'.format(current_algo))
             sys.exit()
 
         for current_train_folder in train_folders:
-            save_name = current_train_folder.split('/')[-1]
-            # Get all datagen folders
-            data_gen_folder_name = '.'.join(save_name.split('.')[1:-1])
-            data_gen_path = join(datagen_dir, data_gen_folder_name)
-            data_gen_subdir = glob.glob(data_gen_path + "*")
-            data_gen_folder = list(filter(lambda x: os.path.isdir(x), data_gen_subdir))
-
-            if len(data_gen_folder) == 0:
+            current_data_gen_dir = relevant_folders(datagen_dir, current_algo, current_family,
+                                                    current_matrix_type, matrix_shape, 'data-gen')
+            if len(current_data_gen_dir) == 0:
                 print('data-gen folders not present for {}'.format(current_family))
                 sys.exit()
 
-            # Ideally we will have more than one datagen directory to be found
-            current_data_gen_dir = list(data_gen_folder)[0]
-
+            save_name = current_train_folder.split('/')[-1]
             algo_func = '_'.join([current_algo.lower().replace('-', '_')] + ['predict'])
-            conf_path = globals()[algo_func](save_name, current_data_gen_dir,
+
+            # current_data_gen_dir has index 0 as we would expect one datagen for each algorithm
+            conf_path = globals()[algo_func](save_name, current_data_gen_dir[0],
                                              current_train_folder, predict_dir)
 
             config_bundle[current_algo].append(conf_path)

http://git-wip-us.apache.org/repos/asf/systemml/blob/cd1ae5b4/scripts/perftest/python/run_perftest.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/run_perftest.py b/scripts/perftest/python/run_perftest.py
index b0257d4..3360285 100755
--- a/scripts/perftest/python/run_perftest.py
+++ b/scripts/perftest/python/run_perftest.py
@@ -26,13 +26,14 @@ import argparse
 from functools import reduce
 import os
 from os.path import join
-from utils import get_families, config_reader, create_dir,  get_existence, \
-    exec_dml_and_parse_time, exec_test_data, check_predict, get_folder_metrics
 import logging
 from datetime import datetime
 from datagen import config_packets_datagen
 from train import config_packets_train
 from predict import config_packets_predict
+from utils import get_families, config_reader, create_dir, get_existence, \
+    exec_dml_and_parse_time, exec_test_data, check_predict, get_folder_metrics
+
 
 # A packet is a dictionary
 # with key as the algorithm
@@ -80,6 +81,8 @@ ML_PREDICT = {'Kmeans': 'Kmeans-predict',
               'GLM_gamma': 'GLM-predict',
               'GLM_binomial': 'GLM-predict'}
 
+DENSE_TYPE_ALGOS = ['clustering', 'stats1', 'stats2']
+
 
 # Responsible for execution and metric logging
 def algorithm_workflow(algo, exec_type, config_path, dml_file_name, action_mode):
@@ -125,7 +128,7 @@ def algorithm_workflow(algo, exec_type, config_path, dml_file_name, action_mode)
         print('data already exists {}'.format(config_path))
         time = 'data_exists'
     else:
-        time = exec_dml_and_parse_time(exec_type, dml_file_name, config_file_name,  args)
+        time = exec_dml_and_parse_time(exec_type, dml_file_name, config_file_name, args)
 
     # Write a _SUCCESS file only if time is found and in data-gen action_mode
     if len(time.split('.')) == 2 and action_mode == 'data-gen':
@@ -152,7 +155,7 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode
     Contains the execution type singlenode / hybrid_spark
 
     mat_type: List
-    Type of matrix to generate dense or sparse
+    Type of matrix to generate dense, sparse, all
 
     mat_shape: List
     Dimensions of the input matrix with rows and columns
@@ -201,12 +204,12 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode
     if 'data-gen' in mode:
         data_gen_dir = join(temp_dir, 'data-gen')
         create_dir(data_gen_dir)
-        conf_packet = config_packets_datagen(algos_to_run, mat_type, mat_shape, data_gen_dir)
+        conf_packet = config_packets_datagen(algos_to_run, mat_type, mat_shape, data_gen_dir,
+                                             DENSE_TYPE_ALGOS)
         for family_name, config_folders in conf_packet.items():
             for config in config_folders:
                 file_name = ML_GENDATA[family_name]
                 algorithm_workflow(family_name, exec_type, config, file_name, 'data-gen')
-
                 # Statistic family do not require to be split
                 if family_name not in ['stats1', 'stats2']:
                     exec_test_data(exec_type, config)
@@ -215,7 +218,8 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode
         data_gen_dir = join(temp_dir, 'data-gen')
         train_dir = join(temp_dir, 'train')
         create_dir(train_dir)
-        conf_packet = config_packets_train(algos_to_run, data_gen_dir, train_dir)
+        conf_packet = config_packets_train(algos_to_run, mat_type, mat_shape, data_gen_dir,
+                                           train_dir, DENSE_TYPE_ALGOS)
         for algo_name, config_files in conf_packet.items():
             for config in config_files:
                 file_name = ML_TRAIN[algo_name]
@@ -227,9 +231,12 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode
         predict_dir = join(temp_dir, 'predict')
         create_dir(predict_dir)
         algos_to_run_perdict = list(filter(lambda algo: check_predict(algo[0], ML_PREDICT), algos_to_run))
-        if len(algos_to_run_perdict) < 0:
+        if len(algos_to_run_perdict) < 1:
+            # No algorithms with predict found
             pass
-        conf_packet = config_packets_predict(algos_to_run_perdict, data_gen_dir, train_dir, predict_dir)
+        conf_packet = config_packets_predict(algos_to_run_perdict, mat_type, mat_shape, data_gen_dir,
+                                             train_dir, predict_dir, DENSE_TYPE_ALGOS)
+
         for algo_name, config_files in conf_packet.items():
                 for config in config_files:
                     file_name = ML_PREDICT[algo_name]
@@ -243,11 +250,12 @@ if __name__ == '__main__':
         print('SYSTEMML_HOME not found')
         sys.exit()
 
+    # Supported Arguments
+    mat_type = ['dense', 'sparse', 'all']
+    workload = ['data-gen', 'train', 'predict']
+    execution_mode = ['hybrid_spark', 'singlenode']
     # Default Arguments
-    default_mat_type = ['dense', 'sparse']
-    default_workload = ['data-gen', 'train', 'predict']
     default_mat_shape = ['10k_100']
-    default_execution_mode = ['hybrid_spark', 'singlenode']
 
     # Default temp directory, contains everything generated in perftest
     default_temp_dir = join(systemml_home, 'scripts', 'perftest', 'temp')
@@ -274,21 +282,21 @@ if __name__ == '__main__':
                          '(Overrides --family, available : ' + ', '.join(sorted(all_algos)) + ')', metavar='',
                          choices=all_algos, nargs='+')
 
-    cparser.add_argument('--exec-type', default='singlenode', help='System-ML backend '
-                         '(available : singlenode, spark-hybrid)', metavar='',
-                         choices=default_execution_mode)
-    cparser.add_argument('--mat-type', default=default_mat_type, help='space separated list of types of matrix to generate '
-                         '(available : dense, sparse)', metavar='', choices=default_mat_type,
+    cparser.add_argument('--exec-type', default='hybrid_spark', help='System-ML backend '
+                         'available : ' + ','.join(execution_mode), metavar='',
+                         choices=execution_mode)
+    cparser.add_argument('--mat-type', default=['all'], help='space separated list of types of matrix to generate '
+                         'available : ' + ','.join(mat_type), metavar='', choices=mat_type,
                          nargs='+')
     cparser.add_argument('--mat-shape', default=default_mat_shape, help='space separated list of shapes of matrices '
                          'to generate (e.g 10k_1k, 20M_4k)', metavar='', nargs='+')
     cparser.add_argument('--temp-dir', default=default_temp_dir, help='temporary directory '
-                        'where generated, training and prediction data is put', metavar='')
+                         'where generated, training and prediction data is put', metavar='')
     cparser.add_argument('--filename', default='perf_test', help='name of the output file for the perf'
                          ' metrics', metavar='')
-    cparser.add_argument('--mode', default=default_workload,
+    cparser.add_argument('--mode', default=workload,
                          help='space separated list of types of workloads to run (available: data-gen, train, predict)',
-                         metavar='', choices=default_workload, nargs='+')
+                         metavar='', choices=workload, nargs='+')
 
     # Args is a namespace
     args = cparser.parse_args()
@@ -297,6 +305,11 @@ if __name__ == '__main__':
     # Debug arguments
     # print(arg_dict)
 
+    # --mat-type validity: at most two values may be given
+    if len(args.mat_type) > 2:
+        print('length of --mat-type argument cannot be greater than two')
+        sys.exit()
+
     # Check for validity of input arguments
     if args.family is not None:
         for fam in args.family:

http://git-wip-us.apache.org/repos/asf/systemml/blob/cd1ae5b4/scripts/perftest/python/train.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/train.py b/scripts/perftest/python/train.py
index 1ab2880..627ba03 100755
--- a/scripts/perftest/python/train.py
+++ b/scripts/perftest/python/train.py
@@ -21,10 +21,8 @@
 #-------------------------------------------------------------
 
 import sys
-import glob
-import os
 from os.path import join
-from utils import config_writer
+from utils import config_writer, relevant_folders, mat_type_check
 from functools import reduce
 
 # Contains configuration setting for training
@@ -48,8 +46,8 @@ def binomial_m_svm_train(save_folder_name, datagen_dir, train_dir):
         model = join(full_path_train, 'model.data')
         Log = join(full_path_train, 'Log.data')
 
-        config = dict(X=X, Y=Y, icpt=icpt, classes=2, reg=reg, tol=tol, maxiter=maxiter, model=model,
-                      Log=Log, fmt=DATA_FORMAT)
+        config = dict(X=X, Y=Y, icpt=icpt, classes=2, reg=reg, tol=tol, maxiter=maxiter,
+                      model=model, Log=Log, fmt=DATA_FORMAT)
         config_writer(full_path_train + '.json', config)
 
     return data_folders
@@ -117,8 +115,8 @@ def multinomial_m_svm_train(save_folder_name, datagen_dir, train_dir):
         model = join(full_path_train, 'model.data')
         Log = join(full_path_train, 'Log.data')
 
-        config = dict(X=X, Y=Y, icpt=icpt, classes=150, reg=reg, tol=tol, maxiter=maxiter, model=model,
-                      Log=Log, fmt=DATA_FORMAT)
+        config = dict(X=X, Y=Y, icpt=icpt, classes=150, reg=reg, tol=tol, maxiter=maxiter,
+                      model=model, Log=Log, fmt=DATA_FORMAT)
         config_writer(full_path_train + '.json', config)
         data_folders.append(full_path_train)
 
@@ -358,7 +356,7 @@ def regression2_glm_poisson_train(save_folder_name, datagen_dir, train_dir):
     return data_folders
 
 
-def config_packets_train(algo_payload, datagen_dir, train_dir):
+def config_packets_train(algo_payload, matrix_type, matrix_shape, datagen_dir, train_dir, dense_algos):
     """
     This function has two responsibilities. Generate the configuration files for
     input training algorithms and return a dictionary that will be used for execution.
@@ -367,39 +365,45 @@ def config_packets_train(algo_payload, datagen_dir, train_dir):
     The first tuple index contains algorithm name and the second index contains
     family type.
 
+    matrix_type: String
+    Type of matrix to generate e.g dense, sparse, all
+
+    matrix_shape: String
+    Shape of matrix to generate e.g 100k_10
+
     datagen_dir: String
     Path of the data generation directory
 
     train_dir: String
     Path of the training directory
 
+    dense_algos: List
+    Algorithms that support only dense matrix type
+
     return: {string: list}
     This dictionary contains algorithms to be executed as keys and the path of configuration
     json files to be executed list of values.
-
     """
 
     config_bundle = {}
 
-    for k, v in algo_payload:
+    for k, _ in algo_payload:
         config_bundle[k] = []
 
     for current_algo, current_family in algo_payload:
-        data_gen_path = join(datagen_dir, current_family)
-        data_gen_subdir = glob.glob(data_gen_path + "*")
-
-        # Filter for specific data gen
-        data_gen_folders = list(filter(lambda x: os.path.isdir(x), data_gen_subdir))
+        current_matrix_type = mat_type_check(current_family, matrix_type, dense_algos)
+        data_gen_folders = relevant_folders(datagen_dir, current_algo, current_family,
+                                            current_matrix_type, matrix_shape, 'data-gen')
         if len(data_gen_folders) == 0:
             print('datagen folders not present for {}'.format(current_family))
             sys.exit()
 
-        for current_folder in data_gen_folders:
-            file_path_last = current_folder.split('/')[-1]
+        for current_datagen_dir in data_gen_folders:
+            file_path_last = current_datagen_dir.split('/')[-1]
             save_name = '.'.join([current_algo] + [file_path_last])
             algo_func = '_'.join([current_family] + [current_algo.lower().replace('-', '_')]
                                  + ['train'])
-            conf_path = globals()[algo_func](save_name, current_folder, train_dir)
+            conf_path = globals()[algo_func](save_name, current_datagen_dir, train_dir)
             config_bundle[current_algo].append(conf_path)
 
     config_packets = {}

http://git-wip-us.apache.org/repos/asf/systemml/blob/cd1ae5b4/scripts/perftest/python/utils.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/utils.py b/scripts/perftest/python/utils.py
index 464d7f6..4bba34f 100755
--- a/scripts/perftest/python/utils.py
+++ b/scripts/perftest/python/utils.py
@@ -27,11 +27,14 @@ import subprocess
 import shlex
 import re
 import logging
+import sys
+import glob
+from functools import reduce
 
 # This file contains all the utility functions required for performance test module
 
 
-def get_families(current_algo, ML_ALGO):
+def get_families(current_algo, ml_algo):
     """
     Given current algorithm we get its families.
 
@@ -46,7 +49,7 @@ def get_families(current_algo, ML_ALGO):
     """
 
     family_list = []
-    for family, algos in ML_ALGO.items():
+    for family, algos in ml_algo.items():
         if current_algo in algos:
             family_list.append(family)
     return family_list
@@ -138,7 +141,7 @@ def get_existence(path, action_mode):
     return exist
 
 
-def exec_dml_and_parse_time(exec_type, dml_file_name, execution_output_file, args, Time=True):
+def exec_dml_and_parse_time(exec_type, dml_file_name, execution_output_file, args, time=True):
     """
     This function is responsible of execution of input arguments via python sub process,
     We also extract time obtained from the output of this subprocess
@@ -181,7 +184,7 @@ def exec_dml_and_parse_time(exec_type, dml_file_name, execution_output_file, arg
     proc1 = subprocess.Popen(shlex.split(cmd_string), stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
 
-    if Time:
+    if time:
         proc1_log = []
         while proc1.poll() is None:
             raw_std_out = proc1.stdout.readline()
@@ -189,7 +192,7 @@ def exec_dml_and_parse_time(exec_type, dml_file_name, execution_output_file, arg
             proc1_log.append(decode_raw)
             logging.log(10, decode_raw)
 
-        out1, err1 = proc1.communicate()
+        _, err1 = proc1.communicate()
 
         if "Error" in str(err1):
             print('Error Found in {}'.format(dml_file_name))
@@ -197,9 +200,9 @@ def exec_dml_and_parse_time(exec_type, dml_file_name, execution_output_file, arg
         else:
             total_time = parse_time(proc1_log)
 
-        with open(execution_output_file, 'w') as f:
+        with open(execution_output_file, 'w') as file:
             for row in proc1_log:
-                f.write("%s\n" % str(row))
+                file.write("%s\n" % str(row))
 
     else:
         total_time = 'not_specified'
@@ -253,20 +256,18 @@ def exec_test_data(exec_type, path):
     exec_dml_and_parse_time(exec_type, test_split_script, config_file_name, args, False)
 
 
-def check_predict(current_algo, ML_PREDICT):
+def check_predict(current_algo, ml_predict):
     """
     To check if the current algorithm requires to run the predict
 
     current_algo: String
     Algorithm being processed
 
-    ML_PREDICT: Dictionary
+    ml_predict: Dictionary
     Key value pairs of algorithm and predict file to process
     """
-    if current_algo in ML_PREDICT.keys():
+    if current_algo in ml_predict.keys():
         return True
-    else:
-        return False
 
 
 def get_folder_metrics(folder_name, action_mode):
@@ -301,4 +302,89 @@ def get_folder_metrics(folder_name, action_mode):
     except IndexError:
         intercept = 'none'
 
-    return mat_type, mat_shape, intercept
\ No newline at end of file
+    return mat_type, mat_shape, intercept
+
+
+def mat_type_check(current_family, matrix_types, dense_algos):
+    """
+    Resolve the matrix types requested on the command line into the concrete
+    types ('dense' / 'sparse') that apply to one algorithm family.
+
+    current_family: String
+    Family currently being processed
+
+    matrix_types: List
+    Requested matrix types; each entry is 'dense', 'sparse' or 'all'.
+    Any other entry is ignored.
+
+    dense_algos: List
+    Families that support only the dense matrix type
+
+    return: List
+    Concrete matrix types for the family, in first-requested order and
+    without duplicates (e.g. ['dense', 'all'] yields ['dense', 'sparse']
+    rather than ['dense', 'dense', 'sparse']).
+
+    Exits through sys.exit when 'sparse' is requested explicitly for a
+    family listed in dense_algos.
+    """
+    dense_only = current_family in dense_algos
+    if dense_only and any(requested == 'sparse' for requested in matrix_types):
+        sys.exit('{} does not support {} matrix type'.format(current_family, 'sparse'))
+
+    expansion = {'dense': ['dense'],
+                 'sparse': ['sparse'],
+                 'all': ['dense'] if dense_only else ['dense', 'sparse']}
+
+    current_type = []
+    for requested in matrix_types:
+        for concrete in expansion.get(requested, []):
+            if concrete not in current_type:
+                current_type.append(concrete)
+    return current_type
+
+
+def relevant_folders(path, algo, family, matrix_type, matrix_shape, mode):
+    """
+    Finds the folders to read data from, based on the given parameters
+
+    path: String
+    Location of data-gen and training folders
+
+    algo: String
+    Current algorithm being processed by this function
+
+    family: String
+    Current family being processed by this function
+
+    matrix_type: List
+    Concrete matrix types to look up, e.g dense, sparse
+
+    matrix_shape: List
+    Dimensions of the input matrix with rows and columns
+
+    mode: String
+    'data-gen' matches <path>/<family>.<type>.<shape>* and
+    'train' matches <path>/<algo>.<family>.<type>.<shape>*
+
+    return: List
+    Flat list of matching directories; empty when nothing matches or when
+    matrix_type or matrix_shape is empty
+    """
+    if mode == 'data-gen':
+        name_parts = [family]
+    elif mode == 'train':
+        name_parts = [algo, family]
+    else:
+        # Previously an unknown mode failed on an unbound local variable
+        raise ValueError('unsupported mode: {}'.format(mode))
+
+    folders = []
+    for current_matrix_type in matrix_type:
+        for current_matrix_shape in matrix_shape:
+            prefix = '.'.join(name_parts + [current_matrix_type, current_matrix_shape])
+            # Trailing wildcard also matches folder names that continue past the shape
+            matches = glob.glob(join(path, prefix) + '*')
+            # extend keeps the result flat, so no reduce over a possibly empty list
+            folders.extend(x for x in matches if os.path.isdir(x))
+    return folders