You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2021/09/02 22:16:14 UTC
[systemds] 02/02: [SYSTEMDS-3115,3120] Implements cleaning pipeline enumeration scripts
This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
commit 3d1ba3c3b53f48d971c358a99f7c8b3e0b609c2b
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Fri Sep 3 00:15:05 2021 +0200
[SYSTEMDS-3115,3120] Implements cleaning pipeline enumeration scripts
* Parfor parallelization of logical pipeline enumeration
* Various vectorizations in the correctTypos builtin
---
scripts/builtin/correctTypos.dml | 51 +++++---------------------
scripts/pipelines/scripts/enumerateLogical.dml | 25 ++++++-------
2 files changed, 21 insertions(+), 55 deletions(-)
diff --git a/scripts/builtin/correctTypos.dml b/scripts/builtin/correctTypos.dml
index 9f95a4d..45d3861 100644
--- a/scripts/builtin/correctTypos.dml
+++ b/scripts/builtin/correctTypos.dml
@@ -123,11 +123,7 @@ s_correctTypos = function(Frame[String] strings, Matrix[Double] nullMask, Double
A = ascii_matrix[1:as.scalar(lengths[i,1]), i];
B = ascii_matrix[1:as.scalar(lengths[j,1]), j];
d = damerauLevenshteinDistanceBound(A, B, distance_threshold, FALSE);
- if (d == -1) {
- distance_matrix[i, j] = 42000;
- } else {
- distance_matrix[i, j] = d;
- }
+ distance_matrix[i, j] = ifelse(d == -1, 42000, d);
}
}
}
@@ -178,36 +174,26 @@ s_correctTypos = function(Frame[String] strings, Matrix[Double] nullMask, Double
}
}
-
replaceStrings = function(String replacement, String to_replace, Frame[String] strings)
return(Frame[String] strings)
{
- for (i in 1:nrow(strings)) {
- if (as.scalar(strings[i,]) == to_replace) {
- strings[i,] = replacement;
- }
- }
+ strings = map(strings, "s -> s.equals(\""+to_replace+"\") ? \""+replacement+"\" : s");
}
-
insertOrIncrement = function(String str, Frame[Unknown] dict)
return(Frame[Unknown] dict)
{
i = 1;
- ret = FALSE;
break = FALSE;
while (i <= nrow(dict) & !break) {
if (as.scalar(dict[i, 1]) == str) {
- value = as.integer(as.scalar(dict[i, 2])) + 1;
- dict[i, 2] = value;
- contains = TRUE;
+ dict[i, 2] = as.frame(as.integer(as.scalar(dict[i, 2])) + 1);
break = TRUE;
}
i = i + 1;
}
- if (!break) {
+ if (!break)
dict = rbind(dict, cbind(as.frame(str), as.frame(1)));
- }
}
@@ -216,19 +202,11 @@ damerauLevenshteinDistanceBound = function(matrix[double] A, matrix[double] B, d
dl_matrix = matrix(0, rows = length(A) + 1, cols = length(B) + 1);
dl_matrix[length(A) + 1, length(B) + 1] = -1;
-
- for (j in 2:length(B) + 1) {
- dl_matrix[1, j] = j - 1;
- }
-
+ dl_matrix[1, 2:(length(B)+1)] = t(seq(2,length(B)+1) - 1);
dl_matrix[2, 1] = 1;
for (j in 2:length(B) + 1) {
- if (as.scalar(A[1]) == as.scalar(B[j - 1])) {
- cost = 0;
- } else {
- cost = 1;
- }
+ cost = as.integer(as.scalar(A[1]) != as.scalar(B[j - 1]))
dl_matrix[2, j] = min(min(
dl_matrix[2, j - 1] + 1,
dl_matrix[1, j] + 1),
@@ -241,23 +219,14 @@ damerauLevenshteinDistanceBound = function(matrix[double] A, matrix[double] B, d
i += 1;
dl_matrix[i, 1] = i - 1;
-
- if (as.scalar(A[i - 1]) == as.scalar(B[1])) {
- cost = 0;
- } else {
- cost = 1;
- }
+ cost = as.integer(as.scalar(A[i - 1]) != as.scalar(B[1]))
dl_matrix[i, 2] = min(min(
dl_matrix[i - 1, 2] + 1,
dl_matrix[i, 1] + 1),
dl_matrix[i - 1, 1] + cost);
for (j in 3:length(B) + 1) {
- if (as.scalar(A[i - 1]) == as.scalar(B[j - 1])) {
- cost = 0;
- } else {
- cost = 1;
- }
+ cost = as.integer(as.scalar(A[i - 1]) != as.scalar(B[j - 1]))
if (as.scalar(A[i - 1]) == as.scalar(B[j - 2]) & as.scalar(A[i - 2]) == as.scalar(B[j - 1])) {
dl_matrix[i, j] = min(min(
dl_matrix[i, j - 1] + 1,
@@ -272,9 +241,7 @@ damerauLevenshteinDistanceBound = function(matrix[double] A, matrix[double] B, d
}
}
- if( min(dl_matrix[i - 1, ]) > bound & min(dl_matrix[i, ]) > bound) {
- break_condition = TRUE;
- }
+ break_condition = min(dl_matrix[i - 1, ]) > bound & min(dl_matrix[i, ]) > bound;
}
if (is_verbose){
diff --git a/scripts/pipelines/scripts/enumerateLogical.dml b/scripts/pipelines/scripts/enumerateLogical.dml
index f894c4e..1133eb7 100644
--- a/scripts/pipelines/scripts/enumerateLogical.dml
+++ b/scripts/pipelines/scripts/enumerateLogical.dml
@@ -67,9 +67,9 @@ return (Frame[Unknown] bestLg, Double pre_best, Double T)
{
physicalPipList = list()
logicalPipList = list()
+
# # # get the physical instances from logical ones
- for(i in 1:nrow(population))
- {
+ for(i in 1:nrow(population)) {
lv = as.integer(as.scalar(population[i, 1])) + 1
lp = population[i, 2:lv]
physicalConf = bandit::get_physical_configurations(lp, num_inst, primitives)
@@ -77,23 +77,22 @@ return (Frame[Unknown] bestLg, Double pre_best, Double T)
logicalPipList = append(logicalPipList, lp)
}
- scores = matrix(0, rows=length(physicalPipList), cols=1)
-
# # # execute the physical pipelines
- for(i in 1:length(physicalPipList))
- {
- physicalConf = as.frame(physicalPipList[i])
- lp = as.frame(logicalPipList[i])
+ scores = matrix(0, length(physicalPipList), 1)
+ # TODO better parfor-dep handling of multi-assignments to avoid check=0
+ parfor(i in 1:length(physicalPipList), check=0) {
+ lp2 = as.frame(logicalPipList[i,1])
+ pp2 = as.frame(physicalPipList[i,1])
# # append configuration keys for extracting the pipeline later on
- id = seq(1, nrow(physicalConf))
- physicalConf = cbind(as.frame(id), physicalConf)
+ id = seq(1, nrow(pp2))
+ idpp = cbind(as.frame(id), pp2)
# # execute the physical instances and store the minimum scores, each pipeline is executed num_exec times
- [outPip,outHp, feaFrameOuter] = bandit::run_with_hyperparam(lp, physicalConf, num_exec, X, y, Xtest, ytest, metaList,
+ [outPip, outHp, feaFrameOuter] = bandit::run_with_hyperparam(lp2, idpp, num_exec, X, y, Xtest, ytest, metaList,
evaluationFunc, evalFunHp, param, as.frame(""), cv, cvk, verbose)
# # sort the configurations groupwise
- max_perf = bandit::getMaxPerConf(outPip, nrow(physicalConf))
- scores[i] = as.matrix(max_perf[1, 1])
+ max_perf = bandit::getMaxPerConf(outPip, nrow(pp2))
+ scores[i,1] = as.matrix(max_perf[1,1])
}
# # select parents and best score