You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ss...@apache.org on 2021/07/05 15:19:52 UTC
[systemds] branch master updated: [MINOR] Adding null-mask in correctTypos for removing missing values from lookup dictionary.
This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 8948ff5 [MINOR] Adding null-mask in correctTypos for removing missing values from lookup dictionary.
8948ff5 is described below
commit 8948ff59d5a20e52c2f14ed26ad3c66a21efcce5
Author: Shafaq Siddiqi <sh...@tugraz.at>
AuthorDate: Mon Jul 5 17:05:10 2021 +0200
[MINOR] Adding null-mask in correctTypos for removing missing values from lookup dictionary.
---
scripts/builtin/correctTypos.dml | 14 +++++++++-----
src/test/scripts/functions/builtin/correct_typos.dml | 3 ++-
2 files changed, 11 insertions(+), 6 deletions(-)
diff --git a/scripts/builtin/correctTypos.dml b/scripts/builtin/correctTypos.dml
index 1fdefb3..9f95a4d 100644
--- a/scripts/builtin/correctTypos.dml
+++ b/scripts/builtin/correctTypos.dml
@@ -53,13 +53,12 @@
# TODO: future: add parameter for list of words that are sure to be correct
-s_correctTypos = function(Frame[String] strings, Double frequency_threshold=0.05, Integer distance_threshold=2,
+s_correctTypos = function(Frame[String] strings, Matrix[Double] nullMask, Double frequency_threshold=0.05, Integer distance_threshold=2,
Boolean decapitalize=TRUE, Boolean correct=TRUE, Boolean is_verbose=FALSE)
return (Frame[String] Y)
{
if(is_verbose)
print ("BEGIN CORRECT-TYPOS SCRIPT");
-
num_strings = length(strings);
if(is_verbose)
@@ -67,16 +66,21 @@ s_correctTypos = function(Frame[String] strings, Double frequency_threshold=0.05
if (decapitalize)
strings = map(strings, "s -> s.toLowerCase()");
+
+ if(nrow(strings) != nrow(nullMask) | ncol(strings) != ncol(nullMask))
+ stop("Dimension mismatch: data dimensions do not match with mask dimensions")
Y = strings
# build dictionary
current_string = as.scalar(strings[1]);
- dict = cbind(as.frame(current_string), as.frame(1));
+ dict = cbind(as.frame(""), as.frame(1));
- for (i in 2:num_strings) {
+ for (i in 1:num_strings) {
current_string = as.scalar(strings[i]);
- dict = insertOrIncrement(current_string, dict);
+ if(as.scalar(nullMask[i]) == 0)
+ dict = insertOrIncrement(current_string, dict);
}
+ dict = dict[2:nrow(dict),]
strings = dict[,1];
frequencies = as.matrix(dict[,2]) / num_strings;
lengths = as.matrix(map(strings, "s -> s.length()"));
diff --git a/src/test/scripts/functions/builtin/correct_typos.dml b/src/test/scripts/functions/builtin/correct_typos.dml
index 02fe42a..2b8bd3c 100644
--- a/src/test/scripts/functions/builtin/correct_typos.dml
+++ b/src/test/scripts/functions/builtin/correct_typos.dml
@@ -20,5 +20,6 @@
#-------------------------------------------------------------
X = read($X, data_type="frame", format="csv", header=FALSE);
-Y = correctTypos(X, $frequency_threshold, $distance_threshold, $decapitalize, $correct, $is_verbose);
+nullMask = matrix(0, rows=nrow(X), cols=ncol(X))
+Y = correctTypos(X, nullMask, $frequency_threshold, $distance_threshold, $decapitalize, $correct, $is_verbose);
write(Y, $Y, format="csv")