You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ss...@apache.org on 2021/07/05 15:19:52 UTC

[systemds] branch master updated: [MINOR] Adding null-mask in correctTypos for removing missing values from lookup dictionary.

This is an automated email from the ASF dual-hosted git repository.

ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new 8948ff5  [MINOR] Adding null-mask in correctTypos for removing missing values from lookup dictionary.
8948ff5 is described below

commit 8948ff59d5a20e52c2f14ed26ad3c66a21efcce5
Author: Shafaq Siddiqi <sh...@tugraz.at>
AuthorDate: Mon Jul 5 17:05:10 2021 +0200

    [MINOR] Adding null-mask in correctTypos for removing missing values from lookup dictionary.
---
 scripts/builtin/correctTypos.dml                     | 14 +++++++++-----
 src/test/scripts/functions/builtin/correct_typos.dml |  3 ++-
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/scripts/builtin/correctTypos.dml b/scripts/builtin/correctTypos.dml
index 1fdefb3..9f95a4d 100644
--- a/scripts/builtin/correctTypos.dml
+++ b/scripts/builtin/correctTypos.dml
@@ -53,13 +53,12 @@
 
 # TODO: future: add parameter for list of words that are sure to be correct
 
-s_correctTypos = function(Frame[String] strings, Double frequency_threshold=0.05, Integer distance_threshold=2,
+s_correctTypos = function(Frame[String] strings, Matrix[Double] nullMask, Double frequency_threshold=0.05, Integer distance_threshold=2,
     Boolean decapitalize=TRUE, Boolean correct=TRUE, Boolean is_verbose=FALSE)
   return (Frame[String] Y)
 {
   if(is_verbose)
     print ("BEGIN CORRECT-TYPOS SCRIPT");
-
   num_strings = length(strings);
 
   if(is_verbose)
@@ -67,16 +66,21 @@ s_correctTypos = function(Frame[String] strings, Double frequency_threshold=0.05
 
   if (decapitalize)
     strings = map(strings, "s -> s.toLowerCase()");
+    
+  if(nrow(strings) != nrow(nullMask) | ncol(strings) != ncol(nullMask))
+    stop("Dimension mismatch: data dimensions do not match with mask dimensions")
   Y = strings
 
   # build dictionary
   current_string = as.scalar(strings[1]);
-  dict = cbind(as.frame(current_string), as.frame(1));
+  dict = cbind(as.frame(""), as.frame(1));
 
-  for (i in 2:num_strings) {
+  for (i in 1:num_strings) {
     current_string = as.scalar(strings[i]);
-    dict = insertOrIncrement(current_string, dict);
+    if(as.scalar(nullMask[i]) == 0)
+      dict = insertOrIncrement(current_string, dict);
   }
+  dict = dict[2:nrow(dict),]
   strings = dict[,1];
   frequencies = as.matrix(dict[,2]) / num_strings;
   lengths = as.matrix(map(strings, "s -> s.length()"));
diff --git a/src/test/scripts/functions/builtin/correct_typos.dml b/src/test/scripts/functions/builtin/correct_typos.dml
index 02fe42a..2b8bd3c 100644
--- a/src/test/scripts/functions/builtin/correct_typos.dml
+++ b/src/test/scripts/functions/builtin/correct_typos.dml
@@ -20,5 +20,6 @@
 #-------------------------------------------------------------
 
 X = read($X, data_type="frame", format="csv", header=FALSE);
-Y = correctTypos(X, $frequency_threshold, $distance_threshold, $decapitalize, $correct, $is_verbose);
+nullMask = matrix(0, rows=nrow(X), cols=ncol(X))
+Y = correctTypos(X, nullMask, $frequency_threshold, $distance_threshold, $decapitalize, $correct, $is_verbose);
 write(Y, $Y, format="csv")