You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ss...@apache.org on 2021/08/03 15:15:28 UTC
[systemds] branch master updated: [MINOR] Cleanup builtin tomeklink (vectorizing, formatting)
This is an automated email from the ASF dual-hosted git repository.
ssiddiqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 77e861a [MINOR] Cleanup builtin tomeklink (vectorizing, formatting)
77e861a is described below
commit 77e861afca946702019e459df6d308ae84189718
Author: Shafaq Siddiqi <sh...@tugraz.at>
AuthorDate: Tue Aug 3 17:14:43 2021 +0200
[MINOR] Cleanup builtin tomeklink (vectorizing, formatting)
---
scripts/builtin/tomeklink.dml | 68 +++++++++++++++----------------------------
1 file changed, 24 insertions(+), 44 deletions(-)
diff --git a/scripts/builtin/tomeklink.dml b/scripts/builtin/tomeklink.dml
index 71b2a75..3b4fc6f 100644
--- a/scripts/builtin/tomeklink.dml
+++ b/scripts/builtin/tomeklink.dml
@@ -28,7 +28,7 @@
# NAME TYPE DEFAULT MEANING
# ---------------------------------------------------------------------------------------------
# X MATRIX --- Data Matrix (nxm)
-# y MATRIX --- Label Matrix (nx1)
+# y MATRIX --- Label Matrix (nx1), greater than zero
# ---------------------------------------------------------------------------------------------
# OUTPUT:
# X_under - Data Matrix without Tomek links
@@ -36,37 +36,31 @@
# drop_idx - Indices of dropped rows/labels wrt input
-###### MAIN PART ######
-
m_tomeklink = function(Matrix[Double] X, Matrix[Double] y)
- return (Matrix[Double] X_under, Matrix[Double] y_under, Matrix[Double] drop_idx) {
- majority_label = 0
- n = nrow(X)
- m = ncol(X)
+return (Matrix[Double] X_under, Matrix[Double] y_under, Matrix[Double] drop_idx) {
+
+ ymin = min(y)
+ if(ymin == 0)
+ y = y + 1
+
+ # # find the majority labels
+ label = table(y, 1)
+ majority_label = as.scalar(rowIndexMax(t(label)))
tomek_links = get_links(X, y, majority_label)
+ drop_idx = tomek_links * seq(1, nrow(X))
+ X_under = removeEmpty(target=X, margin="rows", select = (tomek_links == 0))
+ y_under = removeEmpty(target=y, margin="rows", select = (tomek_links == 0))
+ drop_idx = removeEmpty(target=drop_idx, margin="rows", select = tomek_links)
+ if(ymin)
+ y = y - 1
- X_under = matrix(0, rows = 0, cols = m)
- y_under = matrix(0, rows = 0, cols = 1)
- drop_idx = matrix(0, rows = 0, cols = 1)
-
- for (i in 1:nrow(X)) {
- is_link = as.scalar(tomek_links[i, 1])
- if (is_link == 1) {
- X_under = rbind(X_under, X[i,])
- y_under = rbind(y_under, y[i,])
- drop_idx = rbind(drop_idx, matrix(i, rows = 1, cols = 1))
- }
- }
}
-###### END MAIN PART ######
-
-###### UTILS ######
-# nearest nb function ----------------------------------------------------------
+# get the nearest neighbour index
get_nn = function(Matrix[Double] X)
- return (Matrix[Double] nn) {
+return (Matrix[Double] nn) {
nn = matrix(0, rows = nrow(X), cols = 1)
for (i in 1:nrow(X)) {
dists = rowSums((X - X[i,])^2)
@@ -75,27 +69,13 @@ get_nn = function(Matrix[Double] X)
}
}
-# find tomek link function ----------------------------------------------------
+# find the tomek links
get_links = function(Matrix[Double] X, Matrix[Double] y, double majority_label)
- return (Matrix[Double] tomek_links) {
- tomek_links = matrix(0, rows = nrow(X), cols = 1)
+return (Matrix[Double] tomek_links) {
nn = get_nn(X)
-
- for (index in 1:nrow(X)) {
- # this is a tomek link according to R: ubTomek https://rdrr.io/cran/unbalanced/src/R/ubTomek.R
- # other sources define it as a pair of mutual nearest neighbor
- # where exactly one endpoint has the majority label
-
- nn_index = as.scalar(nn[index, 1])
- label = as.scalar(y[index, 1])
- nn_label = as.scalar(y[nn_index, 1])
-
- if (label != majority_label) {
- if (nn_label == majority_label) {
- tomek_links[nn_index, 1] = 1
- }
- }
- }
+ perm = table(seq(1, nrow(y)), nn, nrow(y), nrow(y))
+ nn_labels = perm %*% y
+ links = (y != majority_label) & (nn_labels == majority_label)
+ tomek_links = (table(nn, 1, links, nrow(y), 1) > 0)
}
-###### END UTILS ######