You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2021/09/15 12:59:57 UTC

[systemds] branch master updated: [SYSTEMDS-3134] Fix robustness transformapply for unknown categories

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new adb8af1  [SYSTEMDS-3134] Fix robustness transformapply for unknown categories
adb8af1 is described below

commit adb8af1d5f490d58635c6e27b55cc0dd00b80a43
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Wed Sep 15 14:39:23 2021 +0200

    [SYSTEMDS-3134] Fix robustness transformapply for unknown categories
    
    This patch fixes an issue in the cleaning pipeline enumeration where
    transformapply corrupted the output sparse matrix with negative column
    indexes, which then produced index-out-of-bounds exceptions during sparse
    operations. We now handle these unknown categories gracefully, but
    additional work is needed to set the outputs by position.
---
 .../java/org/apache/sysds/runtime/data/SparseBlockMCSR.java   | 10 ++++++----
 .../java/org/apache/sysds/runtime/data/SparseRowVector.java   |  5 +++++
 .../runtime/transform/encode/ColumnEncoderDummycode.java      | 11 ++++++++---
 .../pipelines/BuiltinTopkCleaningClassificationTest.java      |  2 --
 4 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/data/SparseBlockMCSR.java b/src/main/java/org/apache/sysds/runtime/data/SparseBlockMCSR.java
index 159e581..a733ea9 100644
--- a/src/main/java/org/apache/sysds/runtime/data/SparseBlockMCSR.java
+++ b/src/main/java/org/apache/sysds/runtime/data/SparseBlockMCSR.java
@@ -195,7 +195,7 @@ public class SparseBlockMCSR extends SparseBlock
 			int[] aix = indexes(i);
 			double[] avals = values(i);
 			for (int k = apos + 1; k < apos + alen; k++) {
-				if (aix[k-1] >= aix[k])
+				if (aix[k-1] >= aix[k] | aix[k-1] < 0 )
 					throw new RuntimeException("Wrong sparse row ordering, at row="+i+", pos="+k
 						+ " with column indexes " + aix[k-1] + ">=" + aix[k]);
 				if (avals[k] == 0)
@@ -205,10 +205,12 @@ public class SparseBlockMCSR extends SparseBlock
 		}
 
 		//3. A capacity that is no larger than nnz times resize factor
-		for( int i=0; i<rlen; i++ )
-			if( !isEmpty(i) && values(i).length > nnz*RESIZE_FACTOR1 )
+		for( int i=0; i<rlen; i++ ) {
+			long max_size = (long)Math.max(nnz*RESIZE_FACTOR1, INIT_CAPACITY);
+			if( !isEmpty(i) && values(i).length > max_size )
 				throw new RuntimeException("The capacity is larger than nnz times a resize factor(=2). "
-					+ "Actual length = " + values(i).length+", should not exceed "+nnz*RESIZE_FACTOR1);
+					+ "Actual length = " + values(i).length+", should not exceed "+max_size);
+		}
 
 		return true;
 	}
diff --git a/src/main/java/org/apache/sysds/runtime/data/SparseRowVector.java b/src/main/java/org/apache/sysds/runtime/data/SparseRowVector.java
index 38a9aba..6d67707 100644
--- a/src/main/java/org/apache/sysds/runtime/data/SparseRowVector.java
+++ b/src/main/java/org/apache/sysds/runtime/data/SparseRowVector.java
@@ -195,6 +195,11 @@ public final class SparseRowVector extends SparseRow{
 		return true; // nnz++
 	}
 	
+	public void setAtPos(int pos, int col, double v) {
+		indexes[pos] = col;
+		values[pos] = v;
+	}
+	
 	@Override
 	public boolean add(int col, double v) {
 		//early abort on zero (if no overwrite)
diff --git a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderDummycode.java b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderDummycode.java
index 1047f54..3643d00 100644
--- a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderDummycode.java
+++ b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderDummycode.java
@@ -75,12 +75,17 @@ public class ColumnEncoderDummycode extends ColumnEncoder {
 		for(int i = rowStart; i < getEndIndex(in.getNumRows(), rowStart, blk); i++) {
 			// Using outputCol here as index since we have a MatrixBlock as input where dummycoding could have been
 			// applied in a previous encoder
+			// FIXME: we need a clear way of separating input/output (org input, pre-allocated output)
+			// need input index to avoid inconsistencies; also need to set by position not binarysearch
 			double val = in.quickGetValueThreadSafe(i, outputCol);
 			int nCol = outputCol + (int) val - 1;
-			// Setting value to 0 first in case of sparse so the row vector does not need to be resized
-			if(nCol != outputCol)
+			// Set value, w/ robustness for val=NaN (unknown categories)
+			if( nCol >= 0 && !Double.isNaN(val) ) { // filter unknowns
+				out.quickSetValue(i, outputCol, 0); //FIXME remove this workaround (see above)
+				out.quickSetValue(i, nCol, 1);
+			}
+			else
 				out.quickSetValue(i, outputCol, 0);
-			out.quickSetValue(i, nCol, 1);
 		}
 		if (DMLScript.STATISTICS)
 			Statistics.incTransformDummyCodeApplyTime(System.nanoTime()-t0);
diff --git a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
index 0c91513..47e1347 100644
--- a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
@@ -45,8 +45,6 @@ public class BuiltinTopkCleaningClassificationTest extends AutomatedTestBase {
 		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"R"}));
 	}
 
-	// TODO fixing ArrayIndexOutOfBounds exception
-	@Ignore
 	public void testFindBestPipelineCompany() {
 		runtopkCleaning(DATA_DIR+ "company.csv", RESOURCE+ "meta/meta_company.csv", 1.0, 3,5,
 			"FALSE", 0,0.8, Types.ExecMode.SINGLE_NODE);