You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2022/08/04 21:57:10 UTC

[systemds] branch main updated (75385a9493 -> 469d158568)

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


    from 75385a9493 [SYSTEMDS-3390] Improve performance of countDistinctApprox()
     new 45d97202f6 [MINOR] Fix builtin/parser datatype and size propagation issues
     new 469d158568 [SYSTEMDS-3414] Fix allocation of large, multi-array dense matrices

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 scripts/builtin/executePipeline.dml                            |  8 ++++----
 scripts/builtin/imputeByFD.dml                                 |  6 +++---
 .../org/apache/sysds/parser/BuiltinFunctionExpression.java     |  5 +++--
 .../sysds/runtime/controlprogram/caching/ByteBuffer.java       |  5 +++--
 .../java/org/apache/sysds/runtime/data/DenseBlockLBool.java    |  2 +-
 .../java/org/apache/sysds/runtime/data/DenseBlockLDRB.java     | 10 ++++++++--
 6 files changed, 22 insertions(+), 14 deletions(-)


[systemds] 02/02: [SYSTEMDS-3414] Fix allocation of large, multi-array dense matrices

Posted by mb...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit 469d15856878a3de56b5da98dfe8513a9461ccd1
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Thu Aug 4 23:33:12 2022 +0200

    [SYSTEMDS-3414] Fix allocation of large, multi-array dense matrices
    
    This patch fixes the allocation logic for large dense matrix blocks
    which internally use multiple arrays of size <= int_max in a row-aligned
    manner. While this works well if the row-alignment causes enough head
    room to max_integer, the actual maximum allocation is platform-specific.
    For example, on the following scenario it failed with
    OutOfMemoryError: Requested array size exceeds VM limit:
    
    INT_MAX:   2147483648
    ALLOCATED: 2147483646 --> error despite <= int_max
    
    We now leave explicit head room of 8 cells for platform-specific padding
    to prevent such errors. The same bound is also applied to the byte
    buffers in the lazy write cache (buffer pool).
---
 .../sysds/runtime/controlprogram/caching/ByteBuffer.java       |  5 +++--
 .../java/org/apache/sysds/runtime/data/DenseBlockLBool.java    |  2 +-
 .../java/org/apache/sysds/runtime/data/DenseBlockLDRB.java     | 10 ++++++++--
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/ByteBuffer.java b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/ByteBuffer.java
index a0c699c268..0466035998 100644
--- a/src/main/java/org/apache/sysds/runtime/controlprogram/caching/ByteBuffer.java
+++ b/src/main/java/org/apache/sysds/runtime/controlprogram/caching/ByteBuffer.java
@@ -25,6 +25,7 @@ import java.io.DataInputStream;
 import java.io.DataOutput;
 import java.io.IOException;
 
+import org.apache.sysds.runtime.data.DenseBlockLDRB;
 import org.apache.sysds.runtime.matrix.data.FrameBlock;
 import org.apache.sysds.runtime.matrix.data.MatrixBlock;
 import org.apache.sysds.runtime.util.LocalFileUtils;
@@ -36,7 +37,7 @@ import org.apache.sysds.runtime.util.LocalFileUtils;
  */
 public class ByteBuffer
 {
-	private volatile boolean _serialized;	
+	private volatile boolean _serialized;
 	private volatile boolean _shallow;
 	private volatile boolean _matrix;
 	private final long _size;
@@ -167,7 +168,7 @@ public class ByteBuffer
 		if( !cb.isShallowSerialize(true) ) { //SPARSE matrix blocks
 			// since cache blocks are serialized into a byte representation
 			// the buffer buffer can hold at most 2GB in size 
-			return ( size <= Integer.MAX_VALUE );
+			return ( size <= DenseBlockLDRB.MAX_ALLOC );
 		}
 		else {//DENSE/SPARSE matrix / frame blocks
 			// for dense and under special conditions also sparse matrix blocks 
diff --git a/src/main/java/org/apache/sysds/runtime/data/DenseBlockLBool.java b/src/main/java/org/apache/sysds/runtime/data/DenseBlockLBool.java
index 705f894241..1282d98bf2 100644
--- a/src/main/java/org/apache/sysds/runtime/data/DenseBlockLBool.java
+++ b/src/main/java/org/apache/sysds/runtime/data/DenseBlockLBool.java
@@ -74,7 +74,7 @@ public class DenseBlockLBool extends DenseBlockLDRB
 		// Special implementation to make computeNnz fast if complete block is read
 		boolean bv = v != 0;
 		long dataLength = (long) rlen * odims[0];
-		int newBlockSize = Math.min(rlen, Integer.MAX_VALUE / odims[0]);
+		int newBlockSize = Math.min(rlen, MAX_ALLOC / odims[0]);
 		int numBlocks = UtilFunctions.toInt(Math.ceil((double) rlen / newBlockSize));
 		if (_blen == newBlockSize && dataLength <= capacity()) {
 			for (int i = 0; i < numBlocks; i++) {
diff --git a/src/main/java/org/apache/sysds/runtime/data/DenseBlockLDRB.java b/src/main/java/org/apache/sysds/runtime/data/DenseBlockLDRB.java
index 339dbb5069..7b23c4c4a9 100644
--- a/src/main/java/org/apache/sysds/runtime/data/DenseBlockLDRB.java
+++ b/src/main/java/org/apache/sysds/runtime/data/DenseBlockLDRB.java
@@ -33,6 +33,12 @@ public abstract class DenseBlockLDRB extends DenseBlock
 {
 	private static final long serialVersionUID = -7519435549328146356L;
 
+	// On same platforms, allocating arrays very close to INT_MAX runs into
+	// a "java.lang.OutOfMemoryError: Requested array size exceeds VM limit"
+	// Also, this normally does not happen because we allocate row-aligned
+	// chunks, to be on the safe side, we keep a margin of 8 for padding.
+	public static int MAX_ALLOC = Integer.MAX_VALUE - 8;
+	
 	protected int _blen;
 
 	protected DenseBlockLDRB(int[] dims) {
@@ -49,7 +55,7 @@ public abstract class DenseBlockLDRB extends DenseBlock
 
 	@Override
 	public int blockSize() {
-	    return _blen;
+		return _blen;
 	}
 
 	@Override
@@ -60,7 +66,7 @@ public abstract class DenseBlockLDRB extends DenseBlock
 	@Override
 	public void reset(int rlen, int[] odims, double v) {
 		long dataLength = (long) rlen * odims[0];
-		int newBlockSize = Math.min(rlen, Integer.MAX_VALUE / odims[0]);
+		int newBlockSize = Math.min(rlen, MAX_ALLOC / odims[0]);
 		int numBlocks = UtilFunctions.toInt(Math.ceil((double) rlen / newBlockSize));
 		if (_blen == newBlockSize && dataLength <= capacity()) {
 			IntStream.range(0, numBlocks)


[systemds] 01/02: [MINOR] Fix builtin/parser datatype and size propagation issues

Posted by mb...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git

commit 45d97202f64b00adb3a12c54988b185b6f12da27
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Thu Aug 4 22:42:19 2022 +0200

    [MINOR] Fix builtin/parser datatype and size propagation issues
---
 scripts/builtin/executePipeline.dml                               | 8 ++++----
 scripts/builtin/imputeByFD.dml                                    | 6 +++---
 .../java/org/apache/sysds/parser/BuiltinFunctionExpression.java   | 5 +++--
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/scripts/builtin/executePipeline.dml b/scripts/builtin/executePipeline.dml
index d6729e72c5..cfd1899d96 100644
--- a/scripts/builtin/executePipeline.dml
+++ b/scripts/builtin/executePipeline.dml
@@ -386,13 +386,13 @@ return (Matrix[Double] X, Matrix[Double] Y)
       synthesized = matrix(0,0,0) # initialize variable
       start_class = 1
       end_class = 0
-      k = table(XY[, 1], 1)
-      getMax = max(k)
-      maxKIndex = as.scalar(rowIndexMax(t(k)))
+      kmat = table(XY[, 1], 1)
+      getMax = max(kmat)
+      maxKIndex = as.scalar(rowIndexMax(t(kmat)))
       outSet = matrix(0, 0, ncol(XY))
       remainingRatio = ifelse((remainingRatio%%100) >= 50, remainingRatio+(100 - (remainingRatio%%100)),
       remainingRatio-(remainingRatio%%100))
-      for(i in 1: nrow(k), check=0) {
+      for(i in 1: nrow(kmat), check=0) {
         end_class = end_class + as.scalar(classes[i])
         class_t = XY[start_class:end_class, ]
         if((i != maxKIndex) & (nrow(class_t) > 1)) {
diff --git a/scripts/builtin/imputeByFD.dml b/scripts/builtin/imputeByFD.dml
index 2f078c056a..8b3af64635 100644
--- a/scripts/builtin/imputeByFD.dml
+++ b/scripts/builtin/imputeByFD.dml
@@ -42,10 +42,10 @@ m_imputeByFD = function(Matrix[Double] X, Matrix[Double] Y, Double threshold, Bo
   if( threshold < 0 | threshold > 1 )
     stop("Stopping due to invalid input, threshold required in interval [0, 1] found "+threshold)
 
-  if(min(X) < 1 | min(Y) < 1)
-  {
+  if(min(X) < 1 | min(Y) < 1) {
     print("imputeByFD: source or target contain values less than 1")
-    
+    Y = matrix(0, 1, 1);
+    Y_imp = matrix(0, 1, 1);
   }
   else {
     # impute missing values and fix errors
diff --git a/src/main/java/org/apache/sysds/parser/BuiltinFunctionExpression.java b/src/main/java/org/apache/sysds/parser/BuiltinFunctionExpression.java
index 0da5609afd..c5db658dfa 100644
--- a/src/main/java/org/apache/sysds/parser/BuiltinFunctionExpression.java
+++ b/src/main/java/org/apache/sysds/parser/BuiltinFunctionExpression.java
@@ -683,11 +683,12 @@ public class BuiltinFunctionExpression extends DataIdentifier
 			// cumsum(X);
 			checkNumParameters(1);
 			checkMatrixParam(getFirstExpr());
-			if( getOpCode() == Builtins.CUMSUMPROD && id.getDim2() > 2 )
+			boolean cumSP = getOpCode() == Builtins.CUMSUMPROD;
+			if( cumSP && id.getDim2() > 2 )
 				raiseValidateError("Cumsumprod only supported over two-column matrices", conditional);
 			
 			output.setDataType(DataType.MATRIX);
-			output.setDimensions(id.getDim1(), id.getDim2());
+			output.setDimensions(id.getDim1(), cumSP ? 1 : id.getDim2());
 			output.setBlocksize (id.getBlocksize());
 			output.setValueType(id.getValueType());