You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by mb...@apache.org on 2021/10/11 19:05:58 UTC

[systemds] branch master updated: [SYSTEMDS-3158] Fix value-bias in ultra-sparse random matrix generation

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new d222f35  [SYSTEMDS-3158] Fix value-bias in ultra-sparse random matrix generation
d222f35 is described below

commit d222f352e5931718ca96b0b1cfc1dcd4bfac818c
Author: Matthias Boehm <mb...@gmail.com>
AuthorDate: Mon Oct 11 21:04:21 2021 +0200

    [SYSTEMDS-3158] Fix value-bias in ultra-sparse random matrix generation
    
    For ultra-sparse random matrix generation we have two random number
    generators, one for skips to get the next position, and one for values.
    In contrast to dense and sparse random matrix generation, this means
    the number of calls to these generators is equivalent. So far, we
    initialized both PRNGs with the same seed, leading to correlated
    randomness. For ultra-sparse matrices, where only a fraction of 1k-x-1k
    blocks has values, we end up with biased values.
    
    The fix is simple: we get the first random number from the one PRNG to
    initialize the other PRNG, which keeps it deterministic for both local
    and distributed operations without direct correlation.
    
    This patch also adds related tests, where without the change the
    ultra-sparse test instances would fail.
    
    Thanks to @Baunsgaard for catching this issue.
---
 .../runtime/matrix/data/LibMatrixDatagen.java      |  6 ++-
 .../sysds/test/component/matrix/RandTest.java      | 59 ++++++++++++++++++++++
 2 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixDatagen.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixDatagen.java
index b547b4f..180208e 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixDatagen.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixDatagen.java
@@ -501,8 +501,10 @@ public class LibMatrixDatagen
 				// Initialize the PRNGenerator for determining cells that contain a non-zero value
 				// Note that, "pdf" parameter applies only to cell values and the individual cells 
 				// are always selected uniformly at random.
-				nnzPRNG.setSeed(seed);
-				
+				// Also note that we cannot use the same seed here, because for ultra-sparse generation
+				// the number of calls to the valuePRNG and nnzPRNG are the same, thus creating correlated
+				// outcomes (bias toward the end of the value range)
+				nnzPRNG.setSeed((long)(valuePRNG.nextDouble()*Long.MAX_VALUE));
 				boolean localSparse = sparsity < 1 && MatrixBlock.evalSparseFormatInMemory(
 					blockrows, blockcols, (long)(sparsity*blockrows*blockcols));
 				if ( localSparse) {
diff --git a/src/test/java/org/apache/sysds/test/component/matrix/RandTest.java b/src/test/java/org/apache/sysds/test/component/matrix/RandTest.java
new file mode 100644
index 0000000..0376d79
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/component/matrix/RandTest.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.component.matrix;
+
+import org.apache.sysds.hops.OptimizerUtils;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class RandTest {
+	@Test
+	public void dense_0_1_Test() {
+		checkRand(1000, 1000, 0.9, 0, 1, 7);
+	}
+	@Test
+	public void dense_10_100_Test() {
+		checkRand(1000, 1000, 0.9, 10, 100, 7);
+	}
+	@Test
+	public void sparse_0_1_Test() {
+		checkRand(10000, 10000, 0.01, 0, 1, 7);
+	}
+	@Test
+	public void sparse_10_100_Test() {
+		checkRand(10000, 10000, 0.01, 10, 100, 7);
+	}
+	@Test
+	public void ultrasparse_0_1_Test() {
+		checkRand(10000000, 100000, 1e-8, 0, 1, 7);
+	}
+	@Test
+	public void ultrasparse_10_100_Test() {
+		checkRand(10000000, 100000, 1e-8, 10, 100, 7);
+	}
+	private static void checkRand(int rows, int cols, double sparsity, double min, double max, int seed) {
+		MatrixBlock tmp = MatrixBlock.randOperations(rows, cols, sparsity, min, max, "uniform", seed);
+		double actual = tmp.sum();
+		double expected = (min + (max-min)/2)
+			* OptimizerUtils.getNnz(rows, cols, sparsity);
+		Assert.assertEquals(expected, actual, expected * 0.01); //1% range
+	}
+}