You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2018/05/18 03:37:53 UTC

systemml git commit: [SYSTEMML-2329] New sampling-based mm sparsity estimator

Repository: systemml
Updated Branches:
  refs/heads/master 79a5e80f6 -> 5a155f3d2


[SYSTEMML-2329] New sampling-based mm sparsity estimator 

This patch introduces an additional baseline sparsity estimator based on
random sampling of columns from A and aligned rows from B for estimating
the output sparsity of AB.


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/5a155f3d
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/5a155f3d
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/5a155f3d

Branch: refs/heads/master
Commit: 5a155f3d2402b1d77ca90a1d024cb04d6a2ca80d
Parents: 79a5e80
Author: Matthias Boehm <mb...@gmail.com>
Authored: Thu May 17 20:39:24 2018 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu May 17 20:39:24 2018 -0700

----------------------------------------------------------------------
 .../sysml/hops/estim/EstimatorSample.java       | 111 +++++++++++++++++++
 .../estim/CompressedSizeEstimatorSample.java    |  10 +-
 .../sysml/runtime/util/UtilFunctions.java       |  14 ++-
 .../functions/estim/OuterProductTest.java       |  21 ++++
 .../functions/estim/SquaredProductTest.java     |  21 ++++
 5 files changed, 166 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/5a155f3d/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java b/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java
new file mode 100644
index 0000000..6ed918a
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.estim;
+
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
+import org.apache.sysml.runtime.matrix.data.DenseBlock;
+import org.apache.sysml.runtime.matrix.data.LibMatrixAgg;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.SparseBlock;
+import org.apache.sysml.runtime.util.UtilFunctions;
+
+/**
+ * This estimator implements an approach based on row/column sampling
+ * Yongyang Yu, MingJie Tang, Walid G. Aref, Qutaibah M. Malluhi, Mostafa M. Abbas, Mourad Ouzzani:
+ * In-Memory Distributed Matrix Computation Processing and Optimization. ICDE 2017: 1047-1058
+ * 
+ * The basic idea is to draw random samples of aligned columns SA and rows SB,
+ * and compute the output nnz as max(nnz(SA_i)*nnz(SB_i)). However, this estimator is
+ * biased toward underestimation as the maximum is unlikely sampled and collisions are
+ * not accounted for.
+ */
+public class EstimatorSample extends SparsityEstimator
+{
+	private static final double SAMPLE_FRACTION = 0.1; //10%
+	
+	private final double _frac;
+	
+	public EstimatorSample() {
+		this(SAMPLE_FRACTION);
+	}
+	
+	public EstimatorSample(double sampleFrac) {
+		if( sampleFrac < 0 || sampleFrac > 1.0 )
+			throw new DMLRuntimeException("Invalid sample fraction: "+sampleFrac);
+		_frac = sampleFrac;
+	}
+	
+	@Override
+	public double estim(MMNode root) {
+		LOG.warn("Recursive estimates not supported by EstimatorSample, falling back to EstimatorBasicAvg.");
+		return new EstimatorBasicAvg().estim(root);
+	}
+
+	@Override
+	public double estim(MatrixBlock m1, MatrixBlock m2) {
+		//get sampled indexes
+		int k = m1.getNumColumns();
+		int[] ix = UtilFunctions.getSortedSampleIndexes(
+			k, (int)Math.max(k*_frac, 1));
+		//compute output sparsity 
+		int[] cnnz = computeColumnNnz(m1, ix);
+		long nnzOut = 0;
+		for(int i=0; i<ix.length; i++)
+			nnzOut = Math.max(nnzOut, cnnz[i] * m2.recomputeNonZeros(ix[i], ix[i]));
+		return OptimizerUtils.getSparsity( 
+			m1.getNumRows(), m2.getNumColumns(), nnzOut);
+	}
+
+	@Override
+	public double estim(MatrixCharacteristics mc1, MatrixCharacteristics mc2) {
+		LOG.warn("Meta-data-only estimates not supported by EstimatorSample, falling back to EstimatorBasicAvg.");
+		return new EstimatorBasicAvg().estim(mc1, mc2);
+	}
+	
+	private int[] computeColumnNnz(MatrixBlock in, int[] ix) {
+		int[] nnz = new int[in.getNumColumns()];
+		//count column nnz brute force or selective
+		if( in.isInSparseFormat() ) {
+			SparseBlock sblock = in.getSparseBlock();
+			for( int i=0; i<in.getNumRows(); i++ ) {
+				if( sblock.isEmpty(i) ) continue;
+				LibMatrixAgg.countAgg(sblock.values(i), nnz,
+					sblock.indexes(i), sblock.pos(i), sblock.size(i));
+			}
+		}
+		else {
+			DenseBlock dblock = in.getDenseBlock();
+			for( int i=0; i<in.getNumRows(); i++ ) {
+				double[] avals = dblock.values(i);
+				int aix = dblock.pos(i);
+				for( int j=0; j<in.getNumColumns(); j++ )
+					nnz[j] += (avals[aix+j] != 0) ? 1 : 0;
+			}
+		}
+		
+		//copy nnz into reduced vector
+		int[] ret = new int[ix.length];
+		for(int i=0; i<ix.length; i++)
+			ret[i] = nnz[ix[i]];
+		return ret;
+	}
+}

http://git-wip-us.apache.org/repos/asf/systemml/blob/5a155f3d/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java b/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java
index d4c829c..73c93bf 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java
@@ -27,13 +27,13 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.commons.math3.analysis.UnivariateFunction;
 import org.apache.commons.math3.analysis.solvers.UnivariateSolverUtils;
 import org.apache.commons.math3.distribution.ChiSquaredDistribution;
-import org.apache.commons.math3.random.RandomDataGenerator;
 import org.apache.sysml.runtime.compress.BitmapEncoder;
 import org.apache.sysml.runtime.compress.ReaderColumnSelection;
 import org.apache.sysml.runtime.compress.CompressedMatrixBlock;
 import org.apache.sysml.runtime.compress.UncompressedBitmap;
 import org.apache.sysml.runtime.compress.utils.DblArray;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.util.UtilFunctions;
 
 public class CompressedSizeEstimatorSample extends CompressedSizeEstimator 
 {
@@ -303,12 +303,8 @@ public class CompressedSizeEstimatorSample extends CompressedSizeEstimator
 	 * @return sorted array of integers
 	 */
 	private static int[] getSortedUniformSample(int range, int smplSize) {
-		if (smplSize == 0)
-			return new int[] {};
-		RandomDataGenerator rng = new RandomDataGenerator();
-		int[] sample = rng.nextPermutation(range, smplSize);
-		Arrays.sort(sample);
-		return sample;
+		if (smplSize == 0) return new int[] {};
+		return UtilFunctions.getSortedSampleIndexes(range, smplSize);
 	}
 	
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/5a155f3d/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
index e32cb22..58dde27 100644
--- a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
+++ b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
@@ -29,6 +29,7 @@ import java.util.stream.Stream;
 import java.util.stream.StreamSupport;
 
 import org.apache.commons.lang.ArrayUtils;
+import org.apache.commons.math3.random.RandomDataGenerator;
 import org.apache.sysml.parser.Expression.ValueType;
 import org.apache.sysml.runtime.matrix.MetaDataNumItemsByEachReducer;
 import org.apache.sysml.runtime.matrix.data.FrameBlock;
@@ -515,17 +516,22 @@ public class UtilFunctions
 		return 0; //equal 
 	}
 	
-	public static boolean isIntegerNumber( String str )
-	{
+	public static boolean isIntegerNumber( String str ) {
 		byte[] c = str.getBytes();
 		for( int i=0; i<c.length; i++ )
 			if( c[i] < 48 || c[i] > 57 )
 				return false;
 		return true;
 	}
+	
+	public static int[] getSortedSampleIndexes(int range, int sampleSize) {
+		RandomDataGenerator rng = new RandomDataGenerator();
+		int[] sample = rng.nextPermutation(range, sampleSize);
+		Arrays.sort(sample);
+		return sample;
+	}
 
-	public static byte max( byte[] array )
-	{
+	public static byte max( byte[] array ) {
 		byte ret = Byte.MIN_VALUE;
 		for( int i=0; i<array.length; i++ )
 			ret = (array[i]>ret)?array[i]:ret;

http://git-wip-us.apache.org/repos/asf/systemml/blob/5a155f3d/src/test/java/org/apache/sysml/test/integration/functions/estim/OuterProductTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/estim/OuterProductTest.java b/src/test/java/org/apache/sysml/test/integration/functions/estim/OuterProductTest.java
index f45ca70..70facf7 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/estim/OuterProductTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/estim/OuterProductTest.java
@@ -25,6 +25,7 @@ import org.apache.sysml.hops.estim.EstimatorBasicWorst;
 import org.apache.sysml.hops.estim.EstimatorBitsetMM;
 import org.apache.sysml.hops.estim.EstimatorDensityMap;
 import org.apache.sysml.hops.estim.EstimatorMatrixHistogram;
+import org.apache.sysml.hops.estim.EstimatorSample;
 import org.apache.sysml.hops.estim.SparsityEstimator;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
@@ -118,6 +119,26 @@ public class OuterProductTest extends AutomatedTestBase
 		runSparsityEstimateTest(new EstimatorMatrixHistogram(true), m, k, n, case2);
 	}
 	
+	@Test
+	public void testSamplingDefCase1() {
+		runSparsityEstimateTest(new EstimatorSample(), m, k, n, case1);
+	}
+	
+	@Test
+	public void testSamplingDefCase2() {
+		runSparsityEstimateTest(new EstimatorSample(), m, k, n, case2);
+	}
+	
+	@Test
+	public void testSampling20Case1() {
+		runSparsityEstimateTest(new EstimatorSample(0.2), m, k, n, case1);
+	}
+	
+	@Test
+	public void testSampling20Case2() {
+		runSparsityEstimateTest(new EstimatorSample(0.2), m, k, n, case2);
+	}
+	
 	private void runSparsityEstimateTest(SparsityEstimator estim, int m, int k, int n, double[] sp) {
 		MatrixBlock m1 = MatrixBlock.randOperations(m, k, sp[0], 1, 1, "uniform", 3);
 		MatrixBlock m2 = MatrixBlock.randOperations(k, n, sp[1], 1, 1, "uniform", 3);

http://git-wip-us.apache.org/repos/asf/systemml/blob/5a155f3d/src/test/java/org/apache/sysml/test/integration/functions/estim/SquaredProductTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/estim/SquaredProductTest.java b/src/test/java/org/apache/sysml/test/integration/functions/estim/SquaredProductTest.java
index 204842c..917764b 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/estim/SquaredProductTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/estim/SquaredProductTest.java
@@ -25,6 +25,7 @@ import org.apache.sysml.hops.estim.EstimatorBasicWorst;
 import org.apache.sysml.hops.estim.EstimatorBitsetMM;
 import org.apache.sysml.hops.estim.EstimatorDensityMap;
 import org.apache.sysml.hops.estim.EstimatorMatrixHistogram;
+import org.apache.sysml.hops.estim.EstimatorSample;
 import org.apache.sysml.hops.estim.SparsityEstimator;
 import org.apache.sysml.runtime.instructions.InstructionUtils;
 import org.apache.sysml.runtime.matrix.data.MatrixBlock;
@@ -123,6 +124,26 @@ public class SquaredProductTest extends AutomatedTestBase
 		runSparsityEstimateTest(new EstimatorMatrixHistogram(true), m, k, n, case2);
 	}
 	
+	@Test
+	public void testSamplingDefCase1() {
+		runSparsityEstimateTest(new EstimatorSample(), m, k, n, case1);
+	}
+	
+	@Test
+	public void testSamplingDefCase2() {
+		runSparsityEstimateTest(new EstimatorSample(), m, k, n, case2);
+	}
+	
+	@Test
+	public void testSampling20Case1() {
+		runSparsityEstimateTest(new EstimatorSample(0.2), m, k, n, case1);
+	}
+	
+	@Test
+	public void testSampling20Case2() {
+		runSparsityEstimateTest(new EstimatorSample(0.2), m, k, n, case2);
+	}
+	
 	private void runSparsityEstimateTest(SparsityEstimator estim, int m, int k, int n, double[] sp) {
 		MatrixBlock m1 = MatrixBlock.randOperations(m, k, sp[0], 1, 1, "uniform", 3);
 		MatrixBlock m2 = MatrixBlock.randOperations(k, n, sp[1], 1, 1, "uniform", 3);