You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2018/05/18 03:37:53 UTC
systemml git commit: [SYSTEMML-2329] New sampling-based mm sparsity
estimator
Repository: systemml
Updated Branches:
refs/heads/master 79a5e80f6 -> 5a155f3d2
[SYSTEMML-2329] New sampling-based mm sparsity estimator
This patch introduces an additional baseline sparsity estimator based on
random sampling of columns from A and aligned rows from B for estimating
the output sparsity of AB.
Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/5a155f3d
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/5a155f3d
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/5a155f3d
Branch: refs/heads/master
Commit: 5a155f3d2402b1d77ca90a1d024cb04d6a2ca80d
Parents: 79a5e80
Author: Matthias Boehm <mb...@gmail.com>
Authored: Thu May 17 20:39:24 2018 -0700
Committer: Matthias Boehm <mb...@gmail.com>
Committed: Thu May 17 20:39:24 2018 -0700
----------------------------------------------------------------------
.../sysml/hops/estim/EstimatorSample.java | 111 +++++++++++++++++++
.../estim/CompressedSizeEstimatorSample.java | 10 +-
.../sysml/runtime/util/UtilFunctions.java | 14 ++-
.../functions/estim/OuterProductTest.java | 21 ++++
.../functions/estim/SquaredProductTest.java | 21 ++++
5 files changed, 166 insertions(+), 11 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/systemml/blob/5a155f3d/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java b/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java
new file mode 100644
index 0000000..6ed918a
--- /dev/null
+++ b/src/main/java/org/apache/sysml/hops/estim/EstimatorSample.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.hops.estim;
+
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.DMLRuntimeException;
+import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
+import org.apache.sysml.runtime.matrix.data.DenseBlock;
+import org.apache.sysml.runtime.matrix.data.LibMatrixAgg;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.SparseBlock;
+import org.apache.sysml.runtime.util.UtilFunctions;
+
+/**
+ * This estimator implements an approach based on row/column sampling
+ * Yongyang Yu, MingJie Tang, Walid G. Aref, Qutaibah M. Malluhi, Mostafa M. Abbas, Mourad Ouzzani:
+ * In-Memory Distributed Matrix Computation Processing and Optimization. ICDE 2017: 1047-1058
+ *
+ * The basic idea is to draw random samples of aligned columns SA and rows SB,
+ * and compute the output nnz as max(nnz(SA_i)*nnz(SB_i)). However, this estimator is
+ * biased toward underestimation as the maximum is unlikely sampled and collisions are
+ * not accounted for.
+ */
+public class EstimatorSample extends SparsityEstimator
+{
+ private static final double SAMPLE_FRACTION = 0.1; //10%
+
+ private final double _frac;
+
+ public EstimatorSample() {
+ this(SAMPLE_FRACTION);
+ }
+
+ public EstimatorSample(double sampleFrac) {
+ if( sampleFrac < 0 || sampleFrac > 1.0 )
+ throw new DMLRuntimeException("Invalid sample fraction: "+sampleFrac);
+ _frac = sampleFrac;
+ }
+
+ @Override
+ public double estim(MMNode root) {
+ LOG.warn("Recursive estimates not supported by EstimatorSample, falling back to EstimatorBasicAvg.");
+ return new EstimatorBasicAvg().estim(root);
+ }
+
+ @Override
+ public double estim(MatrixBlock m1, MatrixBlock m2) {
+ //get sampled indexes
+ int k = m1.getNumColumns();
+ int[] ix = UtilFunctions.getSortedSampleIndexes(
+ k, (int)Math.max(k*_frac, 1));
+ //compute output sparsity
+ int[] cnnz = computeColumnNnz(m1, ix);
+ long nnzOut = 0;
+ for(int i=0; i<ix.length; i++)
+ nnzOut = Math.max(nnzOut, cnnz[i] * m2.recomputeNonZeros(ix[i], ix[i]));
+ return OptimizerUtils.getSparsity(
+ m1.getNumRows(), m2.getNumColumns(), nnzOut);
+ }
+
+ @Override
+ public double estim(MatrixCharacteristics mc1, MatrixCharacteristics mc2) {
+ LOG.warn("Meta-data-only estimates not supported by EstimatorSample, falling back to EstimatorBasicAvg.");
+ return new EstimatorBasicAvg().estim(mc1, mc2);
+ }
+
+ private int[] computeColumnNnz(MatrixBlock in, int[] ix) {
+ int[] nnz = new int[in.getNumColumns()];
+ //count column nnz brute force or selective
+ if( in.isInSparseFormat() ) {
+ SparseBlock sblock = in.getSparseBlock();
+ for( int i=0; i<in.getNumRows(); i++ ) {
+ if( sblock.isEmpty(i) ) continue;
+ LibMatrixAgg.countAgg(sblock.values(i), nnz,
+ sblock.indexes(i), sblock.pos(i), sblock.size(i));
+ }
+ }
+ else {
+ DenseBlock dblock = in.getDenseBlock();
+ for( int i=0; i<in.getNumRows(); i++ ) {
+ double[] avals = dblock.values(i);
+ int aix = dblock.pos(i);
+ for( int j=0; j<in.getNumColumns(); j++ )
+ nnz[j] += (avals[aix+j] != 0) ? 1 : 0;
+ }
+ }
+
+ //copy nnz into reduced vector
+ int[] ret = new int[ix.length];
+ for(int i=0; i<ix.length; i++)
+ ret[i] = nnz[ix[i]];
+ return ret;
+ }
+}
http://git-wip-us.apache.org/repos/asf/systemml/blob/5a155f3d/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java b/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java
index d4c829c..73c93bf 100644
--- a/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java
+++ b/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java
@@ -27,13 +27,13 @@ import org.apache.commons.logging.LogFactory;
import org.apache.commons.math3.analysis.UnivariateFunction;
import org.apache.commons.math3.analysis.solvers.UnivariateSolverUtils;
import org.apache.commons.math3.distribution.ChiSquaredDistribution;
-import org.apache.commons.math3.random.RandomDataGenerator;
import org.apache.sysml.runtime.compress.BitmapEncoder;
import org.apache.sysml.runtime.compress.ReaderColumnSelection;
import org.apache.sysml.runtime.compress.CompressedMatrixBlock;
import org.apache.sysml.runtime.compress.UncompressedBitmap;
import org.apache.sysml.runtime.compress.utils.DblArray;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.util.UtilFunctions;
public class CompressedSizeEstimatorSample extends CompressedSizeEstimator
{
@@ -303,12 +303,8 @@ public class CompressedSizeEstimatorSample extends CompressedSizeEstimator
* @return sorted array of integers
*/
private static int[] getSortedUniformSample(int range, int smplSize) {
- if (smplSize == 0)
- return new int[] {};
- RandomDataGenerator rng = new RandomDataGenerator();
- int[] sample = rng.nextPermutation(range, smplSize);
- Arrays.sort(sample);
- return sample;
+ if (smplSize == 0) return new int[] {};
+ return UtilFunctions.getSortedSampleIndexes(range, smplSize);
}
http://git-wip-us.apache.org/repos/asf/systemml/blob/5a155f3d/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
index e32cb22..58dde27 100644
--- a/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
+++ b/src/main/java/org/apache/sysml/runtime/util/UtilFunctions.java
@@ -29,6 +29,7 @@ import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.apache.commons.lang.ArrayUtils;
+import org.apache.commons.math3.random.RandomDataGenerator;
import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.matrix.MetaDataNumItemsByEachReducer;
import org.apache.sysml.runtime.matrix.data.FrameBlock;
@@ -515,17 +516,22 @@ public class UtilFunctions
return 0; //equal
}
- public static boolean isIntegerNumber( String str )
- {
+ public static boolean isIntegerNumber( String str ) {
byte[] c = str.getBytes();
for( int i=0; i<c.length; i++ )
if( c[i] < 48 || c[i] > 57 )
return false;
return true;
}
+
+ public static int[] getSortedSampleIndexes(int range, int sampleSize) {
+ RandomDataGenerator rng = new RandomDataGenerator();
+ int[] sample = rng.nextPermutation(range, sampleSize);
+ Arrays.sort(sample);
+ return sample;
+ }
- public static byte max( byte[] array )
- {
+ public static byte max( byte[] array ) {
byte ret = Byte.MIN_VALUE;
for( int i=0; i<array.length; i++ )
ret = (array[i]>ret)?array[i]:ret;
http://git-wip-us.apache.org/repos/asf/systemml/blob/5a155f3d/src/test/java/org/apache/sysml/test/integration/functions/estim/OuterProductTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/estim/OuterProductTest.java b/src/test/java/org/apache/sysml/test/integration/functions/estim/OuterProductTest.java
index f45ca70..70facf7 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/estim/OuterProductTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/estim/OuterProductTest.java
@@ -25,6 +25,7 @@ import org.apache.sysml.hops.estim.EstimatorBasicWorst;
import org.apache.sysml.hops.estim.EstimatorBitsetMM;
import org.apache.sysml.hops.estim.EstimatorDensityMap;
import org.apache.sysml.hops.estim.EstimatorMatrixHistogram;
+import org.apache.sysml.hops.estim.EstimatorSample;
import org.apache.sysml.hops.estim.SparsityEstimator;
import org.apache.sysml.runtime.instructions.InstructionUtils;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
@@ -118,6 +119,26 @@ public class OuterProductTest extends AutomatedTestBase
runSparsityEstimateTest(new EstimatorMatrixHistogram(true), m, k, n, case2);
}
+ @Test
+ public void testSamplingDefCase1() {
+ runSparsityEstimateTest(new EstimatorSample(), m, k, n, case1);
+ }
+
+ @Test
+ public void testSamplingDefCase2() {
+ runSparsityEstimateTest(new EstimatorSample(), m, k, n, case2);
+ }
+
+ @Test
+ public void testSampling20Case1() {
+ runSparsityEstimateTest(new EstimatorSample(0.2), m, k, n, case1);
+ }
+
+ @Test
+ public void testSampling20Case2() {
+ runSparsityEstimateTest(new EstimatorSample(0.2), m, k, n, case2);
+ }
+
private void runSparsityEstimateTest(SparsityEstimator estim, int m, int k, int n, double[] sp) {
MatrixBlock m1 = MatrixBlock.randOperations(m, k, sp[0], 1, 1, "uniform", 3);
MatrixBlock m2 = MatrixBlock.randOperations(k, n, sp[1], 1, 1, "uniform", 3);
http://git-wip-us.apache.org/repos/asf/systemml/blob/5a155f3d/src/test/java/org/apache/sysml/test/integration/functions/estim/SquaredProductTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/sysml/test/integration/functions/estim/SquaredProductTest.java b/src/test/java/org/apache/sysml/test/integration/functions/estim/SquaredProductTest.java
index 204842c..917764b 100644
--- a/src/test/java/org/apache/sysml/test/integration/functions/estim/SquaredProductTest.java
+++ b/src/test/java/org/apache/sysml/test/integration/functions/estim/SquaredProductTest.java
@@ -25,6 +25,7 @@ import org.apache.sysml.hops.estim.EstimatorBasicWorst;
import org.apache.sysml.hops.estim.EstimatorBitsetMM;
import org.apache.sysml.hops.estim.EstimatorDensityMap;
import org.apache.sysml.hops.estim.EstimatorMatrixHistogram;
+import org.apache.sysml.hops.estim.EstimatorSample;
import org.apache.sysml.hops.estim.SparsityEstimator;
import org.apache.sysml.runtime.instructions.InstructionUtils;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
@@ -123,6 +124,26 @@ public class SquaredProductTest extends AutomatedTestBase
runSparsityEstimateTest(new EstimatorMatrixHistogram(true), m, k, n, case2);
}
+ @Test
+ public void testSamplingDefCase1() {
+ runSparsityEstimateTest(new EstimatorSample(), m, k, n, case1);
+ }
+
+ @Test
+ public void testSamplingDefCase2() {
+ runSparsityEstimateTest(new EstimatorSample(), m, k, n, case2);
+ }
+
+ @Test
+ public void testSampling20Case1() {
+ runSparsityEstimateTest(new EstimatorSample(0.2), m, k, n, case1);
+ }
+
+ @Test
+ public void testSampling20Case2() {
+ runSparsityEstimateTest(new EstimatorSample(0.2), m, k, n, case2);
+ }
+
private void runSparsityEstimateTest(SparsityEstimator estim, int m, int k, int n, double[] sp) {
MatrixBlock m1 = MatrixBlock.randOperations(m, k, sp[0], 1, 1, "uniform", 3);
MatrixBlock m2 = MatrixBlock.randOperations(k, n, sp[1], 1, 1, "uniform", 3);