You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemml.apache.org by mb...@apache.org on 2016/07/17 00:23:29 UTC
[4/6] incubator-systemml git commit: [SYSTEMML-810] New compressed
matrix blocks and operations, tests
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/PlanningCoCoder.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/PlanningCoCoder.java b/src/main/java/org/apache/sysml/runtime/compress/PlanningCoCoder.java
new file mode 100644
index 0000000..07d9757
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/PlanningCoCoder.java
@@ -0,0 +1,227 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.PriorityQueue;
+import java.util.TreeMap;
+
+import org.apache.sysml.runtime.compress.estim.CompressedSizeEstimator;
+
+public class PlanningCoCoder
+{
+ //constants for weight computation
+ private final static float GROUPABILITY_THRESHOLD = 0.00064f;
+ private final static boolean USE_BIN_WEIGHT = false;
+ private final static float PARTITION_WEIGHT = 0.05F; //higher values lead to more grouping
+ private final static float PARTITION_SIZE = PARTITION_WEIGHT * GROUPABILITY_THRESHOLD;
+ private final static float BIN_WEIGHT_PARAM = -0.65f; //lower values lead to more grouping
+
+ /**
+ *
+ * @param sizeEstimator
+ * @param availCols
+ * @param colsCardinalities
+ * @param compressedSize
+ * @param numRows
+ * @param sparsity
+ * @return
+ */
+ public static List<int[]> findCocodesByPartitioning(CompressedSizeEstimator sizeEstimator, List<Integer> availCols,
+ List<Integer> colsCardinalities,List<Long> compressedSize, int numRows, double sparsity)
+ {
+ float numRowsWeight = numRows;
+ List<int[]> retGroups = new ArrayList<int[]>();
+ // filtering out non-groupable columns as singleton groups
+ int numCols = availCols.size();
+ List<Integer> groupabaleCols = new ArrayList<Integer>();
+ // weighted of each column is the ratio of its cardinality to the number
+ // of rows scaled by the matrix sparsity
+ List<Float> groupabaleColWeights = new ArrayList<Float>();
+ HashMap<Integer, GroupableColInfo> groupableColsInfo = new HashMap<Integer, GroupableColInfo>();
+ for (int i = 0; i < numCols; i++) {
+ int colIx = availCols.get(i);
+ int cardinality = colsCardinalities.get(i);
+ float weight = ((float) cardinality) / numRowsWeight;
+ if (weight <= GROUPABILITY_THRESHOLD) {
+ groupabaleCols.add(colIx);
+ groupabaleColWeights.add(weight);
+ groupableColsInfo.put(colIx, new GroupableColInfo(weight,
+ compressedSize.get(i)));
+ } else {
+ retGroups.add(new int[] { colIx });
+ }
+ }
+ // bin packing based on PARTITION_WEIGHT and column weights
+ float weight = computeWeightForCoCoding(numRows, sparsity);
+ TreeMap<Float, List<List<Integer>>> bins = new PlanningBinPacker(
+ weight, groupabaleCols, groupabaleColWeights)
+ .packFirstFit();
+
+ // brute force grouping within each partition
+ for (List<List<Integer>> binList : bins.values()) {
+ for (List<Integer> bin : binList) {
+ // building an array of singleton CoCodingGroup
+ PlanningCoCodingGroup[] singltonGroups = new PlanningCoCodingGroup[bin.size()];
+ int i = 0;
+ GroupableColInfo colInfo;
+ for (Integer col : bin) {
+ colInfo = groupableColsInfo.get(col);
+ singltonGroups[i++] = new PlanningCoCodingGroup(col, colInfo.size,
+ colInfo.cardRatio);
+ }
+ PlanningCoCodingGroup[] outputGroups = findCocodesBruteForce(
+ sizeEstimator, numRowsWeight, singltonGroups);
+
+ for (PlanningCoCodingGroup grp : outputGroups) {
+ retGroups.add(grp.getColIndices());
+ }
+ }
+ }
+ return retGroups;
+ }
+
+ /**
+ * Identify columns to code together. Uses a greedy approach that merges
+ * pairs of column groups into larger groups. Each phase of the greedy
+ * algorithm considers all combinations of pairs to merge.
+ *
+ */
+ private static PlanningCoCodingGroup[] findCocodesBruteForce(
+ CompressedSizeEstimator sizeEstimator, float numRowsWeight,
+ PlanningCoCodingGroup[] singltonGroups)
+ {
+ // Populate a priority queue with all available 2-column cocodings.
+ PriorityQueue<PlanningGroupMergeAction> q = new PriorityQueue<PlanningGroupMergeAction>();
+ for (int leftIx = 0; leftIx < singltonGroups.length; leftIx++) {
+ PlanningCoCodingGroup leftGrp = singltonGroups[leftIx];
+ for (int rightIx = leftIx + 1; rightIx < singltonGroups.length; rightIx++) {
+ PlanningCoCodingGroup rightGrp = singltonGroups[rightIx];
+ // at least one of the two groups should be low-cardinality
+ float cardRatio = leftGrp.getCardinalityRatio() + rightGrp.getCardinalityRatio();
+ if ( cardRatio < GROUPABILITY_THRESHOLD) {
+ PlanningGroupMergeAction potentialMerge = new PlanningGroupMergeAction(
+ sizeEstimator, numRowsWeight, leftGrp, rightGrp);
+ if (potentialMerge.getChangeInSize() < 0) {
+ q.add(potentialMerge);
+ }
+ }
+ }
+ }
+ PlanningCoCodingGroup[] colGroups = singltonGroups;
+
+ // Greedily merge groups until we can no longer reduce the number of
+ // runs by merging groups
+ while (q.size() > 0) {
+ PlanningGroupMergeAction merge = q.poll();
+
+ // The queue can contain merge actions involving column groups that
+ // have already been merged.
+ // Filter those actions out.
+ int leftIx = findInArray(colGroups, merge.getLeftGrp());
+ int rightIx = findInArray(colGroups, merge.getRightGrp());
+ if (leftIx < 0 || rightIx < 0) {
+ // One or more of the groups to be merged has already been made
+ // part of another group.
+ // Drop the merge action.
+ } else {
+ PlanningCoCodingGroup mergedGrp = merge.getMergedGrp();
+
+ PlanningCoCodingGroup[] newColGroups = new PlanningCoCodingGroup[colGroups.length - 1];
+ int targetIx = 0;
+ for (int i = 0; i < colGroups.length; i++) {
+ if (i != leftIx && i != rightIx) {
+ newColGroups[targetIx] = colGroups[i];
+ targetIx++;
+ }
+ }
+
+ // New group goes at the end to (hopefully) speed up future
+ // linear search operations
+ newColGroups[newColGroups.length - 1] = mergedGrp;
+
+ // Consider merging the new group with all the other
+ // pre-existing groups.
+ for (int i = 0; i < newColGroups.length - 1; i++) {
+ PlanningCoCodingGroup newLeftGrp = newColGroups[i];
+ PlanningCoCodingGroup newRightGrp = mergedGrp;
+ if (newLeftGrp.getCardinalityRatio()
+ + newRightGrp.getCardinalityRatio() < GROUPABILITY_THRESHOLD) {
+ PlanningGroupMergeAction newPotentialMerge = new PlanningGroupMergeAction(
+ sizeEstimator, numRowsWeight, newLeftGrp,
+ newRightGrp);
+ if (newPotentialMerge.getChangeInSize() < 0) {
+ q.add(newPotentialMerge);
+ }
+ }
+ }
+ colGroups = newColGroups;
+ }
+ }
+ return colGroups;
+ }
+
+ /**
+ *
+ * @param numRows
+ * @param sparsity
+ * @return
+ */
+ private static float computeWeightForCoCoding(int numRows, double sparsity)
+ {
+ if( USE_BIN_WEIGHT ) { //new method (non-conclusive)
+ //return (float) Math.pow(numRows*sparsity,BIN_WEIGHT_PARAM);
+ return (float) Math.pow(numRows,BIN_WEIGHT_PARAM);
+ }
+ else {
+ return PARTITION_SIZE;
+ }
+ }
+
+ /**
+ *
+ * @param arr
+ * @param val
+ * @return
+ */
+ private static int findInArray(Object[] arr, Object val) {
+ for (int i = 0; i < arr.length; i++) {
+ if (arr[i].equals(val)) {
+ return i;
+ }
+ }
+ return -1;
+ }
+
+ /**
+ *
+ */
+ private static class GroupableColInfo {
+ float cardRatio;
+ long size;
+
+ public GroupableColInfo(float lcardRatio, long lsize) {
+ cardRatio = lcardRatio;
+ size = lsize;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/PlanningCoCodingGroup.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/PlanningCoCodingGroup.java b/src/main/java/org/apache/sysml/runtime/compress/PlanningCoCodingGroup.java
new file mode 100644
index 0000000..221e4ca
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/PlanningCoCodingGroup.java
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress;
+
+import java.util.Arrays;
+
+import org.apache.sysml.runtime.compress.estim.CompressedSizeEstimator;
+import org.apache.sysml.runtime.compress.estim.CompressedSizeInfo;
+
+/**
+ * Class to represent information about co-coding a group of columns.
+ *
+ */
+public class PlanningCoCodingGroup
+{
+ private int[] _colIndexes;
+ private long _estSize;
+ private float _cardRatio;
+
+ /**
+ * Constructor for a one-column group; i.e. do not co-code a given column.
+ *
+ */
+ public PlanningCoCodingGroup(int col, long estSize, float cardRatio) {
+ _colIndexes = new int[]{col};
+ _estSize = estSize;
+ _cardRatio = cardRatio;
+ }
+
+ /**
+ * Constructor for merging two disjoint groups of columns
+ *
+ * @param grp1 first group of columns to merge
+ * @param grp2 second group to merge
+ * @param numRowsWeight numRows x sparsity
+ */
+ public PlanningCoCodingGroup(PlanningCoCodingGroup grp1, PlanningCoCodingGroup grp2,
+ CompressedSizeEstimator bitmapSizeEstimator, float numRowsWeight)
+ {
+ // merge sorted non-empty arrays
+ _colIndexes = new int[grp1._colIndexes.length + grp2._colIndexes.length];
+ int grp1Ptr = 0, grp2Ptr = 0;
+ for (int mergedIx = 0; mergedIx < _colIndexes.length; mergedIx++) {
+ if (grp1._colIndexes[grp1Ptr] < grp2._colIndexes[grp2Ptr]) {
+ _colIndexes[mergedIx] = grp1._colIndexes[grp1Ptr++];
+ if (grp1Ptr == grp1._colIndexes.length) {
+ System.arraycopy(grp2._colIndexes, grp2Ptr, _colIndexes,
+ mergedIx + 1, grp2._colIndexes.length - grp2Ptr);
+ break;
+ }
+ } else {
+ _colIndexes[mergedIx] = grp2._colIndexes[grp2Ptr++];
+ if (grp2Ptr == grp2._colIndexes.length) {
+ System.arraycopy(grp1._colIndexes, grp1Ptr, _colIndexes,
+ mergedIx + 1, grp1._colIndexes.length - grp1Ptr);
+ break;
+ }
+ }
+ }
+
+ // estimating size info
+ CompressedSizeInfo groupSizeInfo = bitmapSizeEstimator
+ .estimateCompressedColGroupSize(_colIndexes);
+ _estSize = groupSizeInfo.getMinSize();
+ _cardRatio = groupSizeInfo.getEstCarinality() / numRowsWeight;
+ }
+
+ public int[] getColIndices() {
+ return _colIndexes;
+ }
+
+ /**
+ * @return estimated compressed size of the grouped columns
+ */
+ public long getEstSize() {
+ return _estSize;
+ }
+
+ public float getCardinalityRatio() {
+ return _cardRatio;
+ }
+
+ @Override
+ public String toString() {
+ return Arrays.toString(_colIndexes);
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/PlanningGroupMergeAction.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/PlanningGroupMergeAction.java b/src/main/java/org/apache/sysml/runtime/compress/PlanningGroupMergeAction.java
new file mode 100644
index 0000000..5e3c6c5
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/PlanningGroupMergeAction.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress;
+
+import org.apache.sysml.runtime.compress.estim.CompressedSizeEstimator;
+
+/**
+ * Internal data structure for tracking potential merges of column groups in
+ * co-coding calculations.
+ *
+ */
+class PlanningGroupMergeAction implements Comparable<PlanningGroupMergeAction>
+{
+ private PlanningCoCodingGroup _leftGrp; //left input
+ private PlanningCoCodingGroup _rightGrp; //right input
+ private PlanningCoCodingGroup _mergedGrp; //output
+ private long _changeInSize;
+
+
+ public PlanningGroupMergeAction(CompressedSizeEstimator sizeEstimator,
+ float numRowsWeight, PlanningCoCodingGroup leftGrp, PlanningCoCodingGroup rightGrp) {
+ _leftGrp = leftGrp;
+ _rightGrp = rightGrp;
+ _mergedGrp = new PlanningCoCodingGroup(leftGrp, rightGrp, sizeEstimator, numRowsWeight);
+
+ // Negative size change ==> Decrease in size
+ _changeInSize = _mergedGrp.getEstSize()
+ - leftGrp.getEstSize() - rightGrp.getEstSize();
+ }
+
+ public int compareTo(PlanningGroupMergeAction o) {
+ // We only sort by the change in size
+ return (int) Math.signum(_changeInSize - o._changeInSize);
+ }
+
+ @Override
+ public String toString() {
+ return String.format("Merge %s and %s", _leftGrp, _rightGrp);
+ }
+
+ public PlanningCoCodingGroup getLeftGrp() {
+ return _leftGrp;
+ }
+
+ public PlanningCoCodingGroup getRightGrp() {
+ return _rightGrp;
+ }
+
+ public PlanningCoCodingGroup getMergedGrp() {
+ return _mergedGrp;
+ }
+
+ public long getChangeInSize() {
+ return _changeInSize;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/ReaderColumnSelection.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ReaderColumnSelection.java b/src/main/java/org/apache/sysml/runtime/compress/ReaderColumnSelection.java
new file mode 100644
index 0000000..a37018f
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/ReaderColumnSelection.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress;
+
+import org.apache.sysml.runtime.compress.utils.DblArray;
+
+/**
+ * Base class for all column selection readers.
+ *
+ */
+public abstract class ReaderColumnSelection
+{
+ protected int[] _colIndexes = null;
+ protected int _numRows = -1;
+ protected int _lastRow = -1;
+ protected boolean _skipZeros = false;
+
+ protected ReaderColumnSelection(int[] colIndexes, int numRows, boolean skipZeros) {
+ _colIndexes = colIndexes;
+ _numRows = numRows;
+ _lastRow = -1;
+ _skipZeros = skipZeros;
+ }
+
+ /**
+ * Gets the next row, null when no more rows.
+ *
+ * @return
+ */
+ public abstract DblArray nextRow();
+
+ /**
+ *
+ * @return
+ */
+ public int getCurrentRowIndex() {
+ return _lastRow;
+ }
+
+
+ /**
+ * Resets the reader to the first row.
+ */
+ public void reset() {
+ _lastRow = -1;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/ReaderColumnSelectionDense.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ReaderColumnSelectionDense.java b/src/main/java/org/apache/sysml/runtime/compress/ReaderColumnSelectionDense.java
new file mode 100644
index 0000000..d22f39d
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/ReaderColumnSelectionDense.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress;
+
+import org.apache.sysml.runtime.compress.utils.DblArray;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+
+public class ReaderColumnSelectionDense extends ReaderColumnSelection
+{
+ protected MatrixBlock _data;
+
+ // reusable return
+ private DblArray nonZeroReturn;
+ private DblArray reusableReturn;
+ private double[] reusableArr;
+
+ public ReaderColumnSelectionDense(MatrixBlock data, int[] colIndices, boolean skipZeros) {
+ super(colIndices, CompressedMatrixBlock.TRANSPOSE_INPUT ?
+ data.getNumColumns() : data.getNumRows(), skipZeros);
+ _data = data;
+ reusableArr = new double[colIndices.length];
+ reusableReturn = new DblArray(reusableArr);
+ }
+
+ @Override
+ public DblArray nextRow() {
+ if( _skipZeros) {
+ while ((nonZeroReturn = getNextRow()) != null
+ && DblArray.isZero(nonZeroReturn));
+ return nonZeroReturn;
+ } else {
+ return getNextRow();
+ }
+ }
+
+ /**
+ *
+ * @return
+ */
+ private DblArray getNextRow() {
+ if(_lastRow == _numRows-1)
+ return null;
+ _lastRow++;
+ for (int i = 0; i < _colIndexes.length; i++) {
+ reusableArr[i] = CompressedMatrixBlock.TRANSPOSE_INPUT ?
+ _data.quickGetValue( _colIndexes[i], _lastRow ) :
+ _data.quickGetValue( _lastRow, _colIndexes[i] );
+ }
+ return reusableReturn;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/ReaderColumnSelectionDenseSample.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ReaderColumnSelectionDenseSample.java b/src/main/java/org/apache/sysml/runtime/compress/ReaderColumnSelectionDenseSample.java
new file mode 100644
index 0000000..06518e4
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/ReaderColumnSelectionDenseSample.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress;
+
+import org.apache.sysml.runtime.compress.utils.DblArray;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+
+/**
+ *
+ * considers only a subset of row indexes
+ */
+public class ReaderColumnSelectionDenseSample extends ReaderColumnSelection
+{
+ protected MatrixBlock _data;
+
+ private int[] _sampleIndexes;
+ private int lastIndex = -1;
+
+ // reusable return
+ private DblArray nonZeroReturn;
+ private DblArray reusableReturn;
+ private double[] reusableArr;
+
+ public ReaderColumnSelectionDenseSample(MatrixBlock data, int[] colIndexes, int[] sampleIndexes, boolean skipZeros)
+ {
+ super(colIndexes, -1, skipZeros);
+ _data = data;
+ _sampleIndexes = sampleIndexes;
+ reusableArr = new double[colIndexes.length];
+ reusableReturn = new DblArray(reusableArr);
+ }
+
+ @Override
+ public DblArray nextRow() {
+ if (_skipZeros) {
+ while ((nonZeroReturn = getNextRow()) != null
+ && DblArray.isZero(nonZeroReturn));
+ return nonZeroReturn;
+ } else {
+ return getNextRow();
+ }
+ }
+
+ /**
+ *
+ * @return
+ */
+ private DblArray getNextRow() {
+ if (lastIndex == _sampleIndexes.length - 1)
+ return null;
+ lastIndex++;
+ for (int i = 0; i < _colIndexes.length; i++) {
+ reusableArr[i] = CompressedMatrixBlock.TRANSPOSE_INPUT ?
+ _data.quickGetValue(_colIndexes[i], _sampleIndexes[lastIndex]) :
+ _data.quickGetValue(_sampleIndexes[lastIndex], _colIndexes[i]);
+ }
+ return reusableReturn;
+ }
+
+ @Override
+ public int getCurrentRowIndex() {
+ return _sampleIndexes[lastIndex];
+ }
+
+ @Override
+ public void reset() {
+ lastIndex = -1;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/ReaderColumnSelectionSparse.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/ReaderColumnSelectionSparse.java b/src/main/java/org/apache/sysml/runtime/compress/ReaderColumnSelectionSparse.java
new file mode 100644
index 0000000..d2ef5a4
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/ReaderColumnSelectionSparse.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress;
+
+import java.util.Arrays;
+
+import org.apache.sysml.runtime.compress.utils.DblArray;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.matrix.data.SparseRow;
+
+/**
+ * Used to extract the values at certain indexes from each row in a sparse
+ * matrix
+ *
+ * Keeps returning all-zeros arrays until reaching the last possible index. The
+ * current compression algorithm treats the zero-value in a sparse matrix like
+ * any other value.
+ */
+public class ReaderColumnSelectionSparse extends ReaderColumnSelection
+{
+ private final DblArray ZERO_DBL_ARRAY;
+ private DblArray nonZeroReturn;
+
+ // reusable return
+ private DblArray reusableReturn;
+ private double[] reusableArr;
+
+ // current sparse row positions
+ private SparseRow[] sparseCols = null;
+ private int[] sparsePos = null;
+
+ public ReaderColumnSelectionSparse(MatrixBlock data, int[] colIndexes, boolean skipZeros)
+ {
+ super(colIndexes, CompressedMatrixBlock.TRANSPOSE_INPUT ?
+ data.getNumColumns() : data.getNumRows(), skipZeros);
+ ZERO_DBL_ARRAY = new DblArray(new double[colIndexes.length], true);
+ reusableArr = new double[colIndexes.length];
+ reusableReturn = new DblArray(reusableArr);
+
+ if( !CompressedMatrixBlock.TRANSPOSE_INPUT ){
+ throw new RuntimeException("SparseColumnSelectionReader should not be used without transposed input.");
+ }
+
+ sparseCols = new SparseRow[colIndexes.length];
+ sparsePos = new int[colIndexes.length];
+ if( data.getSparseBlock()!=null )
+ for( int i=0; i<colIndexes.length; i++ )
+ sparseCols[i] = data.getSparseBlock().get(colIndexes[i]);
+ Arrays.fill(sparsePos, 0);
+ }
+
+ @Override
+ public DblArray nextRow() {
+ if(_skipZeros) {
+ while ((nonZeroReturn = getNextRow()) != null
+ && nonZeroReturn == ZERO_DBL_ARRAY);
+ return nonZeroReturn;
+ } else {
+ return getNextRow();
+ }
+ }
+
+ /**
+ *
+ * @return
+ */
+ private DblArray getNextRow()
+ {
+ if(_lastRow == _numRows-1)
+ return null;
+ _lastRow++;
+
+ if( !CompressedMatrixBlock.TRANSPOSE_INPUT ){
+ throw new RuntimeException("SparseColumnSelectionReader should not be used without transposed input.");
+ }
+
+ //move pos to current row if necessary (for all columns)
+ for( int i=0; i<_colIndexes.length; i++ )
+ if( sparseCols[i] != null && (sparseCols[i].indexes().length<=sparsePos[i]
+ || sparseCols[i].indexes()[sparsePos[i]]<_lastRow) )
+ {
+ sparsePos[i]++;
+ }
+
+ //extract current values
+ Arrays.fill(reusableArr, 0);
+ boolean zeroResult = true;
+ for( int i=0; i<_colIndexes.length; i++ )
+ if( sparseCols[i] != null && sparseCols[i].indexes().length>sparsePos[i]
+ &&sparseCols[i].indexes()[sparsePos[i]]==_lastRow )
+ {
+ reusableArr[i] = sparseCols[i].values()[sparsePos[i]];
+ zeroResult = false;
+ }
+
+ return zeroResult ? ZERO_DBL_ARRAY : reusableReturn;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/UncompressedBitmap.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/UncompressedBitmap.java b/src/main/java/org/apache/sysml/runtime/compress/UncompressedBitmap.java
new file mode 100644
index 0000000..971f438
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/UncompressedBitmap.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress;
+
+import java.util.Arrays;
+
+import org.apache.sysml.runtime.compress.utils.DblArrayIntListHashMap;
+import org.apache.sysml.runtime.compress.utils.DoubleIntListHashMap;
+import org.apache.sysml.runtime.compress.utils.DblArrayIntListHashMap.DArrayIListEntry;
+import org.apache.sysml.runtime.compress.utils.DoubleIntListHashMap.DIListEntry;
+
+/**
+ * Uncompressed representation of one or more columns in bitmap format.
+ *
+ */
+public final class UncompressedBitmap
+{
+ private int _numCols;
+
+ /** Distinct values that appear in the column. Linearized as value groups <v11 v12> <v21 v22>.*/
+ private double[] _values;
+
+ /** Bitmaps (as lists of offsets) for each of the values. */
+ private int[][] _offsetsLists;
+
+ public UncompressedBitmap( DblArrayIntListHashMap distinctVals, int numColumns )
+ {
+ // added for one pass bitmap construction
+ // Convert inputs to arrays
+ int numVals = distinctVals.size();
+ _values = new double[numVals*numColumns];
+ _offsetsLists = new int[numVals][];
+ int bitmapIx = 0;
+ for( DArrayIListEntry val : distinctVals.extractValues()) {
+ System.arraycopy(val.key.getData(), 0, _values, bitmapIx*numColumns, numColumns);
+ _offsetsLists[bitmapIx++] = val.value.extractValues();
+ }
+ _numCols = numColumns;
+ }
+
+ public UncompressedBitmap( DoubleIntListHashMap distinctVals )
+ {
+ // added for one pass bitmap construction
+ // Convert inputs to arrays
+ int numVals = distinctVals.size();
+ _values = new double[numVals];
+ _offsetsLists = new int[numVals][];
+ int bitmapIx = 0;
+ for(DIListEntry val : distinctVals.extractValues()) {
+ _values[bitmapIx] = val.key;
+ _offsetsLists[bitmapIx++] = val.value.extractValues();
+ }
+ _numCols = 1;
+ }
+
+ public int getNumColumns() {
+ return _numCols;
+ }
+
+ /**
+ * @param ix index of a particular distinct value
+ * @return the tuple of column values associated with the specified index
+ */
+ public double[] getValues(int ix) {
+ return Arrays.copyOfRange(_values, ix*_numCols, (ix+1)*_numCols);
+ }
+
+ /**
+ * @return number of distinct values in the column; this number is also the
+ * number of bitmaps, since there is one bitmap per value
+ */
+ public int getNumValues() {
+ return _values.length / _numCols;
+ }
+
+ /**
+ * @param ix index of a particular distinct value
+ * @return IMMUTABLE array of the offsets of the rows containing the value
+ * with the indicated index
+ */
+ public int[] getOffsetsList(int ix) {
+ return _offsetsLists[ix];
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimator.java b/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimator.java
new file mode 100644
index 0000000..1a1ae55
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimator.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress.estim;
+
+import org.apache.sysml.runtime.compress.BitmapEncoder;
+import org.apache.sysml.runtime.compress.UncompressedBitmap;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+
+/**
+ * Base class for all compressed size estimators
+ */
+public abstract class CompressedSizeEstimator
+{
+ protected MatrixBlock _data;
+
+ public CompressedSizeEstimator(MatrixBlock data) {
+ _data = data;
+ }
+
+ /**
+ *
+ * @param colIndexes
+ * @return
+ */
+ public abstract CompressedSizeInfo estimateCompressedColGroupSize(int[] colIndexes);
+
+ /**
+ *
+ * @param ubm
+ * @return
+ */
+ public abstract CompressedSizeInfo estimateCompressedColGroupSize(UncompressedBitmap ubm);
+
+ /**
+ *
+ * @param ubm
+ * @param inclRLE
+ * @return
+ */
+ protected SizeEstimationFactors computeSizeEstimationFactors(UncompressedBitmap ubm, boolean inclRLE) {
+ int numVals = ubm.getNumValues();
+ int numRuns = 0;
+ int numOffs = 0;
+ int numSegs = 0;
+ int numSingle = 0;
+
+ //compute size estimation factors
+ for (int i = 0; i < numVals; i++) {
+ int[] list = ubm.getOffsetsList(i);
+ numOffs += list.length;
+ numSegs += list[list.length - 1] / BitmapEncoder.BITMAP_BLOCK_SZ + 1;
+ numSingle += (list.length==1) ? 1 : 0;
+ if( inclRLE ) {
+ int lastOff = -2;
+ for (int j = 0; j < list.length; j++) {
+ if (list[j] != lastOff + 1)
+ numRuns++;
+ lastOff = list[j];
+ }
+ }
+ }
+
+ //construct estimation factors
+ return new SizeEstimationFactors(numVals, numSegs, numOffs, numRuns, numSingle);
+ }
+
+ /**
+ * Estimates the number of bytes needed to encode this column group
+ * in RLE encoding format.
+ *
+ * @param numVals
+ * @param numRuns
+ * @param numCols
+ * @return
+ */
+ protected static long getRLESize(int numVals, int numRuns, int numCols) {
+ int ret = 0;
+ //distinct value tuples [double per col]
+ ret += 8 * numVals * numCols;
+ //offset/len fields per distinct value tuple [2xint]
+ ret += 8 * numVals;
+ //run data [2xchar]
+ ret += 4 * numRuns;
+ return ret;
+ }
+
+ /**
+ * Estimates the number of bytes needed to encode this column group
+ * in OLE format.
+ *
+ * @param numVals
+ * @param numOffs
+ * @param numSeqs
+ * @param numCols
+ * @return
+ */
+ protected static long getOLESize(int numVals, float numOffs, int numSeqs, int numCols) {
+ int ret = 0;
+ //distinct value tuples [double per col]
+ ret += 8 * numVals * numCols;
+ //offset/len fields per distinct value tuple [2xint]
+ ret += 8 * numVals;
+ //offset list data [1xchar]
+ ret += 2 * numOffs;
+ //offset list seqment headers [1xchar]
+ ret += 2 * numSeqs;
+ return ret;
+ }
+
+ /**
+ *
+ */
+ protected static class SizeEstimationFactors {
+ protected int numVals; //num value tuples
+ protected int numSegs; //num OLE segments
+ protected int numOffs; //num OLE offsets
+ protected int numRuns; //num RLE runs
+ protected int numSingle; //num singletons
+
+ protected SizeEstimationFactors(int numvals, int numsegs, int numoffs, int numruns, int numsingle) {
+ numVals = numvals;
+ numSegs = numsegs;
+ numOffs = numoffs;
+ numRuns = numruns;
+ numSingle = numsingle;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorExact.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorExact.java b/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorExact.java
new file mode 100644
index 0000000..557c518
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorExact.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress.estim;
+
+import org.apache.sysml.runtime.compress.BitmapEncoder;
+import org.apache.sysml.runtime.compress.UncompressedBitmap;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+
+/**
+ * Exact compressed size estimator (examines entire dataset).
+ *
+ */
+public class CompressedSizeEstimatorExact extends CompressedSizeEstimator
+{
+ public CompressedSizeEstimatorExact(MatrixBlock data) {
+ super(data);
+ }
+
+ @Override
+ public CompressedSizeInfo estimateCompressedColGroupSize(int[] colIndexes) {
+ return estimateCompressedColGroupSize(
+ BitmapEncoder.extractBitmap(colIndexes, _data));
+ }
+
+ @Override
+ public CompressedSizeInfo estimateCompressedColGroupSize(UncompressedBitmap ubm)
+ {
+ //compute size estimation factors
+ SizeEstimationFactors fact = computeSizeEstimationFactors(ubm, true);
+
+ //construct new size info summary
+ return new CompressedSizeInfo(fact.numVals,
+ getRLESize(fact.numVals, fact.numRuns, ubm.getNumColumns()),
+ getOLESize(fact.numVals, fact.numOffs, fact.numSegs, ubm.getNumColumns()));
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java b/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java
new file mode 100644
index 0000000..76a0f06
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeEstimatorSample.java
@@ -0,0 +1,767 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress.estim;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+
+import org.apache.commons.math3.distribution.ChiSquaredDistribution;
+import org.apache.commons.math3.random.RandomDataGenerator;
+import org.apache.sysml.hops.OptimizerUtils;
+import org.apache.sysml.runtime.compress.BitmapEncoder;
+import org.apache.sysml.runtime.compress.ReaderColumnSelection;
+import org.apache.sysml.runtime.compress.CompressedMatrixBlock;
+import org.apache.sysml.runtime.compress.ReaderColumnSelectionDense;
+import org.apache.sysml.runtime.compress.ReaderColumnSelectionDenseSample;
+import org.apache.sysml.runtime.compress.ReaderColumnSelectionSparse;
+import org.apache.sysml.runtime.compress.UncompressedBitmap;
+import org.apache.sysml.runtime.compress.utils.DblArray;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+
+public class CompressedSizeEstimatorSample extends CompressedSizeEstimator
+{
+ private static final boolean CORRECT_NONZERO_ESTIMATE = false; //TODO enable for production
+ private final static double SHLOSSER_JACKKNIFE_ALPHA = 0.975;
+ public static final float HAAS_AND_STOKES_ALPHA1 = 0.9F; //0.9 recommended in paper
+ public static final float HAAS_AND_STOKES_ALPHA2 = 30F; //30 recommended in paper
+ public static final float HAAS_AND_STOKES_UJ2A_C = 50; //50 recommend in paper
+
+ private int[] _sampleRows = null;
+ private RandomDataGenerator _rng = null;
+ private int _numRows = -1;
+
+ /**
+ *
+ * @param data
+ * @param sampleRows
+ */
+ public CompressedSizeEstimatorSample(MatrixBlock data, int[] sampleRows) {
+ super(data);
+ _sampleRows = sampleRows;
+ _rng = new RandomDataGenerator();
+ _numRows = CompressedMatrixBlock.TRANSPOSE_INPUT ?
+ _data.getNumColumns() : _data.getNumRows();
+ }
+
+ /**
+ *
+ * @param mb
+ * @param sampleSize
+ */
+ public CompressedSizeEstimatorSample(MatrixBlock mb, int sampleSize) {
+ this(mb, null);
+ _sampleRows = getSortedUniformSample(_numRows, sampleSize);
+ }
+
+ /**
+ *
+ * @param sampleRows, assumed to be sorted
+ */
+ public void setSampleRows(int[] sampleRows) {
+ _sampleRows = sampleRows;
+ }
+
+ /**
+ *
+ * @param sampleSize
+ */
+ public void resampleRows(int sampleSize) {
+ _sampleRows = getSortedUniformSample(_numRows, sampleSize);
+ }
+
+ @Override
+ public CompressedSizeInfo estimateCompressedColGroupSize(int[] colIndexes)
+ {
+ //extract statistics from sample
+ UncompressedBitmap ubm = BitmapEncoder.extractBitmapFromSample(
+ colIndexes, _data, _sampleRows);
+ SizeEstimationFactors fact = computeSizeEstimationFactors(ubm, false);
+
+ //estimate number of distinct values
+ int totalCardinality = getNumDistinctValues(colIndexes);
+ totalCardinality = Math.max(totalCardinality, fact.numVals); //fix anomalies w/ large sample fraction
+ totalCardinality = Math.min(totalCardinality, _numRows); //fix anomalies w/ large sample fraction
+
+ //estimate unseen values
+ // each unseen is assumed to occur only once (it did not show up in the sample because it is rare)
+ int unseen = Math.max(0, totalCardinality - fact.numVals);
+ int sampleSize = _sampleRows.length;
+
+ //estimate number of offsets
+ double sparsity = OptimizerUtils.getSparsity(
+ _data.getNumRows(), _data.getNumColumns(), _data.getNonZeros());
+
+ // expected value given that we don't store the zero values
+ float totalNumOffs = (float) (_numRows * (1 - Math.pow(1 - sparsity,colIndexes.length)));
+ if( CORRECT_NONZERO_ESTIMATE ) {
+ long numZeros = sampleSize - fact.numOffs;
+ float C = Math.max(1-(float)fact.numSingle/sampleSize, (float)sampleSize/_numRows);
+ totalNumOffs = _numRows - ((numZeros>0)? (float)_numRows/sampleSize*C*numZeros : 0);
+ }
+
+ // For a single offset, the number of blocks depends on the value of
+ // that offset. small offsets (first group of rows in the matrix)
+ // require a small number of blocks and large offsets (last group of
+ // rows) require a large number of blocks. The unseen offsets are
+ // distributed over the entire offset range. A reasonable and fast
+ // estimate for the number of blocks is to use the arithmetic mean of
+ // the number of blocks used for the first index (=1) and that of the
+ // last index.
+ int numUnseenSeg = Math.round(unseen
+ * (2.0f * BitmapEncoder.BITMAP_BLOCK_SZ + _numRows) / 2
+ / BitmapEncoder.BITMAP_BLOCK_SZ);
+ int totalNumSeg = fact.numSegs + numUnseenSeg;
+ int totalNumRuns = getNumRuns(ubm, sampleSize, _numRows) + unseen;
+
+ //construct new size info summary
+ return new CompressedSizeInfo(totalCardinality,
+ getRLESize(totalCardinality, totalNumRuns, colIndexes.length),
+ getOLESize(totalCardinality, totalNumOffs, totalNumSeg, colIndexes.length));
+ }
+
+ @Override
+ public CompressedSizeInfo estimateCompressedColGroupSize(UncompressedBitmap ubm)
+ {
+ //compute size estimation factors
+ SizeEstimationFactors fact = computeSizeEstimationFactors(ubm, true);
+
+ //construct new size info summary
+ return new CompressedSizeInfo(fact.numVals,
+ getRLESize(fact.numVals, fact.numRuns, ubm.getNumColumns()),
+ getOLESize(fact.numVals, fact.numOffs, fact.numSegs, ubm.getNumColumns()));
+ }
+
+ /**
+ *
+ * @param colIndexes
+ * @return
+ */
+ private int getNumDistinctValues(int[] colIndexes) {
+ return haasAndStokes(colIndexes);
+ }
+
+ /**
+ *
+ * @param sampleUncompressedBitmap
+ * @param sampleSize
+ * @param totalNumRows
+ * @return
+ */
+ private int getNumRuns(UncompressedBitmap sampleUncompressedBitmap,
+ int sampleSize, int totalNumRows) {
+ int numVals = sampleUncompressedBitmap.getNumValues();
+ // all values in the sample are zeros
+ if (numVals == 0)
+ return 0;
+ float numRuns = 0;
+ for (int vi = 0; vi < numVals; vi++) {
+ int[] offsets = sampleUncompressedBitmap.getOffsetsList(vi);
+ float offsetsRatio = ((float) offsets.length) / sampleSize;
+ float avgAdditionalOffsets = offsetsRatio * totalNumRows
+ / sampleSize;
+ if (avgAdditionalOffsets < 1) {
+ // Ising-Stevens does not hold
+ // fall-back to using the expected number of offsets as an upper
+ // bound on the number of runs
+ numRuns += ((float) offsets.length) * totalNumRows / sampleSize;
+ continue;
+ }
+ int intervalEnd, intervalSize;
+ float additionalOffsets;
+ // probability of an index being non-offset in current and previous
+ // interval respectively
+ float nonOffsetProb, prevNonOffsetProb = 1;
+ boolean reachedSampleEnd = false;
+ // handling the first interval separately for simplicity
+ int intervalStart = -1;
+ if (_sampleRows[0] == 0) {
+ // empty interval
+ intervalStart = 0;
+ } else {
+ intervalEnd = _sampleRows[0];
+ intervalSize = intervalEnd - intervalStart - 1;
+ // expected value of a multivariate hypergeometric distribution
+ additionalOffsets = offsetsRatio * intervalSize;
+ // expected value of an Ising-Stevens distribution
+ numRuns += (intervalSize - additionalOffsets)
+ * additionalOffsets / intervalSize;
+ intervalStart = intervalEnd;
+ prevNonOffsetProb = (intervalSize - additionalOffsets)
+ / intervalSize;
+ }
+ // for handling separators
+
+ int withinSepRun = 0;
+ boolean seenNonOffset = false, startedWithOffset = false, endedWithOffset = false;
+ int offsetsPtrs = 0;
+ for (int ix = 1; ix < sampleSize; ix++) {
+ // start of a new separator
+ // intervalStart will always be pointing at the current value
+ // in the separator block
+
+ if (offsetsPtrs < offsets.length
+ && offsets[offsetsPtrs] == intervalStart) {
+ startedWithOffset = true;
+ offsetsPtrs++;
+ endedWithOffset = true;
+ } else {
+ seenNonOffset = true;
+ endedWithOffset = false;
+ }
+ while (intervalStart + 1 == _sampleRows[ix]) {
+ intervalStart = _sampleRows[ix];
+ if (seenNonOffset) {
+ if (offsetsPtrs < offsets.length
+ && offsets[offsetsPtrs] == intervalStart) {
+ withinSepRun = 1;
+ offsetsPtrs++;
+ endedWithOffset = true;
+ } else {
+ numRuns += withinSepRun;
+ withinSepRun = 0;
+ endedWithOffset = false;
+ }
+ } else if (offsetsPtrs < offsets.length
+ && offsets[offsetsPtrs] == intervalStart) {
+ offsetsPtrs++;
+ endedWithOffset = true;
+ } else {
+ seenNonOffset = true;
+ endedWithOffset = false;
+ }
+ //
+ ix++;
+ if (ix == sampleSize) {
+ // end of sample which searching for a start
+ reachedSampleEnd = true;
+ break;
+ }
+ }
+
+ // runs within an interval of unknowns
+ if (reachedSampleEnd)
+ break;
+ intervalEnd = _sampleRows[ix];
+ intervalSize = intervalEnd - intervalStart - 1;
+ // expected value of a multivariate hypergeometric distribution
+ additionalOffsets = offsetsRatio * intervalSize;
+ // expected value of an Ising-Stevens distribution
+ numRuns += (intervalSize - additionalOffsets)
+ * additionalOffsets / intervalSize;
+ nonOffsetProb = (intervalSize - additionalOffsets)
+ / intervalSize;
+
+ // additional runs resulting from x's on the boundaries of the
+ // separators
+ // endedWithOffset = findInArray(offsets, intervalStart) != -1;
+ if (seenNonOffset) {
+ if (startedWithOffset) {
+ // add p(y in the previous interval)
+ numRuns += prevNonOffsetProb;
+ }
+ if (endedWithOffset) {
+ // add p(y in the current interval)
+ numRuns += nonOffsetProb;
+ }
+ } else {
+ // add p(y in the previous interval and y in the current
+ // interval)
+ numRuns += prevNonOffsetProb * nonOffsetProb;
+ }
+ prevNonOffsetProb = nonOffsetProb;
+ intervalStart = intervalEnd;
+ // reseting separator variables
+ seenNonOffset = startedWithOffset = endedWithOffset = false;
+ withinSepRun = 0;
+
+ }
+ // last possible interval
+ if (intervalStart != totalNumRows - 1) {
+ intervalEnd = totalNumRows;
+ intervalSize = intervalEnd - intervalStart - 1;
+ // expected value of a multivariate hypergeometric distribution
+ additionalOffsets = offsetsRatio * intervalSize;
+ // expected value of an Ising-Stevens distribution
+ numRuns += (intervalSize - additionalOffsets)
+ * additionalOffsets / intervalSize;
+ nonOffsetProb = (intervalSize - additionalOffsets)
+ / intervalSize;
+ } else {
+ nonOffsetProb = 1;
+ }
+ // additional runs resulting from x's on the boundaries of the
+ // separators
+ endedWithOffset = intervalStart == offsets[offsets.length - 1];
+ if (seenNonOffset) {
+ if (startedWithOffset) {
+ numRuns += prevNonOffsetProb;
+ }
+ if (endedWithOffset) {
+ // add p(y in the current interval)
+ numRuns += nonOffsetProb;
+ }
+ } else {
+ if (endedWithOffset)
+ // add p(y in the previous interval and y in the current
+ // interval)
+ numRuns += prevNonOffsetProb * nonOffsetProb;
+ }
+ }
+ return Math.round(numRuns);
+ }
+
+ /**
+ *
+ * @param colIndexes
+ * @return
+ */
+ private int haasAndStokes(int[] colIndexes) {
+ ReaderColumnSelection reader = new ReaderColumnSelectionDenseSample(_data,
+ colIndexes, _sampleRows, !CompressedMatrixBlock.MATERIALIZE_ZEROS);
+ return haasAndStokes(_numRows, _sampleRows.length, reader);
+ }
+
+ /**
+ * TODO remove, just for local debugging.
+ *
+ * @param colIndexes
+ * @return
+ */
+ @SuppressWarnings("unused")
+ private int getExactNumDistinctValues(int[] colIndexes) {
+ HashSet<DblArray> distinctVals = new HashSet<DblArray>();
+ ReaderColumnSelection reader = (_data.isInSparseFormat() && CompressedMatrixBlock.TRANSPOSE_INPUT) ?
+ new ReaderColumnSelectionSparse(_data, colIndexes, !CompressedMatrixBlock.MATERIALIZE_ZEROS) :
+ new ReaderColumnSelectionDense(_data, colIndexes, !CompressedMatrixBlock.MATERIALIZE_ZEROS);
+ DblArray val = null;
+ while (null != (val = reader.nextRow()))
+ distinctVals.add(val);
+ return distinctVals.size();
+ }
+
+ /**
+ * Returns a sorted array of n integers, drawn uniformly from the range [0,range).
+ *
+ * @param range
+ * @param smplSize
+ * @return
+ */
+ private int[] getSortedUniformSample(int range, int smplSize) {
+ if (smplSize == 0)
+ return new int[] {};
+ int[] sample = _rng.nextPermutation(range, smplSize);
+ Arrays.sort(sample);
+ return sample;
+ }
+
+
+ /////////////////////////////////////////////////////
+ // Sample Cardinality Estimator library
+ /////////////////////////////////////////
+
+ /**
+ * M. Charikar, S. Chaudhuri, R. Motwani, and V. R. Narasayya, Towards
+ * estimation error guarantees for distinct values, PODS'00.
+ *
+ * @param nRows
+ * @param sampleSize
+ * @param sampleRowsReader
+ * : a reader for the sampled rows
+ * @return
+ */
+ @SuppressWarnings("unused")
+ private static int guaranteedErrorEstimator(int nRows, int sampleSize,
+ ReaderColumnSelection sampleRowsReader) {
+ HashMap<DblArray, Integer> valsCount = getValCounts(sampleRowsReader);
+ // number of values that occur only once
+ int singltonValsCount = 0;
+ int otherValsCount = 0;
+ for (Integer c : valsCount.values()) {
+ if (c == 1)
+ singltonValsCount++;
+ else
+ otherValsCount++;
+ }
+ return (int) Math.round(otherValsCount + singltonValsCount
+ * Math.sqrt(((double) nRows) / sampleSize));
+ }
+
+ /**
+ * Peter J. Haas, Jeffrey F. Naughton, S. Seshadri, and Lynne Stokes.
+ * Sampling-Based Estimation of the Number of Distinct Values of an
+ * Attribute. VLDB'95, Section 3.2.
+ *
+ * @param nRows
+ * @param sampleSize
+ * @param sampleRowsReader
+ * @return
+ */
+ @SuppressWarnings("unused")
+ private static int shlosserEstimator(int nRows, int sampleSize,
+ ReaderColumnSelection sampleRowsReader)
+ {
+ return shlosserEstimator(nRows, sampleSize, sampleRowsReader,
+ getValCounts(sampleRowsReader));
+ }
+
+ /**
+ *
+ * @param nRows
+ * @param sampleSize
+ * @param sampleRowsReader
+ * @param valsCount
+ * @return
+ */
+ private static int shlosserEstimator(int nRows, int sampleSize,
+ ReaderColumnSelection sampleRowsReader,
+ HashMap<DblArray, Integer> valsCount)
+ {
+ double q = ((double) sampleSize) / nRows;
+ double oneMinusQ = 1 - q;
+
+ int[] freqCounts = getFreqCounts(valsCount);
+
+ double numerSum = 0, denomSum = 0;
+ int iPlusOne = 1;
+ for (int i = 0; i < freqCounts.length; i++, iPlusOne++) {
+ numerSum += Math.pow(oneMinusQ, iPlusOne) * freqCounts[i];
+ denomSum += iPlusOne * q * Math.pow(oneMinusQ, i) * freqCounts[i];
+ }
+ int estimate = (int) Math.round(valsCount.size() + freqCounts[0]
+ * numerSum / denomSum);
+ return estimate < 1 ? 1 : estimate;
+ }
+
+ /**
+ * Peter J. Haas, Jeffrey F. Naughton, S. Seshadri, and Lynne Stokes.
+ * Sampling-Based Estimation of the Number of Distinct Values of an
+ * Attribute. VLDB'95, Section 4.3.
+ *
+ * @param nRows
+ * @param sampleSize
+ * @param sampleRowsReader
+ * @return
+ */
+ @SuppressWarnings("unused")
+ private static int smoothedJackknifeEstimator(int nRows, int sampleSize,
+ ReaderColumnSelection sampleRowsReader)
+ {
+ return smoothedJackknifeEstimator(nRows, sampleSize, sampleRowsReader,
+ getValCounts(sampleRowsReader));
+ }
+
+ /**
+ *
+ * @param nRows
+ * @param sampleSize
+ * @param sampleRowsReader
+ * @param valsCount
+ * @return
+ */
+ private static int smoothedJackknifeEstimator(int nRows, int sampleSize,
+ ReaderColumnSelection sampleRowsReader,
+ HashMap<DblArray, Integer> valsCount)
+ {
+ int[] freqCounts = getFreqCounts(valsCount);
+ // all values in the sample are zeros
+ if (freqCounts.length == 0)
+ return 0;
+ // nRows is N and sampleSize is n
+
+ int d = valsCount.size();
+ double f1 = freqCounts[0];
+ int Nn = nRows * sampleSize;
+ double D0 = (d - f1 / sampleSize)
+ / (1 - (nRows - sampleSize + 1) * f1 / Nn);
+ double NTilde = nRows / D0;
+ /*-
+ *
+ * h (as defined in eq. 5 in the paper) can be implemented as:
+ *
+ * double h = Gamma(nRows - NTilde + 1) x Gamma.gamma(nRows -sampleSize + 1)
+ * ----------------------------------------------------------------
+ * Gamma.gamma(nRows - sampleSize - NTilde + 1) x Gamma.gamma(nRows + 1)
+ *
+ *
+ * However, for large values of nRows, Gamma.gamma returns NAN
+ * (factorial of a very large number).
+ *
+ * The following implementation solves this problem by levaraging the
+ * cancelations that show up when expanding the factorials in the
+ * numerator and the denominator.
+ *
+ *
+ * min(A,D-1) x [min(A,D-1) -1] x .... x B
+ * h = -------------------------------------------
+ * C x [C-1] x .... x max(A+1,D)
+ *
+ * where A = N-\tilde{N}
+ * B = N-\tilde{N} - n + a
+ * C = N
+ * D = N-n+1
+ *
+ *
+ *
+ */
+ double A = (int) nRows - NTilde;
+ double B = A - sampleSize + 1;
+ double C = nRows;
+ double D = nRows - sampleSize + 1;
+ A = Math.min(A, D - 1);
+ D = Math.max(A + 1, D);
+ double h = 1;
+
+ for (; A >= B || C >= D; A--, C--) {
+ if (A >= B)
+ h *= A;
+ if (C >= D)
+ h /= C;
+ }
+ // end of h computation
+
+ double g = 0, gamma = 0;
+ // k here corresponds to k+1 in the paper (the +1 comes from replacing n
+ // with n-1)
+ for (int k = 2; k <= sampleSize + 1; k++) {
+ g += 1.0 / (nRows - NTilde - sampleSize + k);
+ }
+ for (int i = 1; i <= freqCounts.length; i++) {
+ gamma += i * (i - 1) * freqCounts[i - 1];
+ }
+ gamma *= (nRows - 1) * D0 / Nn / (sampleSize - 1);
+ gamma += D0 / nRows - 1;
+
+ double estimate = (d + nRows * h * g * gamma)
+ / (1 - (nRows - NTilde - sampleSize + 1) * f1 / Nn);
+ return estimate < 1 ? 1 : (int) Math.round(estimate);
+ }
+
+ /**
+ * Peter J. Haas, Jeffrey F. Naughton, S. Seshadri, and Lynne Stokes. 1995.
+ * Sampling-Based Estimation of the Number of Distinct Values of an
+ * Attribute. VLDB'95, Section 5.2, recommended estimator by the authors
+ *
+ * @param nRows
+ * @param sampleSize
+ * @param sampleRowsReader
+ * @return
+ */
+ @SuppressWarnings("unused")
+ private static int shlosserJackknifeEstimator(int nRows, int sampleSize,
+ ReaderColumnSelection sampleRowsReader) {
+ HashMap<DblArray, Integer> valsCount = getValCounts(sampleRowsReader);
+
+ // uniformity chi-square test
+ double nBar = ((double) sampleSize) / valsCount.size();
+ // test-statistic
+ double u = 0;
+ for (int cnt : valsCount.values()) {
+ u += Math.pow(cnt - nBar, 2);
+ }
+ u /= nBar;
+ if (sampleSize != usedSampleSize)
+ computeCriticalValue(sampleSize);
+ if (u < uniformityCriticalValue) {
+ // uniform
+ return smoothedJackknifeEstimator(nRows, sampleSize,
+ sampleRowsReader, valsCount);
+ } else {
+ return shlosserEstimator(nRows, sampleSize, sampleRowsReader,
+ valsCount);
+ }
+ }
+
+ /*
+ * In the shlosserSmoothedJackknifeEstimator as long as the sample size did
+ * not change, we will have the same critical value each time the estimator
+ * is used (given that alpha is the same). We cache the critical value to
+ * avoid recomputing it in each call.
+ */
+ private static double uniformityCriticalValue;
+ private static int usedSampleSize;
+
+ private static void computeCriticalValue(int sampleSize) {
+ ChiSquaredDistribution chiSqr = new ChiSquaredDistribution(sampleSize - 1);
+ uniformityCriticalValue = chiSqr.inverseCumulativeProbability(SHLOSSER_JACKKNIFE_ALPHA);
+ usedSampleSize = sampleSize;
+ }
+
+ /**
+ * Haas, Peter J., and Lynne Stokes.
+ * "Estimating the number of classes in a finite population." Journal of the
+ * American Statistical Association 93.444 (1998): 1475-1487.
+ *
+ * The hybrid estimator given by Eq. 33 in Section 6
+ *
+ * @param nRows
+ * @param sampleSize
+ * @param sampleRowsReader
+ * @return
+ */
+ private static int haasAndStokes(int nRows, int sampleSize,
+ ReaderColumnSelection sampleRowsReader)
+ {
+ HashMap<DblArray, Integer> valsCount = getValCounts(sampleRowsReader);
+ // all values in the sample are zeros.
+ if (valsCount.size() == 0)
+ return 1;
+ int[] freqCounts = getFreqCounts(valsCount);
+ float q = ((float) sampleSize) / nRows;
+ float _1MinusQ = 1 - q;
+ // Eq. 11
+ float duj1Fraction = ((float) sampleSize)
+ / (sampleSize - _1MinusQ * freqCounts[0]);
+ float duj1 = duj1Fraction * valsCount.size();
+ // Eq. 16
+ float gamma = 0;
+ for (int i = 1; i <= freqCounts.length; i++) {
+ gamma += i * (i - 1) * freqCounts[i - 1];
+ }
+ gamma *= duj1 / sampleSize / sampleSize;
+ gamma += duj1 / nRows - 1;
+ gamma = Math.max(gamma, 0);
+ int estimate;
+
+ if (gamma < HAAS_AND_STOKES_ALPHA1) {
+ // UJ2 - begining of page 1479
+ // System.out.println("uj2");
+ estimate = (int) (duj1Fraction * (valsCount.size() - freqCounts[0]
+ * _1MinusQ * Math.log(_1MinusQ) * gamma / q));
+ } else if (gamma < HAAS_AND_STOKES_ALPHA2) {
+ // UJ2a - end of page 1998
+ //System.out.println("uj2a");
+ int numRemovedClasses = 0;
+ float updatedNumRows = nRows;
+ int updatedSampleSize = sampleSize;
+
+ for (Integer cnt : valsCount.values()) {
+ if (cnt > HAAS_AND_STOKES_UJ2A_C) {
+ numRemovedClasses++;
+ freqCounts[cnt - 1]--;
+ updatedSampleSize -= cnt;
+ /*
+ * To avoid solving Eq. 20 numerically for the class size in
+ * the full population (N_j), the current implementation
+ * just scales cnt (n_j) by the sampling ratio (q).
+ * Intuitively, the scaling should be fine since cnt is
+ * large enough. Also, N_j in Eq. 20 is lower-bounded by cnt
+ * which is already large enough to make the denominator in
+ * Eq. 20 very close to 1.
+ */
+ updatedNumRows -= ((float) cnt) / q;
+ }
+ }
+ if (updatedSampleSize == 0) {
+ // use uJ2a
+
+ estimate = (int) (duj1Fraction * (valsCount.size() - freqCounts[0]
+ * (_1MinusQ) * Math.log(_1MinusQ) * gamma / q));
+ } else {
+ float updatedQ = ((float) updatedSampleSize) / updatedNumRows;
+ int updatedSampleCardinality = valsCount.size()
+ - numRemovedClasses;
+ float updatedDuj1Fraction = ((float) updatedSampleSize)
+ / (updatedSampleSize - (1 - updatedQ) * freqCounts[0]);
+ float updatedDuj1 = updatedDuj1Fraction
+ * updatedSampleCardinality;
+ float updatedGamma = 0;
+ for (int i = 1; i <= freqCounts.length; i++) {
+ updatedGamma += i * (i - 1) * freqCounts[i - 1];
+ }
+ updatedGamma *= updatedDuj1 / updatedSampleSize
+ / updatedSampleSize;
+ updatedGamma += updatedDuj1 / updatedNumRows - 1;
+ updatedGamma = Math.max(updatedGamma, 0);
+
+ estimate = (int) (updatedDuj1Fraction * (updatedSampleCardinality - freqCounts[0]
+ * (1 - updatedQ)
+ * Math.log(1 - updatedQ)
+ * updatedGamma / updatedQ))
+ + numRemovedClasses;
+ }
+
+ } else {
+ // Sh3 - end of section 3
+ float fraq1Numer = 0;
+ float fraq1Denom = 0;
+ float fraq2Numer = 0;
+ float fraq2Denom = 0;
+ for (int i = 1; i <= freqCounts.length; i++) {
+ fraq1Numer += i * q * q * Math.pow(1 - q * q, i - 1)
+ * freqCounts[i - 1];
+ fraq1Denom += Math.pow(_1MinusQ, i) * (Math.pow(1 + q, i) - 1)
+ * freqCounts[i - 1];
+ fraq2Numer += Math.pow(_1MinusQ, i) * freqCounts[i - 1];
+ fraq2Denom += i * q * Math.pow(_1MinusQ, i - 1)
+ * freqCounts[i - 1];
+ }
+ estimate = (int) (valsCount.size() + freqCounts[0] * fraq1Numer
+ / fraq1Denom * fraq2Numer * fraq2Numer / fraq2Denom
+ / fraq2Denom);
+ }
+ return estimate < 1 ? 1 : estimate;
+ }
+
+ /**
+ *
+ * @param sampleRowsReader
+ * @return
+ */
+ private static HashMap<DblArray, Integer> getValCounts(
+ ReaderColumnSelection sampleRowsReader)
+ {
+ HashMap<DblArray, Integer> valsCount = new HashMap<DblArray, Integer>();
+ DblArray val = null;
+ Integer cnt;
+ while (null != (val = sampleRowsReader.nextRow())) {
+ cnt = valsCount.get(val);
+ if (cnt == null)
+ cnt = 0;
+ cnt++;
+ valsCount.put(val, cnt);
+ }
+ return valsCount;
+ }
+
+ /**
+ *
+ * @param valsCount
+ * @return
+ */
+ private static int[] getFreqCounts(HashMap<DblArray, Integer> valsCount)
+ {
+ int maxCount = 0;
+ for (Integer c : valsCount.values()) {
+ if (c > maxCount)
+ maxCount = c;
+ }
+
+ /*
+ * freqCounts[i-1] = how many values occured with a frequecy i
+ */
+ int[] freqCounts = new int[maxCount];
+ for (Integer c : valsCount.values()) {
+ freqCounts[c - 1]++;
+ }
+ return freqCounts;
+
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeInfo.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeInfo.java b/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeInfo.java
new file mode 100644
index 0000000..834483e
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/estim/CompressedSizeInfo.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress.estim;
+
+/**
+ *
+ * A helper reusable object for maintaining bitmap sizes
+ */
+public class CompressedSizeInfo
+{
+ private int _estCard = -1;
+ private long _rleSize = -1;
+ private long _oleSize = -1;
+
+ public CompressedSizeInfo() {
+
+ }
+
+ public CompressedSizeInfo(int estCard, long rleSize, long oleSize) {
+ _estCard = estCard;
+ _rleSize = rleSize;
+ _oleSize = oleSize;
+ }
+
+ public void setRLESize(long rleSize) {
+ _rleSize = rleSize;
+ }
+
+ public long getRLESize() {
+ return _rleSize;
+ }
+
+ public void setOLESize(long oleSize) {
+ _oleSize = oleSize;
+ }
+
+ public long getOLESize() {
+ return _oleSize;
+ }
+
+ public long getMinSize() {
+ return Math.min(_rleSize, _oleSize);
+ }
+
+ public void setEstCardinality(int estCard) {
+ _estCard = estCard;
+ }
+
+ public int getEstCarinality() {
+ return _estCard;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/estim/SizeEstimatorFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/estim/SizeEstimatorFactory.java b/src/main/java/org/apache/sysml/runtime/compress/estim/SizeEstimatorFactory.java
new file mode 100644
index 0000000..f857b5b
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/estim/SizeEstimatorFactory.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress.estim;
+
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+
+public class SizeEstimatorFactory
+{
+ public static final float SAMPLING_RATIO = 0.01f; //conservative default
+
+ /**
+ *
+ * @param data
+ * @param numRows
+ * @return
+ */
+ @SuppressWarnings("unused")
+ public static CompressedSizeEstimator getSizeEstimator(MatrixBlock data, int numRows) {
+ return (SAMPLING_RATIO == 1.0) ?
+ new CompressedSizeEstimatorExact(data):
+ new CompressedSizeEstimatorSample(data, (int) (numRows*SAMPLING_RATIO));
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/utils/ConverterUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/utils/ConverterUtils.java b/src/main/java/org/apache/sysml/runtime/compress/utils/ConverterUtils.java
new file mode 100644
index 0000000..e87ac29
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/utils/ConverterUtils.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress.utils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+
+import org.apache.sysml.runtime.compress.ColGroup;
+import org.apache.sysml.runtime.compress.ColGroupOLE;
+import org.apache.sysml.runtime.compress.ColGroupRLE;
+import org.apache.sysml.runtime.compress.ColGroupUncompressed;
+import org.apache.sysml.runtime.matrix.data.MatrixBlock;
+import org.apache.sysml.runtime.util.DataConverter;
+
+public class ConverterUtils
+{
+ /**
+ * Copy col group instance with deep copy of column indices but
+ * shallow copy of actual contents;
+ *
+ * @param group
+ * @return
+ */
+ public static ColGroup copyColGroup(ColGroup group)
+ {
+ ColGroup ret = null;
+
+ //deep copy col indices
+ int[] colIndices = Arrays.copyOf(group.getColIndices(), group.getNumCols());
+
+ //create copy of column group
+ if( group instanceof ColGroupUncompressed ) {
+ ColGroupUncompressed in = (ColGroupUncompressed)group;
+ ret = new ColGroupUncompressed(colIndices, in.getNumRows(), in.getData());
+ }
+ else if( group instanceof ColGroupRLE ) {
+ ColGroupRLE in = (ColGroupRLE)group;
+ ret = new ColGroupRLE(colIndices, in.getNumRows(), in.getValues(),
+ in.getBitmaps(), in.getBitmapOffsets());
+ }
+ else if( group instanceof ColGroupOLE ) {
+ ColGroupOLE in = (ColGroupOLE) group;
+ ret = new ColGroupOLE(colIndices, in.getNumRows(), in.getValues(),
+ in.getBitmaps(), in.getBitmapOffsets());
+ }
+
+ return ret;
+ }
+
+ /**
+ *
+ * @param vector
+ * @return
+ */
+ public static double[] getDenseVector( MatrixBlock vector )
+ {
+ if( vector.isInSparseFormat() )
+ return DataConverter.convertToDoubleVector(vector);
+ else
+ return vector.getDenseBlock();
+ }
+
+ /**
+ *
+ * @param group
+ * @return
+ */
+ public static MatrixBlock getUncompressedColBlock( ColGroup group )
+ {
+ MatrixBlock ret = null;
+ if( group instanceof ColGroupUncompressed ) {
+ ret = ((ColGroupUncompressed) group).getData();
+ }
+ else {
+ ArrayList<ColGroup> tmpGroup = new ArrayList<ColGroup>(Arrays.asList(group));
+ ColGroupUncompressed decompressedCols = new ColGroupUncompressed(tmpGroup);
+ ret = decompressedCols.getData();
+ }
+
+ return ret;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/utils/DblArray.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/utils/DblArray.java b/src/main/java/org/apache/sysml/runtime/compress/utils/DblArray.java
new file mode 100644
index 0000000..49c163b
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/utils/DblArray.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress.utils;
+
+import java.util.Arrays;
+
+/**
+ * Helper class used for bitmap extraction.
+ *
+ */
+public class DblArray
+{
+ private double[] _arr = null;
+ private boolean _zero = false;
+
+ public DblArray() {
+ this(null, false);
+ }
+
+ public DblArray(double[] arr) {
+ this(arr, false);
+ }
+
+ public DblArray(DblArray that) {
+ this(Arrays.copyOf(that._arr, that._arr.length), that._zero);
+ }
+
+ public DblArray(double[] arr, boolean allZeros) {
+ _arr = arr;
+ _zero = allZeros;
+ }
+
+ public double[] getData() {
+ return _arr;
+ }
+
+ @Override
+ public int hashCode() {
+ return _zero ? 0 : Arrays.hashCode(_arr);
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ return ( o instanceof DblArray
+ && _zero == ((DblArray) o)._zero
+ && Arrays.equals(_arr, ((DblArray) o)._arr) );
+ }
+
+ @Override
+ public String toString() {
+ return Arrays.toString(_arr);
+ }
+
+ /**
+ *
+ * @param ds
+ * @return
+ */
+ public static boolean isZero(double[] ds) {
+ for (int i = 0; i < ds.length; i++)
+ if (ds[i] != 0.0)
+ return false;
+ return true;
+ }
+
+ /**
+ *
+ * @param val
+ * @return
+ */
+ public static boolean isZero(DblArray val) {
+ return val._zero || isZero(val._arr);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/utils/DblArrayIntListHashMap.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/utils/DblArrayIntListHashMap.java b/src/main/java/org/apache/sysml/runtime/compress/utils/DblArrayIntListHashMap.java
new file mode 100644
index 0000000..a5455ab
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/utils/DblArrayIntListHashMap.java
@@ -0,0 +1,179 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress.utils;
+
+import java.util.ArrayList;
+
+/**
+ * This class provides a memory-efficient replacement for
+ * HashMap<DblArray,IntArrayList> for restricted use cases.
+ *
+ */
+public class DblArrayIntListHashMap
+{
+ private static final int INIT_CAPACITY = 8;
+ private static final int RESIZE_FACTOR = 2;
+ private static final float LOAD_FACTOR = 0.75f;
+
+ private DArrayIListEntry[] _data = null;
+ private int _size = -1;
+
+ public DblArrayIntListHashMap() {
+ _data = new DArrayIListEntry[INIT_CAPACITY];
+ _size = 0;
+ }
+
+ /**
+ *
+ * @return
+ */
+ public int size() {
+ return _size;
+ }
+
+ /**
+ *
+ * @param key
+ * @return
+ */
+ public IntArrayList get(DblArray key) {
+ // probe for early abort
+ if( _size == 0 )
+ return null;
+
+ // compute entry index position
+ int hash = hash(key);
+ int ix = indexFor(hash, _data.length);
+
+ // find entry
+ for( DArrayIListEntry e = _data[ix]; e != null; e = e.next ) {
+ if( e.key.equals(key) ) {
+ return e.value;
+ }
+ }
+
+ return null;
+ }
+
+ /**
+ *
+ * @param key
+ * @param value
+ */
+ public void appendValue(DblArray key, IntArrayList value) {
+ // compute entry index position
+ int hash = hash(key);
+ int ix = indexFor(hash, _data.length);
+
+ // add new table entry (constant time)
+ DArrayIListEntry enew = new DArrayIListEntry(key, value);
+ enew.next = _data[ix]; // colliding entries / null
+ _data[ix] = enew;
+ _size++;
+
+ // resize if necessary
+ if( _size >= LOAD_FACTOR * _data.length )
+ resize();
+ }
+
+ /**
+ *
+ * @return
+ */
+ public ArrayList<DArrayIListEntry> extractValues() {
+ ArrayList<DArrayIListEntry> ret = new ArrayList<DArrayIListEntry>();
+ for( DArrayIListEntry e : _data ) {
+ if( e != null ) {
+ while( e.next != null ) {
+ ret.add(e);
+ e = e.next;
+ }
+ ret.add(e);
+ }
+ }
+
+ return ret;
+ }
+
+ /**
+ *
+ */
+ private void resize() {
+ // check for integer overflow on resize
+ if( _data.length > Integer.MAX_VALUE / RESIZE_FACTOR )
+ return;
+
+ // resize data array and copy existing contents
+ DArrayIListEntry[] olddata = _data;
+ _data = new DArrayIListEntry[_data.length * RESIZE_FACTOR];
+ _size = 0;
+
+ // rehash all entries
+ for( DArrayIListEntry e : olddata ) {
+ if( e != null ) {
+ while( e.next != null ) {
+ appendValue(e.key, e.value);
+ e = e.next;
+ }
+ appendValue(e.key, e.value);
+ }
+ }
+ }
+
+ /**
+ *
+ * @param key
+ * @return
+ */
+ private static int hash(DblArray key) {
+ int h = key.hashCode();
+
+ // This function ensures that hashCodes that differ only by
+ // constant multiples at each bit position have a bounded
+ // number of collisions (approximately 8 at default load factor).
+ h ^= (h >>> 20) ^ (h >>> 12);
+ return h ^ (h >>> 7) ^ (h >>> 4);
+ }
+
+ /**
+ *
+ * @param h
+ * @param length
+ * @return
+ */
+ private static int indexFor(int h, int length) {
+ return h & (length - 1);
+ }
+
+ /**
+ *
+ */
+ public class DArrayIListEntry {
+ public DblArray key;
+ public IntArrayList value;
+ public DArrayIListEntry next;
+
+ public DArrayIListEntry(DblArray ekey, IntArrayList evalue) {
+ key = ekey;
+ value = evalue;
+ next = null;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/utils/DoubleIntListHashMap.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/utils/DoubleIntListHashMap.java b/src/main/java/org/apache/sysml/runtime/compress/utils/DoubleIntListHashMap.java
new file mode 100644
index 0000000..5607a3f
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/utils/DoubleIntListHashMap.java
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress.utils;
+
+import java.util.ArrayList;
+
+/**
+ * This class provides a memory-efficient replacement for
+ * HashMap<Double,IntArrayList> for restricted use cases.
+ *
+ */
+public class DoubleIntListHashMap
+{
+ private static final int INIT_CAPACITY = 8;
+ private static final int RESIZE_FACTOR = 2;
+ private static final float LOAD_FACTOR = 0.75f;
+
+ private DIListEntry[] _data = null;
+ private int _size = -1;
+
+ public DoubleIntListHashMap() {
+ _data = new DIListEntry[INIT_CAPACITY];
+ _size = 0;
+ }
+
+ /**
+ *
+ * @return
+ */
+ public int size() {
+ return _size;
+ }
+
+ /**
+ *
+ * @param key
+ * @return
+ */
+ public IntArrayList get(double key) {
+ // probe for early abort
+ if( _size == 0 )
+ return null;
+
+ // compute entry index position
+ int hash = hash(key);
+ int ix = indexFor(hash, _data.length);
+
+ // find entry
+ for( DIListEntry e = _data[ix]; e != null; e = e.next ) {
+ if( e.key == key ) {
+ return e.value;
+ }
+ }
+
+ return null;
+ }
+
+ /**
+ *
+ * @param key
+ * @param value
+ */
+ public void appendValue(double key, IntArrayList value) {
+ // compute entry index position
+ int hash = hash(key);
+ int ix = indexFor(hash, _data.length);
+
+ // add new table entry (constant time)
+ DIListEntry enew = new DIListEntry(key, value);
+ enew.next = _data[ix]; // colliding entries / null
+ _data[ix] = enew;
+ _size++;
+
+ // resize if necessary
+ if( _size >= LOAD_FACTOR * _data.length )
+ resize();
+ }
+
+ /**
+ *
+ * @return
+ */
+ public ArrayList<DIListEntry> extractValues() {
+ ArrayList<DIListEntry> ret = new ArrayList<DIListEntry>();
+ for( DIListEntry e : _data ) {
+ if (e != null) {
+ while( e.next != null ) {
+ ret.add(e);
+ e = e.next;
+ }
+ ret.add(e);
+ }
+ }
+
+ return ret;
+ }
+
+ /**
+ *
+ */
+ private void resize() {
+ // check for integer overflow on resize
+ if( _data.length > Integer.MAX_VALUE / RESIZE_FACTOR )
+ return;
+
+ // resize data array and copy existing contents
+ DIListEntry[] olddata = _data;
+ _data = new DIListEntry[_data.length * RESIZE_FACTOR];
+ _size = 0;
+
+ // rehash all entries
+ for( DIListEntry e : olddata ) {
+ if( e != null ) {
+ while( e.next != null ) {
+ appendValue(e.key, e.value);
+ e = e.next;
+ }
+ appendValue(e.key, e.value);
+ }
+ }
+ }
+
+ /**
+ *
+ * @param key
+ * @return
+ */
+ private static int hash(double key) {
+ // basic double hash code (w/o object creation)
+ long bits = Double.doubleToRawLongBits(key);
+ int h = (int) (bits ^ (bits >>> 32));
+
+ // This function ensures that hashCodes that differ only by
+ // constant multiples at each bit position have a bounded
+ // number of collisions (approximately 8 at default load factor).
+ h ^= (h >>> 20) ^ (h >>> 12);
+ return h ^ (h >>> 7) ^ (h >>> 4);
+ }
+
+ /**
+ *
+ * @param h
+ * @param length
+ * @return
+ */
+ private static int indexFor(int h, int length) {
+ return h & (length - 1);
+ }
+
+ /**
+ *
+ */
+ public class DIListEntry {
+ public double key = Double.MAX_VALUE;
+ public IntArrayList value = null;
+ public DIListEntry next = null;
+
+ public DIListEntry(double ekey, IntArrayList evalue) {
+ key = ekey;
+ value = evalue;
+ next = null;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16e7b1c8/src/main/java/org/apache/sysml/runtime/compress/utils/IntArrayList.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/runtime/compress/utils/IntArrayList.java b/src/main/java/org/apache/sysml/runtime/compress/utils/IntArrayList.java
new file mode 100644
index 0000000..33455a2
--- /dev/null
+++ b/src/main/java/org/apache/sysml/runtime/compress/utils/IntArrayList.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysml.runtime.compress.utils;
+
+import java.util.Arrays;
+
+/**
+ * This class provides a memory-efficient replacement for ArrayList<Integer> for
+ * restricted use cases.
+ *
+ */
+public class IntArrayList
+{
+ private static final int INIT_CAPACITY = 4;
+ private static final int RESIZE_FACTOR = 2;
+
+ private int[] _data = null;
+ private int _size = -1;
+ private int _val0 = -1;
+
+ public IntArrayList() {
+ _data = null;
+ _size = 0;
+ }
+
+ /**
+ *
+ * @return
+ */
+ public int size() {
+ return _size;
+ }
+
+ /**
+ *
+ * @param value
+ */
+ public void appendValue(int value) {
+ // embedded value (no array allocation)
+ if( _size == 0 ) {
+ _val0 = value;
+ _size = 1;
+ return;
+ }
+
+ // allocate or resize array if necessary
+ if( _data == null ) {
+ _data = new int[INIT_CAPACITY];
+ _data[0] = _val0;
+ }
+ else if( _size + 1 >= _data.length ) {
+ resize();
+ }
+
+ // append value
+ _data[_size] = value;
+ _size++;
+ }
+
+ /**
+ *
+ * @return
+ */
+ public int[] extractValues() {
+ if( _size == 1 )
+ return new int[] { _val0 };
+ else
+ return Arrays.copyOfRange(_data, 0, _size);
+ }
+
+ /**
+ *
+ */
+ private void resize() {
+ // check for integer overflow on resize
+ if( _data.length > Integer.MAX_VALUE / RESIZE_FACTOR )
+ throw new RuntimeException(
+ "IntArrayList resize leads to integer overflow: size=" + _size);
+
+ // resize data array and copy existing contents
+ int[] newdata = new int[_data.length * RESIZE_FACTOR];
+ System.arraycopy(_data, 0, newdata, 0, _size);
+ _data = newdata;
+ }
+}