You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ar...@apache.org on 2022/01/31 18:59:52 UTC

[systemds] branch main updated: [SYSTEMDS-3284] Avoid binary search for equi-width binning apply

This is an automated email from the ASF dual-hosted git repository.

arnabp20 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/main by this push:
     new 4eb1db5  [SYSTEMDS-3284] Avoid binary search for equi-width binning apply
4eb1db5 is described below

commit 4eb1db5b59e954a133271ae30b4e17e3cfc303f0
Author: arnabp <ar...@tugraz.at>
AuthorDate: Mon Jan 31 19:56:16 2022 +0100

    [SYSTEMDS-3284] Avoid binary search for equi-width binning apply
    
    This patch replaces the binary search with basic derivation
    for finding the right bin for a value. This change yields
    10% speed-up for a single-threaded binning of 10 columns
    with 100K bins per column.
---
 .../runtime/transform/encode/ColumnEncoderBin.java | 28 ++++++++++++++++++----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java
index d6e79c4..fdc1ad6 100644
--- a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java
+++ b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java
@@ -106,6 +106,7 @@ public class ColumnEncoderBin extends ColumnEncoder {
 
 	protected double getCode(CacheBlock in, int row){
 		// find the right bucket for a single row
+		double bin = 0;
 		if( _binMins.length == 0 || _binMaxs.length == 0 ) {
 			LOG.warn("ColumnEncoderBin: applyValue without bucket boundaries, assign 1");
 			return 1; //robustness in case of missing bins
@@ -114,15 +115,24 @@ public class ColumnEncoderBin extends ColumnEncoder {
 		double inVal = in.getDoubleNaN(row, _colID - 1);
 		if (Double.isNaN(inVal) || inVal < _binMins[0] || inVal > _binMaxs[_binMaxs.length-1])
 			return Double.NaN;
-		int ix = Arrays.binarySearch(_binMaxs, inVal);
-		return ((ix < 0) ? Math.abs(ix + 1) : ix) + 1;
+		if (_binMethod == BinMethod.EQUI_HEIGHT) {
+			int ix = Arrays.binarySearch(_binMaxs, inVal);
+			bin = ((ix < 0) ? Math.abs(ix + 1) : ix) + 1;
+		}
+		if (_binMethod == BinMethod.EQUI_WIDTH) {
+			//TODO: Skip computing bin boundaries for equi-width
+			double binWidth = (_binMaxs[_binMaxs.length - 1] - _binMins[0]) / _numBin;
+			double code = Math.ceil((inVal - _binMins[0]) / binWidth);
+			bin = (code == 0) ? code + 1 : code;
+		}
+		return bin;
 	}
 	
 	@Override
 	protected double[] getCodeCol(CacheBlock in, int startInd, int blkSize) {
 		// find the right bucket for a block of rows
 		int endInd = getEndIndex(in.getNumRows(), startInd, blkSize);
-		double codes[] = new double[endInd-startInd];
+		double[] codes = new double[endInd-startInd];
 		for (int i=startInd; i<endInd; i++) {
 			if (_binMins.length == 0 || _binMaxs.length == 0) {
 				LOG.warn("ColumnEncoderBin: applyValue without bucket boundaries, assign 1");
@@ -134,8 +144,16 @@ public class ColumnEncoderBin extends ColumnEncoder {
 				codes[i-startInd] = Double.NaN;
 				continue;
 			}
-			int ix = Arrays.binarySearch(_binMaxs, inVal);
-			codes[i-startInd] = ((ix < 0) ? Math.abs(ix + 1) : ix) + 1;
+			if (_binMethod == BinMethod.EQUI_HEIGHT) {
+				int ix = Arrays.binarySearch(_binMaxs, inVal);
+				codes[i-startInd] = ((ix < 0) ? Math.abs(ix + 1) : ix) + 1;
+			}
+			if (_binMethod == BinMethod.EQUI_WIDTH) {
+				//TODO: Skip computing bin boundaries for equi-width
+				double binWidth = (_binMaxs[_binMaxs.length - 1] - _binMins[0]) / _numBin;
+				double bin = Math.ceil((inVal - _binMins[0]) / binWidth);
+				codes[i - startInd] = bin == 0 ? bin + 1 : bin;
+			}
 		}
 		return codes;
 	}