You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@systemds.apache.org by ar...@apache.org on 2022/01/31 18:59:52 UTC
[systemds] branch main updated: [SYSTEMDS-3284] Avoid binary search for equi-width binning apply
This is an automated email from the ASF dual-hosted git repository.
arnabp20 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new 4eb1db5 [SYSTEMDS-3284] Avoid binary search for equi-width binning apply
4eb1db5 is described below
commit 4eb1db5b59e954a133271ae30b4e17e3cfc303f0
Author: arnabp <ar...@tugraz.at>
AuthorDate: Mon Jan 31 19:56:16 2022 +0100
[SYSTEMDS-3284] Avoid binary search for equi-width binning apply
This patch replaces the binary search with basic derivation
for finding the right bin for a value. This change yields
10% speed-up for a single-threaded binning of 10 columns
with 100K bins per column.
---
.../runtime/transform/encode/ColumnEncoderBin.java | 28 ++++++++++++++++++----
1 file changed, 23 insertions(+), 5 deletions(-)
diff --git a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java
index d6e79c4..fdc1ad6 100644
--- a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java
+++ b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java
@@ -106,6 +106,7 @@ public class ColumnEncoderBin extends ColumnEncoder {
protected double getCode(CacheBlock in, int row){
// find the right bucket for a single row
+ double bin = 0;
if( _binMins.length == 0 || _binMaxs.length == 0 ) {
LOG.warn("ColumnEncoderBin: applyValue without bucket boundaries, assign 1");
return 1; //robustness in case of missing bins
@@ -114,15 +115,24 @@ public class ColumnEncoderBin extends ColumnEncoder {
double inVal = in.getDoubleNaN(row, _colID - 1);
if (Double.isNaN(inVal) || inVal < _binMins[0] || inVal > _binMaxs[_binMaxs.length-1])
return Double.NaN;
- int ix = Arrays.binarySearch(_binMaxs, inVal);
- return ((ix < 0) ? Math.abs(ix + 1) : ix) + 1;
+ if (_binMethod == BinMethod.EQUI_HEIGHT) {
+ int ix = Arrays.binarySearch(_binMaxs, inVal);
+ bin = ((ix < 0) ? Math.abs(ix + 1) : ix) + 1;
+ }
+ if (_binMethod == BinMethod.EQUI_WIDTH) {
+ //TODO: Skip computing bin boundaries for equi-width
+ double binWidth = (_binMaxs[_binMaxs.length - 1] - _binMins[0]) / _numBin;
+ double code = Math.ceil((inVal - _binMins[0]) / binWidth);
+ bin = (code == 0) ? code + 1 : code;
+ }
+ return bin;
}
@Override
protected double[] getCodeCol(CacheBlock in, int startInd, int blkSize) {
// find the right bucket for a block of rows
int endInd = getEndIndex(in.getNumRows(), startInd, blkSize);
- double codes[] = new double[endInd-startInd];
+ double[] codes = new double[endInd-startInd];
for (int i=startInd; i<endInd; i++) {
if (_binMins.length == 0 || _binMaxs.length == 0) {
LOG.warn("ColumnEncoderBin: applyValue without bucket boundaries, assign 1");
@@ -134,8 +144,16 @@ public class ColumnEncoderBin extends ColumnEncoder {
codes[i-startInd] = Double.NaN;
continue;
}
- int ix = Arrays.binarySearch(_binMaxs, inVal);
- codes[i-startInd] = ((ix < 0) ? Math.abs(ix + 1) : ix) + 1;
+ if (_binMethod == BinMethod.EQUI_HEIGHT) {
+ int ix = Arrays.binarySearch(_binMaxs, inVal);
+ codes[i-startInd] = ((ix < 0) ? Math.abs(ix + 1) : ix) + 1;
+ }
+ if (_binMethod == BinMethod.EQUI_WIDTH) {
+ //TODO: Skip computing bin boundaries for equi-width
+ double binWidth = (_binMaxs[_binMaxs.length - 1] - _binMins[0]) / _numBin;
+ double bin = Math.ceil((inVal - _binMins[0]) / binWidth);
+ codes[i - startInd] = bin == 0 ? bin + 1 : bin;
+ }
}
return codes;
}