You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by sd...@apache.org on 2012/09/07 19:53:05 UTC
svn commit: r1382101 -
/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumericHistogram.java
Author: sdong
Date: Fri Sep 7 17:53:04 2012
New Revision: 1382101
URL: http://svn.apache.org/viewvc?rev=1382101&view=rev
Log:
HIVE-3388 Improve Performance of UDF PERCENTILE_APPROX() (Rongrong Zhong via Siying Dong)
Modified:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumericHistogram.java
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumericHistogram.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumericHistogram.java?rev=1382101&r1=1382100&r2=1382101&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumericHistogram.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumericHistogram.java Fri Sep 7 17:53:04 2012
@@ -20,6 +20,7 @@ package org.apache.hadoop.hive.ql.udf.ge
import java.util.ArrayList;
import java.util.List;
import java.util.Arrays;
+import java.util.Collections;
import java.util.Random;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
@@ -55,7 +56,7 @@ public class NumericHistogram {
// Class variables
private int nbins;
private int nusedbins;
- private Coord[] bins;
+ private ArrayList<Coord> bins;
private Random prng;
/**
@@ -101,7 +102,7 @@ public class NumericHistogram {
* Returns a particular histogram bin.
*/
public Coord getBin(int b) {
- return bins[b];
+ return bins.get(b);
}
/**
@@ -111,10 +112,7 @@ public class NumericHistogram {
*/
public void allocate(int num_bins) {
nbins = num_bins;
- bins = new Coord[nbins+1];
- for(int i = 0; i < nbins+1; i++) {
- bins[i] = new Coord();
- }
+ bins = new ArrayList<Coord>();
nusedbins = 0;
}
@@ -135,31 +133,32 @@ public class NumericHistogram {
// by deserializing the ArrayList of (x,y) pairs into an array of Coord objects
nbins = (int) other.get(0).get();
nusedbins = (other.size()-1)/2;
- bins = new Coord[nbins+1]; // +1 to hold a temporary bin for insert()
- for(int i = 1; i < other.size(); i+=2) {
- bins[(i-1)/2] = new Coord();
- bins[(i-1)/2].x = other.get(i).get();
- bins[(i-1)/2].y = other.get(i+1).get();
+ bins = new ArrayList<Coord>(nusedbins);
+ for (int i = 1; i < other.size(); i+=2) {
+ Coord bin = new Coord();
+ bin.x = other.get(i).get();
+ bin.y = other.get(i+1).get();
+ bins.add(bin);
}
} else {
// The aggregation buffer already contains a partial histogram. Therefore, we need
// to merge histograms using Algorithm #2 from the Ben-Haim and Tom-Tov paper.
- Coord[] tmp_bins = new Coord[nusedbins + (other.size()-1)/2];
- for(int j = 0; j < tmp_bins.length; j++) {
- tmp_bins[j] = new Coord();
- }
+ ArrayList<Coord> tmp_bins = new ArrayList<Coord>(nusedbins + (other.size()-1)/2);
// Copy all the histogram bins from us and 'other' into an overstuffed histogram
- int i;
- for(i = 0; i < nusedbins; i++) {
- tmp_bins[i].x = bins[i].x;
- tmp_bins[i].y = bins[i].y;
+ for (int i = 0; i < nusedbins; i++) {
+ Coord bin = new Coord();
+ bin.x = bins.get(i).x;
+ bin.y = bins.get(i).y;
+ tmp_bins.add(bin);
+ }
+ for (int j = 1; j < other.size(); j += 2) {
+ Coord bin = new Coord();
+ bin.x = other.get(j).get();
+ bin.y = other.get(j+1).get();
+ tmp_bins.add(bin);
}
- for(int j = 1; j < other.size(); j+=2, i++) {
- tmp_bins[i].x = other.get(j).get();
- tmp_bins[i].y = other.get(j+1).get();
- }
- Arrays.sort(tmp_bins);
+ Collections.sort(tmp_bins);
// Now trim the overstuffed histogram down to the correct number of bins
bins = tmp_bins;
@@ -185,10 +184,10 @@ public class NumericHistogram {
int bin = 0;
for(int l=0, r=nusedbins; l < r; ) {
bin = (l+r)/2;
- if(bins[bin].x > v) {
+ if (bins.get(bin).x > v) {
r = bin;
} else {
- if(bins[bin].x < v) {
+ if (bins.get(bin).x < v) {
l = ++bin;
} else {
break; // break loop on equal comparator
@@ -202,21 +201,20 @@ public class NumericHistogram {
// assumed to be equal to the closest bin -- if fabs(v-bins[bin].x) < THRESHOLD, then
// just increment 'bin'. This is not done now because we don't want to make any
// assumptions about the range of numeric data being analyzed.
- if(bin < nusedbins && bins[bin].x == v) {
- bins[bin].y++;
+ if (bin < nusedbins && bins.get(bin).x == v) {
+ bins.get(bin).y++;
} else {
- for(int i = nusedbins; i > bin; i--) {
- bins[i].x = bins[i-1].x;
- bins[i].y = bins[i-1].y;
- }
- bins[bin].x = v; // new bins bin for value 'v'
- bins[bin].y = 1; // of height 1 unit
+ Coord newBin = new Coord();
+ newBin.x = v;
+ newBin.y = 1;
+ bins.add(bin, newBin);
// Trim the bins down to the correct number of bins.
- if(++nusedbins > nbins) {
+ if (++nusedbins > nbins) {
trim();
}
}
+
}
/**
@@ -227,10 +225,10 @@ public class NumericHistogram {
private void trim() {
while(nusedbins > nbins) {
// Find the closest pair of bins in terms of x coordinates. Break ties randomly.
- double smallestdiff = bins[1].x - bins[0].x;
+ double smallestdiff = bins.get(1).x - bins.get(0).x;
int smallestdiffloc = 0, smallestdiffcount = 1;
for(int i = 1; i < nusedbins-1; i++) {
- double diff = bins[i+1].x - bins[i].x;
+ double diff = bins.get(i+1).x - bins.get(i).x;
if(diff < smallestdiff) {
smallestdiff = diff;
smallestdiffloc = i;
@@ -244,17 +242,19 @@ public class NumericHistogram {
// Merge the two closest bins into their average x location, weighted by their heights.
// The height of the new bin is the sum of the heights of the old bins.
- double d = bins[smallestdiffloc].y + bins[smallestdiffloc+1].y;
- bins[smallestdiffloc].x *= bins[smallestdiffloc].y / d;
- bins[smallestdiffloc].x += bins[smallestdiffloc+1].x / d *
- bins[smallestdiffloc+1].y;
- bins[smallestdiffloc].y = d;
-
+ // double d = bins[smallestdiffloc].y + bins[smallestdiffloc+1].y;
+ // bins[smallestdiffloc].x *= bins[smallestdiffloc].y / d;
+ // bins[smallestdiffloc].x += bins[smallestdiffloc+1].x / d *
+ // bins[smallestdiffloc+1].y;
+ // bins[smallestdiffloc].y = d;
+
+ double d = bins.get(smallestdiffloc).y + bins.get(smallestdiffloc+1).y;
+ Coord smallestdiffbin = bins.get(smallestdiffloc);
+ smallestdiffbin.x *= smallestdiffbin.y / d;
+ smallestdiffbin.x += bins.get(smallestdiffloc+1).x / d * bins.get(smallestdiffloc+1).y;
+ smallestdiffbin.y = d;
// Shift the remaining bins left one position
- for(int i = smallestdiffloc+1; i < nusedbins-1; i++) {
- bins[i].x = bins[i+1].x;
- bins[i].y = bins[i+1].y;
- }
+ bins.remove(smallestdiffloc+1);
nusedbins--;
}
}
@@ -271,16 +271,18 @@ public class NumericHistogram {
double sum = 0, csum = 0;
int b;
for(b = 0; b < nusedbins; b++) {
- sum += bins[b].y;
+ sum += bins.get(b).y;
}
for(b = 0; b < nusedbins; b++) {
- csum += bins[b].y;
+ csum += bins.get(b).y;
if(csum / sum >= q) {
if(b == 0) {
- return bins[b].x;
+ return bins.get(b).x;
}
- csum -= bins[b].y;
- double r = bins[b-1].x + (q*sum - csum) * (bins[b].x-bins[b-1].x)/(bins[b].y);
+
+ csum -= bins.get(b).y;
+ double r = bins.get(b-1).x +
+ (q*sum - csum) * (bins.get(b).x - bins.get(b-1).x)/(bins.get(b).y);
return r;
}
}
@@ -304,8 +306,8 @@ public class NumericHistogram {
result.add(new DoubleWritable(nbins));
if(bins != null) {
for(int i = 0; i < nusedbins; i++) {
- result.add(new DoubleWritable(bins[i].x));
- result.add(new DoubleWritable(bins[i].y));
+ result.add(new DoubleWritable(bins.get(i).x));
+ result.add(new DoubleWritable(bins.get(i).y));
}
}