You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by sd...@apache.org on 2012/09/07 19:53:05 UTC

svn commit: r1382101 - /hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumericHistogram.java

Author: sdong
Date: Fri Sep  7 17:53:04 2012
New Revision: 1382101

URL: http://svn.apache.org/viewvc?rev=1382101&view=rev
Log:
HIVE-3388 Improve Performance of UDF PERCENTILE_APPROX() (Rongrong Zhong via Siying Dong)


Modified:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumericHistogram.java

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumericHistogram.java
URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumericHistogram.java?rev=1382101&r1=1382100&r2=1382101&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumericHistogram.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/NumericHistogram.java Fri Sep  7 17:53:04 2012
@@ -20,6 +20,7 @@ package org.apache.hadoop.hive.ql.udf.ge
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.Random;
 import org.apache.hadoop.hive.serde2.io.DoubleWritable;
 
@@ -55,7 +56,7 @@ public class NumericHistogram {
   // Class variables
   private int nbins;
   private int nusedbins;
-  private Coord[] bins;
+  private ArrayList<Coord> bins;
   private Random prng;
 
   /**
@@ -101,7 +102,7 @@ public class NumericHistogram {
    * Returns a particular histogram bin.
    */
   public Coord getBin(int b) {
-    return bins[b];
+    return bins.get(b);
   }
 
   /**
@@ -111,10 +112,7 @@ public class NumericHistogram {
    */
   public void allocate(int num_bins) {
     nbins = num_bins;
-    bins = new Coord[nbins+1];
-    for(int i = 0; i < nbins+1; i++) {
-      bins[i] = new Coord();
-    }
+    bins = new ArrayList<Coord>();
     nusedbins = 0;
   }
 
@@ -135,31 +133,32 @@ public class NumericHistogram {
       // by deserializing the ArrayList of (x,y) pairs into an array of Coord objects
       nbins = (int) other.get(0).get();
       nusedbins = (other.size()-1)/2;
-      bins = new Coord[nbins+1]; // +1 to hold a temporary bin for insert()
-      for(int i = 1; i < other.size(); i+=2) {
-        bins[(i-1)/2] = new Coord();
-        bins[(i-1)/2].x = other.get(i).get();
-        bins[(i-1)/2].y = other.get(i+1).get();
+      bins = new ArrayList<Coord>(nusedbins);
+      for (int i = 1; i < other.size(); i+=2) {
+        Coord bin = new Coord();
+        bin.x = other.get(i).get();
+        bin.y = other.get(i+1).get();
+        bins.add(bin);
       }
     } else {
       // The aggregation buffer already contains a partial histogram. Therefore, we need
       // to merge histograms using Algorithm #2 from the Ben-Haim and Tom-Tov paper.
-      Coord[] tmp_bins = new Coord[nusedbins + (other.size()-1)/2];
-      for(int j = 0; j < tmp_bins.length; j++) {
-        tmp_bins[j] = new Coord();
-      }
 
+      ArrayList<Coord> tmp_bins = new ArrayList<Coord>(nusedbins + (other.size()-1)/2);
       // Copy all the histogram bins from us and 'other' into an overstuffed histogram
-      int i;
-      for(i = 0; i < nusedbins; i++) {
-        tmp_bins[i].x = bins[i].x;
-        tmp_bins[i].y = bins[i].y;
+      for (int i = 0; i < nusedbins; i++) {
+        Coord bin = new Coord();
+        bin.x = bins.get(i).x;
+        bin.y = bins.get(i).y;
+        tmp_bins.add(bin);
+      }
+      for (int j = 1; j < other.size(); j += 2) {
+        Coord bin = new Coord();
+        bin.x = other.get(j).get();
+        bin.y = other.get(j+1).get();
+        tmp_bins.add(bin);
       }
-      for(int j = 1; j < other.size(); j+=2, i++) {
-        tmp_bins[i].x = other.get(j).get();
-        tmp_bins[i].y = other.get(j+1).get();
-      }
-      Arrays.sort(tmp_bins);
+      Collections.sort(tmp_bins);
 
       // Now trim the overstuffed histogram down to the correct number of bins
       bins = tmp_bins;
@@ -185,10 +184,10 @@ public class NumericHistogram {
     int bin = 0;
     for(int l=0, r=nusedbins; l < r; ) {
       bin = (l+r)/2;
-      if(bins[bin].x > v) {
+      if (bins.get(bin).x > v) {
         r = bin;
       } else {
-        if(bins[bin].x < v) {
+        if (bins.get(bin).x < v) {
           l = ++bin;
         } else {
           break; // break loop on equal comparator
@@ -202,21 +201,20 @@ public class NumericHistogram {
     // assumed to be equal to the closest bin -- if fabs(v-bins[bin].x) < THRESHOLD, then
     // just increment 'bin'. This is not done now because we don't want to make any
     // assumptions about the range of numeric data being analyzed.
-    if(bin < nusedbins && bins[bin].x == v) {
-      bins[bin].y++;
+    if (bin < nusedbins && bins.get(bin).x == v) {
+      bins.get(bin).y++;
     } else {
-      for(int i = nusedbins; i > bin; i--) {
-        bins[i].x = bins[i-1].x;
-        bins[i].y = bins[i-1].y;
-      }
-      bins[bin].x = v; // new bins bin for value 'v'
-      bins[bin].y = 1; // of height 1 unit
+      Coord newBin = new Coord();
+      newBin.x = v;
+      newBin.y = 1;
+      bins.add(bin, newBin);
 
       // Trim the bins down to the correct number of bins.
-      if(++nusedbins > nbins) {
+      if (++nusedbins > nbins) {
         trim();
       }
     }
+
   }
 
   /**
@@ -227,10 +225,10 @@ public class NumericHistogram {
   private void trim() {
     while(nusedbins > nbins) {
       // Find the closest pair of bins in terms of x coordinates. Break ties randomly.
-      double smallestdiff = bins[1].x - bins[0].x;
+      double smallestdiff = bins.get(1).x - bins.get(0).x;
       int smallestdiffloc = 0, smallestdiffcount = 1;
       for(int i = 1; i < nusedbins-1; i++) {
-        double diff = bins[i+1].x - bins[i].x;
+        double diff = bins.get(i+1).x - bins.get(i).x;
         if(diff < smallestdiff)  {
           smallestdiff = diff;
           smallestdiffloc = i;
@@ -244,17 +242,19 @@ public class NumericHistogram {
 
       // Merge the two closest bins into their average x location, weighted by their heights.
       // The height of the new bin is the sum of the heights of the old bins.
-      double d = bins[smallestdiffloc].y + bins[smallestdiffloc+1].y;
-      bins[smallestdiffloc].x *= bins[smallestdiffloc].y / d;
-      bins[smallestdiffloc].x += bins[smallestdiffloc+1].x / d *
-        bins[smallestdiffloc+1].y;
-      bins[smallestdiffloc].y = d;
-
+      // double d = bins[smallestdiffloc].y + bins[smallestdiffloc+1].y;
+      // bins[smallestdiffloc].x *= bins[smallestdiffloc].y / d;
+      // bins[smallestdiffloc].x += bins[smallestdiffloc+1].x / d *
+      //   bins[smallestdiffloc+1].y;
+      // bins[smallestdiffloc].y = d;
+
+      double d = bins.get(smallestdiffloc).y + bins.get(smallestdiffloc+1).y;
+      Coord smallestdiffbin = bins.get(smallestdiffloc);
+      smallestdiffbin.x *= smallestdiffbin.y / d;
+      smallestdiffbin.x += bins.get(smallestdiffloc+1).x / d * bins.get(smallestdiffloc+1).y;
+      smallestdiffbin.y = d;
       // Shift the remaining bins left one position
-      for(int i = smallestdiffloc+1; i < nusedbins-1; i++) {
-        bins[i].x = bins[i+1].x;
-        bins[i].y = bins[i+1].y;
-      }
+      bins.remove(smallestdiffloc+1);
       nusedbins--;
     }
   }
@@ -271,16 +271,18 @@ public class NumericHistogram {
     double sum = 0, csum = 0;
     int b;
     for(b = 0; b < nusedbins; b++)  {
-      sum += bins[b].y;
+      sum += bins.get(b).y;
     }
     for(b = 0; b < nusedbins; b++) {
-      csum += bins[b].y;
+      csum += bins.get(b).y;
       if(csum / sum >= q) {
         if(b == 0) {
-          return bins[b].x;
+          return bins.get(b).x;
         }
-        csum -= bins[b].y;
-        double r = bins[b-1].x + (q*sum - csum) * (bins[b].x-bins[b-1].x)/(bins[b].y);
+
+        csum -= bins.get(b).y;
+        double r = bins.get(b-1).x +
+          (q*sum - csum) * (bins.get(b).x - bins.get(b-1).x)/(bins.get(b).y);
         return r;
       }
     }
@@ -304,8 +306,8 @@ public class NumericHistogram {
     result.add(new DoubleWritable(nbins));
     if(bins != null) {
       for(int i = 0; i < nusedbins; i++) {
-        result.add(new DoubleWritable(bins[i].x));
-        result.add(new DoubleWritable(bins[i].y));
+        result.add(new DoubleWritable(bins.get(i).x));
+        result.add(new DoubleWritable(bins.get(i).y));
       }
     }