You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by da...@apache.org on 2014/09/04 01:59:47 UTC

svn commit: r1622379 - in /pig/trunk: ./ src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/ src/org/apache/pig/impl/builtin/ test/org/apache/pig/test/

Author: daijy
Date: Wed Sep  3 23:59:46 2014
New Revision: 1622379

URL: http://svn.apache.org/r1622379
Log:
PIG-4149: Rounding issue in FindQuantiles

Modified:
    pig/trunk/CHANGES.txt
    pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/DiscreteProbabilitySampleGenerator.java
    pig/trunk/src/org/apache/pig/impl/builtin/FindQuantiles.java
    pig/trunk/test/org/apache/pig/test/TestFindQuantiles.java

Modified: pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1622379&r1=1622378&r2=1622379&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Wed Sep  3 23:59:46 2014
@@ -70,6 +70,8 @@ OPTIMIZATIONS
  
 BUG FIXES
 
+PIG-4149: Rounding issue in FindQuantiles (daijy)
+
 PIG-4145: Port local mode tests to Tez - part1 (daijy)
 
 PIG-4076: Fix pom file (daijy)

Modified: pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/DiscreteProbabilitySampleGenerator.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/DiscreteProbabilitySampleGenerator.java?rev=1622379&r1=1622378&r2=1622379&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/DiscreteProbabilitySampleGenerator.java (original)
+++ pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/DiscreteProbabilitySampleGenerator.java Wed Sep  3 23:59:46 2014
@@ -25,7 +25,7 @@ import org.apache.commons.logging.LogFac
 public class DiscreteProbabilitySampleGenerator {
     Random rGen;
     float[] probVec;
-    float epsilon = 0.00001f;
+    float epsilon = 0.0001f;
         
     private static final Log LOG = LogFactory.getLog(DiscreteProbabilitySampleGenerator.class);
     

Modified: pig/trunk/src/org/apache/pig/impl/builtin/FindQuantiles.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/impl/builtin/FindQuantiles.java?rev=1622379&r1=1622378&r2=1622379&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/impl/builtin/FindQuantiles.java (original)
+++ pig/trunk/src/org/apache/pig/impl/builtin/FindQuantiles.java Wed Sep  3 23:59:46 2014
@@ -172,7 +172,7 @@ public class FindQuantiles extends EvalF
                 samples = (DataBag)in.get(1);
             }
             long numSamples = samples.size();
-            long toSkip = numSamples / numQuantiles;
+            double toSkip = (double)numSamples / numQuantiles;
             if(toSkip == 0) {
                 // numSamples is < numQuantiles;
                 // set numQuantiles to numSamples
@@ -180,9 +180,10 @@ public class FindQuantiles extends EvalF
                 toSkip = 1;
             }
             
-            long ind=0, j=-1, nextQuantile = toSkip-1;
+            long ind=0, j=-1;
+            double nextQuantile = toSkip-1;
             for (Tuple it : samples) {
-                if (ind==nextQuantile){
+                if (ind==(long)nextQuantile){
                     ++j;
                     quantilesList.add(it);
                     nextQuantile+=toSkip;

Modified: pig/trunk/test/org/apache/pig/test/TestFindQuantiles.java
URL: http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/TestFindQuantiles.java?rev=1622379&r1=1622378&r2=1622379&view=diff
==============================================================================
--- pig/trunk/test/org/apache/pig/test/TestFindQuantiles.java (original)
+++ pig/trunk/test/org/apache/pig/test/TestFindQuantiles.java Wed Sep  3 23:59:46 2014
@@ -20,13 +20,18 @@ package org.apache.pig.test;
 
 import static org.junit.Assert.assertTrue;
 
+import java.util.ArrayList;
+import java.util.Collections;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Map;
+import java.util.Random;
 
 import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.partitioners.DiscreteProbabilitySampleGenerator;
 import org.apache.pig.data.BagFactory;
 import org.apache.pig.data.DataBag;
 import org.apache.pig.data.InternalMap;
+import org.apache.pig.data.NonSpillableDataBag;
 import org.apache.pig.data.Tuple;
 import org.apache.pig.data.TupleFactory;
 import org.apache.pig.impl.builtin.FindQuantiles;
@@ -35,7 +40,7 @@ import org.junit.Test;
 public class TestFindQuantiles {
     
     private static TupleFactory tFact = TupleFactory.getInstance();
-    private static final float epsilon = 0.00001f;
+    private static final float epsilon = 0.0001f;
     
     @Test
     public void testFindQuantiles() throws Exception {
@@ -43,7 +48,7 @@ public class TestFindQuantiles {
        final int numReducers = 1009;
        float sum = getProbVecSum(numSamples, numReducers);
        System.out.println("sum: " + sum);
-       assertTrue(sum > (1+epsilon));
+       assertTrue(sum > (1-epsilon) && sum < (1+epsilon));
     }
     
     @Test
@@ -52,9 +57,34 @@ public class TestFindQuantiles {
        final int numReducers = 3000;
        float sum = getProbVecSum(numSamples, numReducers);
        System.out.println("sum: " + sum);
-       assertTrue(sum < (1-epsilon));
+       assertTrue(sum > (1-epsilon) && sum < (1+epsilon));
     }
-    
+
+    @Test
+    public void testFindQuantilesRemainder() throws Exception {
+       final int numSamples = 1900;
+       final int numReducers = 300;
+       DataBag samples = generateRandomSortedSamples(numSamples, 365);
+       Map<String, Object> findQuantilesResult = getFindQuantilesResult(samples, numReducers);
+       DataBag quantilesBag = (DataBag)findQuantilesResult.get(FindQuantiles.QUANTILES_LIST);
+       Iterator<Tuple> iter = quantilesBag.iterator();
+       Tuple lastQuantile = null;
+       while (iter.hasNext()) {
+           lastQuantile = iter.next();
+       }
+       int lastQuantileNum = (Integer)lastQuantile.get(0);
+       int count = 0;
+       iter = samples.iterator();
+       while (iter.hasNext()) {
+           Tuple t = iter.next();
+           int num = (Integer)t.get(0);
+           if (num >= lastQuantileNum) {
+               count++;
+           }
+       }
+       assertTrue((double)count/numSamples <= 1.0/365 + 0.001);
+    }
+
     private float[] getProbVec(Tuple values) throws Exception {
         float[] probVec = new float[values.size()];        
         for(int i = 0; i < values.size(); i++) {
@@ -62,22 +92,46 @@ public class TestFindQuantiles {
         }
         return probVec;
     }
-    
-    private float getProbVecSum(int numSamples, int numReduceres) throws Exception {
-        Tuple in = tFact.newTuple(2);
+
+    private DataBag generateRandomSortedSamples(int numSamples, int max) throws Exception {
+        Random rand = new Random(1000);
+        List<Tuple> samples = new ArrayList<Tuple>(); 
+        for (int i=0; i<numSamples; i++) {
+            Tuple t = tFact.newTuple(1);
+            t.set(0, rand.nextInt(max));
+            samples.add(t);
+        }
+        Collections.sort(samples);
+        return new NonSpillableDataBag(samples);
+    }
+
+    private DataBag generateUniqueSamples(int numSamples) throws Exception {
         DataBag samples = BagFactory.getInstance().newDefaultBag(); 
         for (int i=0; i<numSamples; i++) {
             Tuple t = tFact.newTuple(1);
             t.set(0, new Integer(23));
             samples.add(t);
         }
+        return samples;
+    }
+
+    private Map<String, Object> getFindQuantilesResult(DataBag samples,
+            int numReduceres) throws Exception {
+        Tuple in = tFact.newTuple(2);
+
         in.set(0, new Integer(numReduceres));
         in.set(1, samples);
         
         FindQuantiles fq = new FindQuantiles();
         
         Map<String, Object> res = fq.exec(in);
-        
+        return res;
+    }
+
+    private float getProbVecSum(int numSamples, int numReduceres) throws Exception {
+        DataBag samples = generateUniqueSamples(numSamples);
+        Map<String, Object> res = getFindQuantilesResult(samples, numReduceres);
+
         InternalMap weightedPartsData = (InternalMap) res.get(FindQuantiles.WEIGHTED_PARTS);
         Iterator<Object> it = weightedPartsData.values().iterator();
         float[] probVec = getProbVec((Tuple)it.next());