You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by da...@apache.org on 2014/09/04 01:59:47 UTC
svn commit: r1622379 - in /pig/trunk: ./
src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/
src/org/apache/pig/impl/builtin/ test/org/apache/pig/test/
Author: daijy
Date: Wed Sep 3 23:59:46 2014
New Revision: 1622379
URL: http://svn.apache.org/r1622379
Log:
PIG-4149: Rounding issue in FindQuantiles
Modified:
pig/trunk/CHANGES.txt
pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/DiscreteProbabilitySampleGenerator.java
pig/trunk/src/org/apache/pig/impl/builtin/FindQuantiles.java
pig/trunk/test/org/apache/pig/test/TestFindQuantiles.java
Modified: pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1622379&r1=1622378&r2=1622379&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Wed Sep 3 23:59:46 2014
@@ -70,6 +70,8 @@ OPTIMIZATIONS
BUG FIXES
+PIG-4149: Rounding issue in FindQuantiles (daijy)
+
PIG-4145: Port local mode tests to Tez - part1 (daijy)
PIG-4076: Fix pom file (daijy)
Modified: pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/DiscreteProbabilitySampleGenerator.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/DiscreteProbabilitySampleGenerator.java?rev=1622379&r1=1622378&r2=1622379&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/DiscreteProbabilitySampleGenerator.java (original)
+++ pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/DiscreteProbabilitySampleGenerator.java Wed Sep 3 23:59:46 2014
@@ -25,7 +25,7 @@ import org.apache.commons.logging.LogFac
public class DiscreteProbabilitySampleGenerator {
Random rGen;
float[] probVec;
- float epsilon = 0.00001f;
+ float epsilon = 0.0001f;
private static final Log LOG = LogFactory.getLog(DiscreteProbabilitySampleGenerator.class);
Modified: pig/trunk/src/org/apache/pig/impl/builtin/FindQuantiles.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/impl/builtin/FindQuantiles.java?rev=1622379&r1=1622378&r2=1622379&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/impl/builtin/FindQuantiles.java (original)
+++ pig/trunk/src/org/apache/pig/impl/builtin/FindQuantiles.java Wed Sep 3 23:59:46 2014
@@ -172,7 +172,7 @@ public class FindQuantiles extends EvalF
samples = (DataBag)in.get(1);
}
long numSamples = samples.size();
- long toSkip = numSamples / numQuantiles;
+ double toSkip = (double)numSamples / numQuantiles;
if(toSkip == 0) {
// numSamples is < numQuantiles;
// set numQuantiles to numSamples
@@ -180,9 +180,10 @@ public class FindQuantiles extends EvalF
toSkip = 1;
}
- long ind=0, j=-1, nextQuantile = toSkip-1;
+ long ind=0, j=-1;
+ double nextQuantile = toSkip-1;
for (Tuple it : samples) {
- if (ind==nextQuantile){
+ if (ind==(long)nextQuantile){
++j;
quantilesList.add(it);
nextQuantile+=toSkip;
Modified: pig/trunk/test/org/apache/pig/test/TestFindQuantiles.java
URL: http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/TestFindQuantiles.java?rev=1622379&r1=1622378&r2=1622379&view=diff
==============================================================================
--- pig/trunk/test/org/apache/pig/test/TestFindQuantiles.java (original)
+++ pig/trunk/test/org/apache/pig/test/TestFindQuantiles.java Wed Sep 3 23:59:46 2014
@@ -20,13 +20,18 @@ package org.apache.pig.test;
import static org.junit.Assert.assertTrue;
+import java.util.ArrayList;
+import java.util.Collections;
import java.util.Iterator;
+import java.util.List;
import java.util.Map;
+import java.util.Random;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.partitioners.DiscreteProbabilitySampleGenerator;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.InternalMap;
+import org.apache.pig.data.NonSpillableDataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.builtin.FindQuantiles;
@@ -35,7 +40,7 @@ import org.junit.Test;
public class TestFindQuantiles {
private static TupleFactory tFact = TupleFactory.getInstance();
- private static final float epsilon = 0.00001f;
+ private static final float epsilon = 0.0001f;
@Test
public void testFindQuantiles() throws Exception {
@@ -43,7 +48,7 @@ public class TestFindQuantiles {
final int numReducers = 1009;
float sum = getProbVecSum(numSamples, numReducers);
System.out.println("sum: " + sum);
- assertTrue(sum > (1+epsilon));
+ assertTrue(sum > (1-epsilon) && sum < (1+epsilon));
}
@Test
@@ -52,9 +57,34 @@ public class TestFindQuantiles {
final int numReducers = 3000;
float sum = getProbVecSum(numSamples, numReducers);
System.out.println("sum: " + sum);
- assertTrue(sum < (1-epsilon));
+ assertTrue(sum > (1-epsilon) && sum < (1+epsilon));
}
-
+
+ @Test
+ public void testFindQuantilesRemainder() throws Exception {
+ final int numSamples = 1900;
+ final int numReducers = 300;
+ DataBag samples = generateRandomSortedSamples(numSamples, 365);
+ Map<String, Object> findQuantilesResult = getFindQuantilesResult(samples, numReducers);
+ DataBag quantilesBag = (DataBag)findQuantilesResult.get(FindQuantiles.QUANTILES_LIST);
+ Iterator<Tuple> iter = quantilesBag.iterator();
+ Tuple lastQuantile = null;
+ while (iter.hasNext()) {
+ lastQuantile = iter.next();
+ }
+ int lastQuantileNum = (Integer)lastQuantile.get(0);
+ int count = 0;
+ iter = samples.iterator();
+ while (iter.hasNext()) {
+ Tuple t = iter.next();
+ int num = (Integer)t.get(0);
+ if (num >= lastQuantileNum) {
+ count++;
+ }
+ }
+ assertTrue((double)count/numSamples <= 1.0/365 + 0.001);
+ }
+
private float[] getProbVec(Tuple values) throws Exception {
float[] probVec = new float[values.size()];
for(int i = 0; i < values.size(); i++) {
@@ -62,22 +92,46 @@ public class TestFindQuantiles {
}
return probVec;
}
-
- private float getProbVecSum(int numSamples, int numReduceres) throws Exception {
- Tuple in = tFact.newTuple(2);
+
+ private DataBag generateRandomSortedSamples(int numSamples, int max) throws Exception {
+ Random rand = new Random(1000);
+ List<Tuple> samples = new ArrayList<Tuple>();
+ for (int i=0; i<numSamples; i++) {
+ Tuple t = tFact.newTuple(1);
+ t.set(0, rand.nextInt(max));
+ samples.add(t);
+ }
+ Collections.sort(samples);
+ return new NonSpillableDataBag(samples);
+ }
+
+ private DataBag generateUniqueSamples(int numSamples) throws Exception {
DataBag samples = BagFactory.getInstance().newDefaultBag();
for (int i=0; i<numSamples; i++) {
Tuple t = tFact.newTuple(1);
t.set(0, new Integer(23));
samples.add(t);
}
+ return samples;
+ }
+
+ private Map<String, Object> getFindQuantilesResult(DataBag samples,
+ int numReduceres) throws Exception {
+ Tuple in = tFact.newTuple(2);
+
in.set(0, new Integer(numReduceres));
in.set(1, samples);
FindQuantiles fq = new FindQuantiles();
Map<String, Object> res = fq.exec(in);
-
+ return res;
+ }
+
+ private float getProbVecSum(int numSamples, int numReduceres) throws Exception {
+ DataBag samples = generateUniqueSamples(numSamples);
+ Map<String, Object> res = getFindQuantilesResult(samples, numReduceres);
+
InternalMap weightedPartsData = (InternalMap) res.get(FindQuantiles.WEIGHTED_PARTS);
Iterator<Object> it = weightedPartsData.values().iterator();
float[] probVec = getProbVec((Tuple)it.next());