You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by rd...@apache.org on 2010/10/07 01:42:12 UTC
svn commit: r1005293 - in /pig/trunk: ./
src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/
test/org/apache/pig/test/
Author: rding
Date: Wed Oct 6 23:42:11 2010
New Revision: 1005293
URL: http://svn.apache.org/viewvc?rev=1005293&view=rev
Log:
PIG-1668: Order by failed with RuntimeException
Added:
pig/trunk/test/org/apache/pig/test/TestFindQuantiles.java
Removed:
pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/MalFormedProbVecException.java
Modified:
pig/trunk/CHANGES.txt
pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/DiscreteProbabilitySampleGenerator.java
Modified: pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1005293&r1=1005292&r2=1005293&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Wed Oct 6 23:42:11 2010
@@ -209,6 +209,8 @@ PIG-1309: Map-side Cogroup (ashutoshc)
BUG FIXES
+PIG-1668: Order by failed with RuntimeException (rding)
+
PIG-1659: sortinfo is not set for store if there is a filter after ORDER BY (daijy)
PIG-1664: leading '_' in directory/file names should be ignored; the "pigtest" build target should include all pig-related zebra tests. (yanz)
Modified: pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/DiscreteProbabilitySampleGenerator.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/DiscreteProbabilitySampleGenerator.java?rev=1005293&r1=1005292&r2=1005293&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/DiscreteProbabilitySampleGenerator.java (original)
+++ pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/DiscreteProbabilitySampleGenerator.java Wed Oct 6 23:42:11 2010
@@ -19,49 +19,49 @@ package org.apache.pig.backend.hadoop.ex
import java.util.Arrays;
import java.util.Random;
-import org.apache.pig.PigException;
-
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
public class DiscreteProbabilitySampleGenerator {
Random rGen;
float[] probVec;
float epsilon = 0.00001f;
- public DiscreteProbabilitySampleGenerator(float[] probVec) throws MalFormedProbVecException{
+ private static final Log LOG = LogFactory.getLog(DiscreteProbabilitySampleGenerator.class);
+
+ public DiscreteProbabilitySampleGenerator(float[] probVec) {
rGen = new Random();
float sum = 0.0f;
for (float f : probVec) {
sum += f;
}
- if(1-epsilon<=sum && sum<=1+epsilon)
- this.probVec = probVec;
- else {
- int errorCode = 2122;
- String message = "Sum of probabilities should be one: " + Arrays.toString(probVec);
- throw new MalFormedProbVecException(message, errorCode, PigException.BUG);
+ this.probVec = probVec;
+ if (1-epsilon > sum || sum > 1+epsilon) {
+ LOG.info("Sum of probabilities should be near one: " + sum);
}
}
public int getNext(){
double toss = rGen.nextDouble();
// if the uniformly random number that I generated
- // is in the probability range for a given parition,
- // pick that parition
+ // is in the probability range for a given partition,
+ // pick that partition
// For some sample item which occurs only in partitions
// 1 and 2
// say probVec[1] = 0.3
// and probVec[2] = 0.7
- // if our coin toss generate < 0.3, we pick 1 otherwise
- // we pick 2
+ // if our coin toss generates < 0.3, we pick 1; otherwise we pick 2
+ int lastIdx = -1;
for(int i=0;i<probVec.length;i++){
+ if (probVec[i] != 0) lastIdx = i;
toss -= probVec[i];
if(toss<=0.0)
return i;
- }
- return -1;
+ }
+ return lastIdx;
}
- public static void main(String[] args) throws MalFormedProbVecException {
+ public static void main(String[] args) {
float[] vec = { 0, 0.3f, 0.2f, 0, 0, 0.5f };
DiscreteProbabilitySampleGenerator gen = new DiscreteProbabilitySampleGenerator(vec);
CountingMap<Integer> cm = new CountingMap<Integer>();
@@ -73,7 +73,6 @@ public class DiscreteProbabilitySampleGe
@Override
public String toString() {
- // TODO Auto-generated method stub
return Arrays.toString(probVec);
}
Added: pig/trunk/test/org/apache/pig/test/TestFindQuantiles.java
URL: http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/TestFindQuantiles.java?rev=1005293&view=auto
==============================================================================
--- pig/trunk/test/org/apache/pig/test/TestFindQuantiles.java (added)
+++ pig/trunk/test/org/apache/pig/test/TestFindQuantiles.java Wed Oct 6 23:42:11 2010
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.test;
+
+
+import static org.junit.Assert.assertTrue;
+
+import java.util.Iterator;
+import java.util.Map;
+
+import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.partitioners.DiscreteProbabilitySampleGenerator;
+import org.apache.pig.data.BagFactory;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.InternalMap;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+import org.apache.pig.impl.builtin.FindQuantiles;
+import org.junit.Test;
+
+public class TestFindQuantiles {
+
+ private static TupleFactory tFact = TupleFactory.getInstance();
+ private static final float epsilon = 0.00001f;
+
+ @Test
+ public void testFindQuantiles() throws Exception {
+ final int numSamples = 97778;
+ final int numReducers = 1009;
+ float sum = getProbVecSum(numSamples, numReducers);
+ System.out.println("sum: " + sum);
+ assertTrue(sum > (1+epsilon));
+ }
+
+ @Test
+ public void testFindQuantiles2() throws Exception {
+ final int numSamples = 30000;
+ final int numReducers = 3000;
+ float sum = getProbVecSum(numSamples, numReducers);
+ System.out.println("sum: " + sum);
+ assertTrue(sum < (1-epsilon));
+ }
+
+ private float[] getProbVec(Tuple values) throws Exception {
+ float[] probVec = new float[values.size()];
+ for(int i = 0; i < values.size(); i++) {
+ probVec[i] = (Float)values.get(i);
+ }
+ return probVec;
+ }
+
+ private float getProbVecSum(int numSamples, int numReduceres) throws Exception {
+ Tuple in = tFact.newTuple(2);
+ DataBag samples = BagFactory.getInstance().newDefaultBag();
+ for (int i=0; i<numSamples; i++) {
+ Tuple t = tFact.newTuple(1);
+ t.set(0, new Integer(23));
+ samples.add(t);
+ }
+ in.set(0, new Integer(numReduceres));
+ in.set(1, samples);
+
+ FindQuantiles fq = new FindQuantiles();
+
+ Map<String, Object> res = fq.exec(in);
+
+ InternalMap weightedPartsData = (InternalMap) res.get(FindQuantiles.WEIGHTED_PARTS);
+ Iterator<Object> it = weightedPartsData.values().iterator();
+ float[] probVec = getProbVec((Tuple)it.next());
+ new DiscreteProbabilitySampleGenerator(probVec);
+ float sum = 0.0f;
+ for (float f : probVec) {
+ sum += f;
+ }
+ return sum;
+ }
+
+}