You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by rd...@apache.org on 2010/10/07 01:42:12 UTC

svn commit: r1005293 - in /pig/trunk: ./ src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/ test/org/apache/pig/test/

Author: rding
Date: Wed Oct  6 23:42:11 2010
New Revision: 1005293

URL: http://svn.apache.org/viewvc?rev=1005293&view=rev
Log:
PIG-1668: Order by failed with RuntimeException

Added:
    pig/trunk/test/org/apache/pig/test/TestFindQuantiles.java
Removed:
    pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/MalFormedProbVecException.java
Modified:
    pig/trunk/CHANGES.txt
    pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/DiscreteProbabilitySampleGenerator.java

Modified: pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1005293&r1=1005292&r2=1005293&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Wed Oct  6 23:42:11 2010
@@ -209,6 +209,8 @@ PIG-1309: Map-side Cogroup (ashutoshc)
 
 BUG FIXES
 
+PIG-1668: Order by failed with RuntimeException (rding)
+
 PIG-1659: sortinfo is not set for store if there is a filter after ORDER BY (daijy)
 
 PIG-1664: leading '_' in directory/file names should be ignored; the "pigtest" build target should include all pig-related zebra tests. (yanz)

Modified: pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/DiscreteProbabilitySampleGenerator.java
URL: http://svn.apache.org/viewvc/pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/DiscreteProbabilitySampleGenerator.java?rev=1005293&r1=1005292&r2=1005293&view=diff
==============================================================================
--- pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/DiscreteProbabilitySampleGenerator.java (original)
+++ pig/trunk/src/org/apache/pig/backend/hadoop/executionengine/mapReduceLayer/partitioners/DiscreteProbabilitySampleGenerator.java Wed Oct  6 23:42:11 2010
@@ -19,49 +19,49 @@ package org.apache.pig.backend.hadoop.ex
 import java.util.Arrays;
 import java.util.Random;
 
-import org.apache.pig.PigException;
-
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 
 public class DiscreteProbabilitySampleGenerator {
     Random rGen;
     float[] probVec;
     float epsilon = 0.00001f;
         
-    public DiscreteProbabilitySampleGenerator(float[] probVec) throws MalFormedProbVecException{
+    private static final Log LOG = LogFactory.getLog(DiscreteProbabilitySampleGenerator.class);
+    
+    public DiscreteProbabilitySampleGenerator(float[] probVec) {
         rGen = new Random();
         float sum = 0.0f;
         for (float f : probVec) {
             sum += f;
         }
-        if(1-epsilon<=sum && sum<=1+epsilon) 
-            this.probVec = probVec;
-        else {
-            int errorCode = 2122;
-            String message = "Sum of probabilities should be one: " + Arrays.toString(probVec);
-            throw new MalFormedProbVecException(message, errorCode, PigException.BUG);
+        this.probVec = probVec;
+        if (1-epsilon > sum || sum > 1+epsilon) { 
+            LOG.info("Sum of probabilities should be near one: " + sum);
         }
     }
     
     public int getNext(){
         double toss = rGen.nextDouble();
         // if the uniformly random number that I generated
-        // is in the probability range for a given parition,
-        // pick that parition
+        // is in the probability range for a given partition,
+        // pick that partition
         // For some sample item which occurs only in partitions
         // 1 and 2
         // say probVec[1] = 0.3
         // and probVec[2] = 0.7
-        // if our coin toss generate < 0.3, we pick 1 otherwise
-        // we pick 2
+        // if our coin toss generates a value < 0.3, we pick 1; otherwise we pick 2
+        int lastIdx = -1;
         for(int i=0;i<probVec.length;i++){
+            if (probVec[i] != 0) lastIdx = i;
             toss -= probVec[i];
             if(toss<=0.0)
                 return i;
-        }
-        return -1;
+        }        
+        return lastIdx;
     }
     
-    public static void main(String[] args) throws MalFormedProbVecException {
+    public static void main(String[] args) {
         float[] vec = { 0, 0.3f, 0.2f, 0, 0, 0.5f };
         DiscreteProbabilitySampleGenerator gen = new DiscreteProbabilitySampleGenerator(vec);
         CountingMap<Integer> cm = new CountingMap<Integer>();
@@ -73,7 +73,6 @@ public class DiscreteProbabilitySampleGe
 
     @Override
     public String toString() {
-        // TODO Auto-generated method stub
         return Arrays.toString(probVec);
     }
     

Added: pig/trunk/test/org/apache/pig/test/TestFindQuantiles.java
URL: http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/TestFindQuantiles.java?rev=1005293&view=auto
==============================================================================
--- pig/trunk/test/org/apache/pig/test/TestFindQuantiles.java (added)
+++ pig/trunk/test/org/apache/pig/test/TestFindQuantiles.java Wed Oct  6 23:42:11 2010
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.pig.test;
+
+
+import static org.junit.Assert.assertTrue;
+
+import java.util.Iterator;
+import java.util.Map;
+
+import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.partitioners.DiscreteProbabilitySampleGenerator;
+import org.apache.pig.data.BagFactory;
+import org.apache.pig.data.DataBag;
+import org.apache.pig.data.InternalMap;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
+import org.apache.pig.impl.builtin.FindQuantiles;
+import org.junit.Test;
+
+public class TestFindQuantiles {
+    
+    private static TupleFactory tFact = TupleFactory.getInstance();
+    private static final float epsilon = 0.00001f;
+    
+    @Test
+    public void testFindQuantiles() throws Exception {
+       final int numSamples = 97778;
+       final int numReducers = 1009;
+       float sum = getProbVecSum(numSamples, numReducers);
+       System.out.println("sum: " + sum);
+       assertTrue(sum > (1+epsilon));
+    }
+    
+    @Test
+    public void testFindQuantiles2() throws Exception {
+       final int numSamples = 30000;
+       final int numReducers = 3000;
+       float sum = getProbVecSum(numSamples, numReducers);
+       System.out.println("sum: " + sum);
+       assertTrue(sum < (1-epsilon));
+    }
+    
+    private float[] getProbVec(Tuple values) throws Exception {
+        float[] probVec = new float[values.size()];        
+        for(int i = 0; i < values.size(); i++) {
+            probVec[i] = (Float)values.get(i);
+        }
+        return probVec;
+    }
+    
+    private float getProbVecSum(int numSamples, int numReduceres) throws Exception {
+        Tuple in = tFact.newTuple(2);
+        DataBag samples = BagFactory.getInstance().newDefaultBag(); 
+        for (int i=0; i<numSamples; i++) {
+            Tuple t = tFact.newTuple(1);
+            t.set(0, new Integer(23));
+            samples.add(t);
+        }
+        in.set(0, new Integer(numReduceres));
+        in.set(1, samples);
+        
+        FindQuantiles fq = new FindQuantiles();
+        
+        Map<String, Object> res = fq.exec(in);
+        
+        InternalMap weightedPartsData = (InternalMap) res.get(FindQuantiles.WEIGHTED_PARTS);
+        Iterator<Object> it = weightedPartsData.values().iterator();
+        float[] probVec = getProbVec((Tuple)it.next());
+        new DiscreteProbabilitySampleGenerator(probVec);
+        float sum = 0.0f;
+        for (float f : probVec) {
+            sum += f;
+        }
+        return sum;
+    }
+    
+}