You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by ga...@apache.org on 2008/10/30 19:33:44 UTC
svn commit: r709222 - in /hadoop/pig/branches/types:
src/org/apache/pig/builtin/DIFF.java
test/org/apache/pig/test/TestBuiltin.java
Author: gates
Date: Thu Oct 30 11:33:43 2008
New Revision: 709222
URL: http://svn.apache.org/viewvc?rev=709222&view=rev
Log:
PIG-511 Fixed flaws in builtin UDF diff pointed out by Crisitan Ivascu.
Modified:
hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java
hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java
Modified: hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java?rev=709222&r1=709221&r2=709222&view=diff
==============================================================================
--- hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java (original)
+++ hadoop/pig/branches/types/src/org/apache/pig/builtin/DIFF.java Thu Oct 30 11:33:43 2008
@@ -18,7 +18,9 @@
package org.apache.pig.builtin;
import java.io.IOException;
+import java.util.HashSet;
import java.util.Iterator;
+import java.util.Set;
import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
@@ -33,8 +35,6 @@
* will emit any Tuples that are in on of the DataBags but not the other. If the
* fields are values, it will emit tuples with values that do not match.
*
- * @author breed
- *
*/
public class DIFF extends EvalFunc<DataBag> {
TupleFactory mTupleFactory = TupleFactory.getInstance();
@@ -78,48 +78,20 @@
DataBag bag1,
DataBag bag2,
DataBag emitTo) {
- // Create two distinct versions of the bag. This will speed up
- // comparison, and provide us a sorted order so we don't have to do
- // an n^2 lookup.
- DataBag d1 = mBagFactory.newDistinctBag();
- DataBag d2 = mBagFactory.newDistinctBag();
- Iterator<Tuple> i1 = d1.iterator();
- Iterator<Tuple> i2 = d2.iterator();
- while (i1.hasNext()) d1.add(i1.next());
- while (i2.hasNext()) d2.add(i2.next());
-
- i1 = d1.iterator();
- i2 = d2.iterator();
-
- Tuple t1 = i1.next();
- Tuple t2 = i2.next();
-
- while (i1.hasNext() && i2.hasNext()) {
- int c = t1.compareTo(t2);
-
- if (c < 0) {
- // place t1 in the result bag and advance i1
- emitTo.add(t1);
- t1 = i1.next();
- } else if (c > 0) {
- // place t2 in the result bag and advance i2
- emitTo.add(t2);
- t2 = i2.next();
- } else if (c == 0) {
- // put neither in the result bag, advance both iterators
- t1 = i1.next();
- t2 = i2.next();
- }
- }
+ // Build two hash tables and probe with first one, then the other.
+ // This does make the assumption that the distinct set of keys from
+ // each bag will fit in memory.
+ Set<Tuple> s1 = new HashSet<Tuple>();
+ Iterator<Tuple> i1 = bag1.iterator();
+ while (i1.hasNext()) s1.add(i1.next());
+
+ Set<Tuple> s2 = new HashSet<Tuple>();
+ Iterator<Tuple> i2 = bag2.iterator();
+ while (i2.hasNext()) s2.add(i2.next());
+
+ for (Tuple t : s1) if (!s2.contains(t)) emitTo.add(t);
+ for (Tuple t : s2) if (!s1.contains(t)) emitTo.add(t);
- // One ran out, put all the rest of the other (if there are any) in
- // the result bag.
- while (i1.hasNext()) {
- emitTo.add(i1.next());
- }
- while (i2.hasNext()) {
- emitTo.add(i2.next());
- }
}
Modified: hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java
URL: http://svn.apache.org/viewvc/hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java?rev=709222&r1=709221&r2=709222&view=diff
==============================================================================
--- hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java (original)
+++ hadoop/pig/branches/types/test/org/apache/pig/test/TestBuiltin.java Thu Oct 30 11:33:43 2008
@@ -19,6 +19,7 @@
import java.io.File;
import java.io.PrintWriter;
+import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
@@ -890,6 +891,44 @@
assertTrue(f1.equals(f2));
}
+
+ @Test
+ public void testDIFF() throws Exception {
+ // Test it in the case with two bags.
+ BagFactory bf = BagFactory.getInstance();
+ TupleFactory tf = TupleFactory.getInstance();
+
+ DataBag b1 = bf.newDefaultBag();
+ DataBag b2 = bf.newDefaultBag();
+ for (int i = 0; i < 10; i++) b1.add(tf.newTuple(new Integer(i)));
+ for (int i = 0; i < 10; i += 2) b2.add(tf.newTuple(new Integer(i)));
+ Tuple t = tf.newTuple(2);
+ t.set(0, b1);
+ t.set(1, b2);
+ DIFF d = new DIFF();
+ DataBag result = d.exec(t);
+
+ assertEquals(5, result.size());
+ Iterator<Tuple> i = result.iterator();
+ int[] values = new int[5];
+ for (int j = 0; j < 5; j++) values[j] = (Integer)i.next().get(0);
+ Arrays.sort(values);
+ for (int j = 1; j < 10; j += 2) assertEquals(j, values[j/2]);
+
+ // Test it in the case of two objects that are equals
+ t = tf.newTuple(2);
+ t.set(0, new Integer(1));
+ t.set(1, new Integer(1));
+ result = d.exec(t);
+ assertEquals(0, result.size());
+
+ // Test it in the case of two objects that are not equal
+ t = tf.newTuple(2);
+ t.set(0, new Integer(1));
+ t.set(1, new Integer(2));
+ result = d.exec(t);
+ assertEquals(2, result.size());
+ }
private static String getInputType(String typeFor) {
return allowedInput.get(typeFor);