You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datafu.apache.org by mh...@apache.org on 2014/02/18 22:18:43 UTC

git commit: DATAFU-31 DistinctBy works incorrectly on string containing minuses

Repository: incubator-datafu
Updated Branches:
  refs/heads/master 16a82e8b4 -> d4a5c5d43


DATAFU-31 DistinctBy works incorrectly on string containing minuses

https://issues.apache.org/jira/browse/DATAFU-31

Signed-off-by: Matt Hayes <mh...@linkedin.com>


Project: http://git-wip-us.apache.org/repos/asf/incubator-datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-datafu/commit/d4a5c5d4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-datafu/tree/d4a5c5d4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-datafu/diff/d4a5c5d4

Branch: refs/heads/master
Commit: d4a5c5d434c33c6b614ae08ee4179661fda0d358
Parents: 16a82e8
Author: Jian J. Wang <wj...@dogfavorshot-lm.peking.corp.yahoo.com>
Authored: Sun Feb 16 20:58:02 2014 +0800
Committer: Matt Hayes <mh...@linkedin.com>
Committed: Tue Feb 18 13:18:06 2014 -0800

----------------------------------------------------------------------
 src/java/datafu/pig/bags/DistinctBy.java    | 25 +++++-----
 test/pig/datafu/test/pig/bags/BagTests.java | 62 ++++++++++++++++++++++++
 2 files changed, 74 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/d4a5c5d4/src/java/datafu/pig/bags/DistinctBy.java
----------------------------------------------------------------------
diff --git a/src/java/datafu/pig/bags/DistinctBy.java b/src/java/datafu/pig/bags/DistinctBy.java
index 0bbb0e6..a79e4de 100644
--- a/src/java/datafu/pig/bags/DistinctBy.java
+++ b/src/java/datafu/pig/bags/DistinctBy.java
@@ -28,6 +28,7 @@ import org.apache.pig.data.BagFactory;
 import org.apache.pig.data.DataBag;
 import org.apache.pig.data.DataType;
 import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
 import org.apache.pig.impl.logicalLayer.FrontendException;
 import org.apache.pig.impl.logicalLayer.schema.Schema;
 
@@ -60,9 +61,8 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
  */
 public class DistinctBy extends AccumulatorEvalFunc<DataBag>
 {
-  private final static String delimiter = "-";
   private HashSet<Integer> fields = new HashSet<Integer>();
-  private HashSet<String> seen = new HashSet<String>();   
+  private HashSet<Tuple> seen = new HashSet<Tuple>();
   private DataBag outputBag;
   
   public DistinctBy(String... fields)
@@ -85,10 +85,10 @@ public class DistinctBy extends AccumulatorEvalFunc<DataBag>
     
     DataBag inputBag = (DataBag)input.get(0);
     for (Tuple t : inputBag) {
-      String distinctString = getDistinctString(t, this.fields);
-      if (!seen.contains(distinctString)) {
+      Tuple distinctFieldTuple = getDistinctFieldTuple(t, this.fields);
+      if (!seen.contains(distinctFieldTuple)) {
         outputBag.add(t);
-        seen.add(distinctString);
+        seen.add(distinctFieldTuple);
       }
     }
   }
@@ -147,17 +147,16 @@ public class DistinctBy extends AccumulatorEvalFunc<DataBag>
     }
   }
   
-  private String getDistinctString(Tuple t, HashSet<Integer> distinctFieldPositions) throws ExecException {
-    String[] tokens = t.toDelimitedString(delimiter).split(delimiter);
-    StringBuffer buffer = new StringBuffer();
-    for(int i=0; i<tokens.length; i++) {
+  private Tuple getDistinctFieldTuple(Tuple t, HashSet<Integer> distinctFieldPositions) throws ExecException {
+    Tuple fieldTuple = TupleFactory.getInstance().newTuple(distinctFieldPositions.size());
+    int idx = 0;
+    for(int i=0; i<t.size(); i++) {
       if (distinctFieldPositions.contains(i)) {
-        buffer.append(tokens[i]);
-        buffer.append(delimiter);
+        fieldTuple.set(idx, t.get(i));
+        idx++;
       }
     }
-    buffer.substring(0, buffer.length() - delimiter.length());
-    return buffer.toString();
+    return fieldTuple;
   }
 
 }

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/d4a5c5d4/test/pig/datafu/test/pig/bags/BagTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/bags/BagTests.java b/test/pig/datafu/test/pig/bags/BagTests.java
index c9e3f63..80bb0cc 100644
--- a/test/pig/datafu/test/pig/bags/BagTests.java
+++ b/test/pig/datafu/test/pig/bags/BagTests.java
@@ -708,6 +708,68 @@ public class BagTests extends PigTests
                  "({(Z,1,0),(A,1,0),(B,2,0),(C,3,0),(D,4,0),(E,5,0)})",
                  "({(A,10,2),(M,50,3),(Z,49,22),(B,1,1)})");
   }
+ 
+  /**
+  register $JAR_PATH
+
+  define DistinctBy datafu.pig.bags.DistinctBy('1', '2');
+
+  data = LOAD 'input' AS (data: bag {T: tuple(a:CHARARRAY, b:map[INT], c:bag{t: tuple(c0:CHARARRAY, c1:INT)})});
+
+  data2 = FOREACH data GENERATE DistinctBy(data);
+
+  --describe data2;
+
+  STORE data2 INTO 'output';
+
+   */
+  @Multiline
+  private String distinctByMultiComplexFieldTest;
+
+  @Test
+  public void distinctByMultiComplexFieldTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(distinctByMultiComplexFieldTest);
+
+    writeLinesToFile("input",
+                     "({(a-b,[a#0,b#1],{(a-b,0),(a-b,1)}),(a-c,[b#1,a#0],{(a-b,0),(a-b,1)}),(a-d,[a#1,b#0],{(a-b,1),(a-b,2)})})");
+    
+    test.runScript();
+ 
+    assertOutput(test, "data2",
+                 "({(a-b,[b#1,a#0],{(a-b,0),(a-b,1)}),(a-d,[b#0,a#1],{(a-b,1),(a-b,2)})})");
+  }
+
+  /**
+  register $JAR_PATH
+
+  define DistinctBy datafu.pig.bags.DistinctBy('1');
+
+  data = LOAD 'input' AS (data: bag {T: tuple(a:CHARARRAY, b:CHARARRAY)});
+
+  data2 = FOREACH data GENERATE DistinctBy(data);
+
+  --describe data2;
+
+  STORE data2 INTO 'output';
+
+   */
+  @Multiline
+  private String distinctByDelimTest;
+
+  @Test
+  public void distinctByDelimTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(distinctByDelimTest);
+    
+    writeLinesToFile("input",
+                     "({(a-b,c),(a-b,d)})");
+    
+    test.runScript();
+    
+    assertOutput(test, "data2",
+                 "({(a-b,c),(a-b,d)})");
+  }
   
   @Test
   public void distinctByExecTest() throws Exception