You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datafu.apache.org by mh...@apache.org on 2014/02/18 22:18:43 UTC
git commit: DATAFU-31 DistinctBy works incorrectly on string containing minuses
Repository: incubator-datafu
Updated Branches:
refs/heads/master 16a82e8b4 -> d4a5c5d43
DATAFU-31 DistinctBy works incorrectly on string containing minuses
https://issues.apache.org/jira/browse/DATAFU-31
Signed-off-by: Matt Hayes <mh...@linkedin.com>
Project: http://git-wip-us.apache.org/repos/asf/incubator-datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-datafu/commit/d4a5c5d4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-datafu/tree/d4a5c5d4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-datafu/diff/d4a5c5d4
Branch: refs/heads/master
Commit: d4a5c5d434c33c6b614ae08ee4179661fda0d358
Parents: 16a82e8
Author: Jian J. Wang <wj...@dogfavorshot-lm.peking.corp.yahoo.com>
Authored: Sun Feb 16 20:58:02 2014 +0800
Committer: Matt Hayes <mh...@linkedin.com>
Committed: Tue Feb 18 13:18:06 2014 -0800
----------------------------------------------------------------------
src/java/datafu/pig/bags/DistinctBy.java | 25 +++++-----
test/pig/datafu/test/pig/bags/BagTests.java | 62 ++++++++++++++++++++++++
2 files changed, 74 insertions(+), 13 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/d4a5c5d4/src/java/datafu/pig/bags/DistinctBy.java
----------------------------------------------------------------------
diff --git a/src/java/datafu/pig/bags/DistinctBy.java b/src/java/datafu/pig/bags/DistinctBy.java
index 0bbb0e6..a79e4de 100644
--- a/src/java/datafu/pig/bags/DistinctBy.java
+++ b/src/java/datafu/pig/bags/DistinctBy.java
@@ -28,6 +28,7 @@ import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
+import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
@@ -60,9 +61,8 @@ import org.apache.pig.impl.logicalLayer.schema.Schema;
*/
public class DistinctBy extends AccumulatorEvalFunc<DataBag>
{
- private final static String delimiter = "-";
private HashSet<Integer> fields = new HashSet<Integer>();
- private HashSet<String> seen = new HashSet<String>();
+ private HashSet<Tuple> seen = new HashSet<Tuple>();
private DataBag outputBag;
public DistinctBy(String... fields)
@@ -85,10 +85,10 @@ public class DistinctBy extends AccumulatorEvalFunc<DataBag>
DataBag inputBag = (DataBag)input.get(0);
for (Tuple t : inputBag) {
- String distinctString = getDistinctString(t, this.fields);
- if (!seen.contains(distinctString)) {
+ Tuple distinctFieldTuple = getDistinctFieldTuple(t, this.fields);
+ if (!seen.contains(distinctFieldTuple)) {
outputBag.add(t);
- seen.add(distinctString);
+ seen.add(distinctFieldTuple);
}
}
}
@@ -147,17 +147,16 @@ public class DistinctBy extends AccumulatorEvalFunc<DataBag>
}
}
- private String getDistinctString(Tuple t, HashSet<Integer> distinctFieldPositions) throws ExecException {
- String[] tokens = t.toDelimitedString(delimiter).split(delimiter);
- StringBuffer buffer = new StringBuffer();
- for(int i=0; i<tokens.length; i++) {
+ private Tuple getDistinctFieldTuple(Tuple t, HashSet<Integer> distinctFieldPositions) throws ExecException {
+ Tuple fieldTuple = TupleFactory.getInstance().newTuple(distinctFieldPositions.size());
+ int idx = 0;
+ for(int i=0; i<t.size(); i++) {
if (distinctFieldPositions.contains(i)) {
- buffer.append(tokens[i]);
- buffer.append(delimiter);
+ fieldTuple.set(idx, t.get(i));
+ idx++;
}
}
- buffer.substring(0, buffer.length() - delimiter.length());
- return buffer.toString();
+ return fieldTuple;
}
}
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/d4a5c5d4/test/pig/datafu/test/pig/bags/BagTests.java
----------------------------------------------------------------------
diff --git a/test/pig/datafu/test/pig/bags/BagTests.java b/test/pig/datafu/test/pig/bags/BagTests.java
index c9e3f63..80bb0cc 100644
--- a/test/pig/datafu/test/pig/bags/BagTests.java
+++ b/test/pig/datafu/test/pig/bags/BagTests.java
@@ -708,6 +708,68 @@ public class BagTests extends PigTests
"({(Z,1,0),(A,1,0),(B,2,0),(C,3,0),(D,4,0),(E,5,0)})",
"({(A,10,2),(M,50,3),(Z,49,22),(B,1,1)})");
}
+
+ /**
+ register $JAR_PATH
+
+ define DistinctBy datafu.pig.bags.DistinctBy('1', '2');
+
+ data = LOAD 'input' AS (data: bag {T: tuple(a:CHARARRAY, b:map[INT], c:bag{t: tuple(c0:CHARARRAY, c1:INT)})});
+
+ data2 = FOREACH data GENERATE DistinctBy(data);
+
+ --describe data2;
+
+ STORE data2 INTO 'output';
+
+ */
+ @Multiline
+ private String distinctByMultiComplexFieldTest;
+
+ @Test
+ public void distinctByMultiComplexFieldTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(distinctByMultiComplexFieldTest);
+
+ writeLinesToFile("input",
+ "({(a-b,[a#0,b#1],{(a-b,0),(a-b,1)}),(a-c,[b#1,a#0],{(a-b,0),(a-b,1)}),(a-d,[a#1,b#0],{(a-b,1),(a-b,2)})})");
+
+ test.runScript();
+
+ assertOutput(test, "data2",
+ "({(a-b,[b#1,a#0],{(a-b,0),(a-b,1)}),(a-d,[b#0,a#1],{(a-b,1),(a-b,2)})})");
+ }
+
+ /**
+ register $JAR_PATH
+
+ define DistinctBy datafu.pig.bags.DistinctBy('1');
+
+ data = LOAD 'input' AS (data: bag {T: tuple(a:CHARARRAY, b:CHARARRAY)});
+
+ data2 = FOREACH data GENERATE DistinctBy(data);
+
+ --describe data2;
+
+ STORE data2 INTO 'output';
+
+ */
+ @Multiline
+ private String distinctByDelimTest;
+
+ @Test
+ public void distinctByDelimTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(distinctByDelimTest);
+
+ writeLinesToFile("input",
+ "({(a-b,c),(a-b,d)})");
+
+ test.runScript();
+
+ assertOutput(test, "data2",
+ "({(a-b,c),(a-b,d)})");
+ }
@Test
public void distinctByExecTest() throws Exception