You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datafu.apache.org by ja...@apache.org on 2014/05/16 21:53:21 UTC
git commit: DATAFU-42 Simplify BagGroup output
Repository: incubator-datafu
Updated Branches:
refs/heads/master 544e5af08 -> 75162f64d
DATAFU-42 Simplify BagGroup output
(Matthew Hayes via Jarek Jarcec Cecho)
Project: http://git-wip-us.apache.org/repos/asf/incubator-datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-datafu/commit/75162f64
Tree: http://git-wip-us.apache.org/repos/asf/incubator-datafu/tree/75162f64
Diff: http://git-wip-us.apache.org/repos/asf/incubator-datafu/diff/75162f64
Branch: refs/heads/master
Commit: 75162f64ddb60cc879b1ed67fd03bada5b9b5ae4
Parents: 544e5af
Author: Jarek Jarcec Cecho <ja...@apache.org>
Authored: Fri May 16 12:51:58 2014 -0700
Committer: Jarek Jarcec Cecho <ja...@apache.org>
Committed: Fri May 16 12:51:58 2014 -0700
----------------------------------------------------------------------
.../src/main/java/datafu/pig/bags/BagGroup.java | 73 +-
.../java/datafu/test/pig/bags/BagTests.java | 666 ++++++++++---------
.../datafu/1.2.0/datafu/pig/bags/BagGroup.html | 44 +-
.../guide/bag-operations.html.markdown.erb | 41 ++
4 files changed, 470 insertions(+), 354 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/75162f64/datafu-pig/src/main/java/datafu/pig/bags/BagGroup.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/bags/BagGroup.java b/datafu-pig/src/main/java/datafu/pig/bags/BagGroup.java
index 2359d6f..409709c 100644
--- a/datafu-pig/src/main/java/datafu/pig/bags/BagGroup.java
+++ b/datafu-pig/src/main/java/datafu/pig/bags/BagGroup.java
@@ -40,22 +40,45 @@ import datafu.pig.util.AliasableEvalFunc;
/**
* Performs an in-memory group operation on a bag. The first argument is the bag.
- * The second argument is a projection of that bag to the group keys.
+ * The second argument is a projection of that bag to the keys to group by.
*
* <p>
- * Example:
- * <code>
+ * The following example groups input_bag by k. The output is a bag having tuples
+ * consisting of the group key k and a bag with the corresponding (k,v) tuples from input_bag
+ * for that key.
+ * <pre>
+ * {@code
* define BagGroup datafu.pig.bags.BagGroup();
- *
+ *
* data = LOAD 'input' AS (input_bag: bag {T: tuple(k: int, v: chararray)});
* -- ({(1,A),(1,B),(2,A),(2,B),(2,C),(3,A)})
- *
+ *
+ * -- Group input_bag by k
* data2 = FOREACH data GENERATE BagGroup(input_bag, input_bag.(k)) as grouped;
* -- data2: {grouped: {(group: int,input_bag: {T: (k: int,v: chararray)})}}
* -- ({(1,{(1,A),(1,B)}),(2,{(2,A),(2,B),(2,C)}),(3,{(3,A)})})
- * </code>
+ * }
+ * </pre>
+ * </p>
+ *
+ * <p>
+ * If the key k is not needed within the input_bag for the output, it can be projected
+ * out like so:
+ * <pre>
+ * {@code
+ *
+ * data3 = FOREACH data2 {
+ * -- project only the value
+ * projected = FOREACH grouped GENERATE group, input_bag.(v);
+ * GENERATE projected as grouped;
+ * }
+ *
+ * -- data3: {grouped: {(group: int,input_bag: {(v: chararray)})}}
+ * -- ({(1,{(A),(B)}),(2,{(A),(B),(C)}),(3,{(A)})})
+ * }
+ * </pre>
* </p>
- *
+ *
* @author wvaughan
*
*/
@@ -63,14 +86,14 @@ public class BagGroup extends AliasableEvalFunc<DataBag>
{
private final String FIELD_NAMES_PROPERTY = "FIELD_NAMES";
private List<String> fieldNames;
-
+
@Override
public Schema getOutputSchema(Schema input)
{
try {
if (input.size() != 2) {
throw new RuntimeException(String.format("Expected input of format (BAG, PROJECTED_BAG...). Got %d field.", input.size()));
- }
+ }
// Expect the first field to be a bag
FieldSchema bagFieldSchema = input.getField(0);
if (bagFieldSchema.type != DataType.BAG) {
@@ -81,7 +104,7 @@ public class BagGroup extends AliasableEvalFunc<DataBag>
if (projectedBagFieldSchema.type != DataType.BAG) {
throw new RuntimeException(String.format("Expected input of format (BAG, PROJECTED_BAG...). Got %s as second field.", DataType.findTypeName(projectedBagFieldSchema.type)));
}
-
+
String bagName = bagFieldSchema.alias;
// handle named tuples
if (bagFieldSchema.schema.size() == 1) {
@@ -89,14 +112,14 @@ public class BagGroup extends AliasableEvalFunc<DataBag>
if (bagTupleFieldSchema.type == DataType.TUPLE && bagTupleFieldSchema.alias != null) {
bagName = getPrefixedAliasName(bagName, bagTupleFieldSchema.alias);
}
- }
+ }
if (projectedBagFieldSchema.schema.size() == 1) {
FieldSchema projectedBagTupleFieldSchema = projectedBagFieldSchema.schema.getField(0);
if (projectedBagTupleFieldSchema.type == DataType.TUPLE && projectedBagTupleFieldSchema.schema != null) {
projectedBagFieldSchema = projectedBagTupleFieldSchema;
}
}
-
+
// create the output schema for the 'group'
// store the field names for the group keys
Schema groupTupleSchema = new Schema();
@@ -108,7 +131,7 @@ public class BagGroup extends AliasableEvalFunc<DataBag>
groupTupleSchema.add(new FieldSchema(fieldSchema.alias, fieldSchema.type));
}
getInstanceProperties().put(FIELD_NAMES_PROPERTY, fieldNames);
-
+
Schema outputTupleSchema = new Schema();
if (projectedBagFieldSchema.schema.size() > 1) {
// multiple group keys
@@ -117,17 +140,17 @@ public class BagGroup extends AliasableEvalFunc<DataBag>
// single group key
outputTupleSchema.add(new FieldSchema("group", groupTupleSchema.getField(0).type));
}
- outputTupleSchema.add(bagFieldSchema);
-
+ outputTupleSchema.add(bagFieldSchema);
+
return new Schema(new Schema.FieldSchema(
getSchemaName(this.getClass().getName().toLowerCase(), input),
- outputTupleSchema,
+ outputTupleSchema,
DataType.BAG));
} catch (FrontendException e) {
throw new RuntimeException(e);
}
}
-
+
TupleFactory tupleFactory = TupleFactory.getInstance();
BagFactory bagFactory = BagFactory.getInstance();
@@ -137,20 +160,20 @@ public class BagGroup extends AliasableEvalFunc<DataBag>
{
Map<Tuple, List<Tuple>> groups = new HashMap<Tuple, List<Tuple>>();
fieldNames = (List<String>)getInstanceProperties().get(FIELD_NAMES_PROPERTY);
-
- DataBag inputBag = (DataBag)input.get(0);
-
+
+ DataBag inputBag = (DataBag)input.get(0);
+
for (Tuple tuple : inputBag) {
Tuple key = extractKey(tuple);
addGroup(groups, key, tuple);
}
-
+
DataBag outputBag = bagFactory.newDefaultBag();
for (Tuple key : groups.keySet()) {
Tuple outputTuple = tupleFactory.newTuple();
if (fieldNames.size() > 1) {
outputTuple.append(key);
- } else {
+ } else {
outputTuple.append(key.get(0));
}
DataBag groupBag = bagFactory.newDefaultBag();
@@ -160,10 +183,10 @@ public class BagGroup extends AliasableEvalFunc<DataBag>
outputTuple.append(groupBag);
outputBag.add(outputTuple);
}
-
+
return outputBag;
}
-
+
private Tuple extractKey(Tuple tuple) throws ExecException {
Tuple key = tupleFactory.newTuple();
for (String field : fieldNames) {
@@ -171,7 +194,7 @@ public class BagGroup extends AliasableEvalFunc<DataBag>
}
return key;
}
-
+
private void addGroup (Map<Tuple, List<Tuple>> groups, Tuple key, Tuple value) {
if (!groups.containsKey(key)) {
groups.put(key, new LinkedList<Tuple>());
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/75162f64/datafu-pig/src/test/java/datafu/test/pig/bags/BagTests.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/bags/BagTests.java b/datafu-pig/src/test/java/datafu/test/pig/bags/BagTests.java
index a8e0d32..6d21dbe 100644
--- a/datafu-pig/src/test/java/datafu/test/pig/bags/BagTests.java
+++ b/datafu-pig/src/test/java/datafu/test/pig/bags/BagTests.java
@@ -44,155 +44,155 @@ import datafu.test.pig.PigTests;
public class BagTests extends PigTests
{
/**
-
+
define NullToEmptyBag datafu.pig.bags.NullToEmptyBag();
-
+
data = LOAD 'input' AS (B: bag {T: tuple(v:INT)});
-
+
dump data;
-
+
data2 = FOREACH data GENERATE NullToEmptyBag(B) as P;
-
+
dump data2;
-
+
STORE data2 INTO 'output';
*/
@Multiline
private String nullToEmptyBag;
-
+
@Test
public void nullToEmptyBagTest() throws Exception
{
PigTest test = createPigTestFromString(nullToEmptyBag);
-
- writeLinesToFile("input",
+
+ writeLinesToFile("input",
"({(1),(2),(3),(4),(5)})",
"()",
"{(4),(5)})");
-
+
test.runScript();
-
+
assertOutput(test, "data2",
"({(1),(2),(3),(4),(5)})",
"({})",
"({(4),(5)})");
}
-
+
/**
-
+
define EmptyBagToNull datafu.pig.bags.EmptyBagToNull();
-
+
data = LOAD 'input' AS (B: bag {T: tuple(v:INT)});
-
+
dump data;
-
+
data2 = FOREACH data GENERATE EmptyBagToNull(B) as P;
-
+
dump data2;
-
+
STORE data2 INTO 'output';
*/
@Multiline
private String emptyBagToNullTest;
-
+
@Test
public void emptyBagToNullTest() throws Exception
{
PigTest test = createPigTestFromString(emptyBagToNullTest);
-
- writeLinesToFile("input",
+
+ writeLinesToFile("input",
"({(1),(2),(3),(4),(5)})",
"()",
"({})",
"{(4),(5)})");
-
+
test.runScript();
-
+
assertOutput(test, "data2",
"({(1),(2),(3),(4),(5)})",
"()",
"()",
"({(4),(5)})");
}
-
+
/**
-
+
define EmptyBagToNullFields datafu.pig.bags.EmptyBagToNullFields();
-
+
data = LOAD 'input' AS (B: bag {T: tuple(v1:INT,v2:INT)});
-
+
dump data;
-
+
data2 = FOREACH data GENERATE EmptyBagToNullFields(B) as P;
-
+
dump data2;
-
+
STORE data2 INTO 'output';
*/
@Multiline
private String emptyBagToNullFieldsTest;
-
+
@Test
public void emptyBagToNullFieldsTest() throws Exception
{
PigTest test = createPigTestFromString(emptyBagToNullFieldsTest);
-
- writeLinesToFile("input",
+
+ writeLinesToFile("input",
"({(1,1),(2,2),(3,3),(4,4),(5,5)})",
"({})",
"{(4,4),(5,5)})");
-
+
test.runScript();
-
+
assertOutput(test, "data2",
"({(1,1),(2,2),(3,3),(4,4),(5,5)})",
"({(,)})",
"({(4,4),(5,5)})");
}
-
+
/**
-
+
define AppendToBag datafu.pig.bags.AppendToBag();
-
+
data = LOAD 'input' AS (key:INT, B: bag{T: tuple(v:INT)}, T: tuple(v:INT));
-
+
data2 = FOREACH data GENERATE key, AppendToBag(B,T) as B;
-
+
STORE data2 INTO 'output';
*/
@Multiline
private String appendToBagTest;
-
+
@Test
public void appendToBagTest() throws Exception
{
PigTest test = createPigTestFromString(appendToBagTest);
-
- writeLinesToFile("input",
+
+ writeLinesToFile("input",
"1\t{(1),(2),(3)}\t(4)",
"2\t{(10),(20),(30),(40),(50)}\t(60)");
-
+
test.runScript();
-
+
assertOutput(test, "data2",
"(1,{(1),(2),(3),(4)})",
"(2,{(10),(20),(30),(40),(50),(60)})");
}
-
+
/**
-
+
define FirstTupleFromBag datafu.pig.bags.FirstTupleFromBag();
-
+
data = LOAD 'input' AS (key:INT, B: bag{T: tuple(v:INT)});
-
+
data2 = FOREACH data GENERATE key, FirstTupleFromBag(B, null) as B;
-
+
STORE data2 INTO 'output';
*/
@@ -212,101 +212,101 @@ public class BagTests extends PigTests
}
/**
-
+
define PrependToBag datafu.pig.bags.PrependToBag();
-
+
data = LOAD 'input' AS (key:INT, B: bag{T: tuple(v:INT)}, T: tuple(v:INT));
-
+
data2 = FOREACH data GENERATE key, PrependToBag(B,T) as B;
-
+
STORE data2 INTO 'output';
*/
@Multiline
private String prependToBagTest;
-
+
@Test
public void prependToBagTest() throws Exception
{
PigTest test = createPigTestFromString(prependToBagTest);
-
- writeLinesToFile("input",
+
+ writeLinesToFile("input",
"1\t{(1),(2),(3)}\t(4)",
"2\t{(10),(20),(30),(40),(50)}\t(60)");
-
+
test.runScript();
-
+
assertOutput(test, "data2",
"(1,{(4),(1),(2),(3)})",
"(2,{(60),(10),(20),(30),(40),(50)})");
}
-
+
/**
-
+
define BagConcat datafu.pig.bags.BagConcat();
-
+
data = LOAD 'input' AS (A: bag{T: tuple(v:INT)}, B: bag{T: tuple(v:INT)}, C: bag{T: tuple(v:INT)});
-
+
describe data;
-
+
data2 = FOREACH data GENERATE BagConcat(A,B,C);
-
+
describe data2;
-
+
STORE data2 INTO 'output';
*/
@Multiline
private String bagConcatTest;
-
+
@Test
public void bagConcatTest() throws Exception
{
PigTest test = createPigTestFromString(bagConcatTest);
- writeLinesToFile("input",
+ writeLinesToFile("input",
"({(1),(2),(3)}\t{(3),(5),(6)}\t{(10),(13)})",
"({(2),(3),(4)}\t{(5),(5)}\t{(20)})");
-
+
test.runScript();
-
+
assertOutput(test, "data2",
"({(1),(2),(3),(3),(5),(6),(10),(13)})",
"({(2),(3),(4),(5),(5),(20)})");
}
-
+
/**
-
+
define UnorderedPairs datafu.pig.bags.UnorderedPairs();
-
+
data = LOAD 'input' AS (B: bag {T: tuple(v:INT)});
-
+
data2 = FOREACH data GENERATE UnorderedPairs(B) as P;
-
+
data3 = FOREACH data2 GENERATE FLATTEN(P);
-
+
data4 = FOREACH data3 GENERATE FLATTEN(elem1), FLATTEN(elem2);
-
+
data5 = ORDER data4 BY $0, $1;
-
+
STORE data5 INTO 'output';
*/
@Multiline
private String unorderedPairsTest;
-
+
@Test
public void unorderedPairsTest() throws Exception
{
PigTest test = createPigTestFromString(unorderedPairsTest);
-
+
String[] input = {
"{(1),(2),(3),(4),(5)}"
};
-
+
String[] output = {
"(1,2)",
"(1,3)",
@@ -319,37 +319,37 @@ public class BagTests extends PigTests
"(3,5)",
"(4,5)"
};
-
+
test.assertOutput("data",input,"data4",output);
}
-
+
/**
-
+
define UnorderedPairs datafu.pig.bags.UnorderedPairs();
-
+
data = LOAD 'input' AS (A:int, B: bag {T: tuple(v:INT)});
-
+
data2 = FOREACH data GENERATE A, UnorderedPairs(B) as P;
-
+
data3 = FOREACH data2 GENERATE A, FLATTEN(P);
-
+
STORE data3 INTO 'output';
*/
@Multiline
private String unorderedPairsTest2;
-
+
@Test
public void unorderedPairsTest2() throws Exception
{
PigTest test = createPigTestFromString(unorderedPairsTest2);
-
+
this.writeLinesToFile("input", "1\t{(1),(2),(3),(4),(5)}");
-
+
test.runScript();
this.getLinesForAlias(test, "data3");
-
+
this.assertOutput(test, "data3",
"(1,(1),(2))",
"(1,(1),(3))",
@@ -360,110 +360,110 @@ public class BagTests extends PigTests
"(1,(2),(5))",
"(1,(3),(4))",
"(1,(3),(5))",
- "(1,(4),(5))");
+ "(1,(4),(5))");
}
-
+
/**
-
+
define BagSplit datafu.pig.bags.BagSplit();
-
+
data = LOAD 'input' AS (B:bag{T:tuple(val1:INT,val2:INT)});
-
+
data2 = FOREACH data GENERATE BagSplit($MAX,B);
--describe data2;
-
+
data3 = FOREACH data2 GENERATE FLATTEN($0);
-
+
--describe data3
-
+
STORE data3 INTO 'output';
*/
@Multiline
private String bagSplitTest;
-
+
@Test
public void bagSplitTest() throws Exception
{
PigTest test = createPigTestFromString(bagSplitTest,
"MAX=5");
-
- writeLinesToFile("input",
+
+ writeLinesToFile("input",
"{(1,11),(2,22),(3,33),(4,44),(5,55),(6,66),(7,77),(8,88),(9,99),(10,1010),(11,1111),(12,1212)}");
-
+
test.runScript();
-
+
assertOutput(test, "data3",
"({(1,11),(2,22),(3,33),(4,44),(5,55)})",
"({(6,66),(7,77),(8,88),(9,99),(10,1010)})",
"({(11,1111),(12,1212)})");
}
-
+
/**
-
+
define BagSplit datafu.pig.bags.BagSplit('true');
-
+
data = LOAD 'input' AS (B:bag{T:tuple(val1:INT,val2:INT)});
-
+
data2 = FOREACH data GENERATE BagSplit($MAX,B);
-
+
data3 = FOREACH data2 GENERATE FLATTEN($0);
-
+
STORE data3 INTO 'output';
*/
@Multiline
private String bagSplitWithBagNumTest;
-
+
@Test
public void bagSplitWithBagNumTest() throws Exception
{
PigTest test = createPigTestFromString(bagSplitWithBagNumTest,
"MAX=10");
-
- writeLinesToFile("input",
+
+ writeLinesToFile("input",
"{(1,11),(2,22),(3,33),(4,44),(5,55),(6,66),(7,77),(8,88),(9,99),(10,1010),(11,1111),(12,1212)}");
-
+
test.runScript();
-
+
assertOutput(test, "data3",
"({(1,11),(2,22),(3,33),(4,44),(5,55),(6,66),(7,77),(8,88),(9,99),(10,1010)},0)",
"({(11,1111),(12,1212)},1)");
}
-
+
/**
-
+
define Enumerate datafu.pig.bags.ReverseEnumerate('1');
-
+
data = LOAD 'input' AS (data: bag {T: tuple(v1:INT,B: bag{T: tuple(v2:INT)})});
-
+
data2 = FOREACH data GENERATE Enumerate(data);
--describe data2;
-
+
data3 = FOREACH data2 GENERATE FLATTEN($0);
--describe data3;
-
+
data4 = FOREACH data3 GENERATE $0 as v1, $1 as B, $2 as i;
--describe data4;
-
+
STORE data4 INTO 'output';
*/
@Multiline
private String enumerateWithReverseTest;
-
+
@Test
public void enumerateWithReverseTest() throws Exception
{
PigTest test = createPigTestFromString(enumerateWithReverseTest);
-
- writeLinesToFile("input",
+
+ writeLinesToFile("input",
"({(10,{(1),(2),(3)}),(20,{(4),(5),(6)}),(30,{(7),(8)}),(40,{(9),(10),(11)}),(50,{(12),(13),(14),(15)})})");
-
+
test.runScript();
-
+
assertOutput(test, "data4",
"(10,{(1),(2),(3)},5)",
"(20,{(4),(5),(6)},4)",
@@ -471,39 +471,39 @@ public class BagTests extends PigTests
"(40,{(9),(10),(11)},2)",
"(50,{(12),(13),(14),(15)},1)");
}
-
+
/**
-
+
define Enumerate datafu.pig.bags.Enumerate('1');
-
+
data = LOAD 'input' AS (data: bag {T: tuple(v1:INT,B: bag{T: tuple(v2:INT)})});
-
+
data2 = FOREACH data GENERATE Enumerate(data);
--describe data2;
-
+
data3 = FOREACH data2 GENERATE FLATTEN($0);
--describe data3;
-
+
data4 = FOREACH data3 GENERATE $0 as v1, $1 as B, $2 as i;
--describe data4;
-
+
STORE data4 INTO 'output';
*/
@Multiline
private String enumerateWithStartTest;
-
+
@Test
public void enumerateWithStartTest() throws Exception
{
PigTest test = createPigTestFromString(enumerateWithStartTest);
-
- writeLinesToFile("input",
+
+ writeLinesToFile("input",
"({(10,{(1),(2),(3)}),(20,{(4),(5),(6)}),(30,{(7),(8)}),(40,{(9),(10),(11)}),(50,{(12),(13),(14),(15)})})");
-
+
test.runScript();
-
+
assertOutput(test, "data4",
"(10,{(1),(2),(3)},1)",
"(20,{(4),(5),(6)},2)",
@@ -511,39 +511,39 @@ public class BagTests extends PigTests
"(40,{(9),(10),(11)},4)",
"(50,{(12),(13),(14),(15)},5)");
}
-
+
/**
-
+
define Enumerate datafu.pig.bags.Enumerate();
-
+
data = LOAD 'input' AS (data: bag {T: tuple(v1:INT,B: bag{T: tuple(v2:INT)})});
-
+
data2 = FOREACH data GENERATE Enumerate(data);
--describe data2;
-
+
data3 = FOREACH data2 GENERATE FLATTEN($0);
--describe data3;
-
+
data4 = FOREACH data3 GENERATE $0 as v1, $1 as B, $2 as i;
--describe data4;
-
+
STORE data4 INTO 'output';
*/
@Multiline
private String enumerateTest;
-
+
@Test
public void enumerateTest() throws Exception
{
PigTest test = createPigTestFromString(enumerateTest);
-
+
writeLinesToFile("input",
"({(10,{(1),(2),(3)}),(20,{(4),(5),(6)}),(30,{(7),(8)}),(40,{(9),(10),(11)}),(50,{(12),(13),(14),(15)})})");
-
+
test.runScript();
-
+
assertOutput(test, "data4",
"(10,{(1),(2),(3)},0)",
"(20,{(4),(5),(6)},1)",
@@ -551,18 +551,18 @@ public class BagTests extends PigTests
"(40,{(9),(10),(11)},3)",
"(50,{(12),(13),(14),(15)},4)");
}
-
+
@Test
public void enumerateTest2() throws Exception
{
PigTest test = createPigTestFromString(enumerateTest);
-
+
writeLinesToFile("input",
"({(10,{(1),(2),(3)}),(20,{(4),(5),(6)}),(30,{(7),(8)}),(40,{(9),(10),(11)}),(50,{(12),(13),(14),(15)})})",
"({(11,{(11),(12),(13),(14)}),(21,{(15),(16),(17),(18)}),(31,{(19),(20)}),(41,{(21),(22),(23),(24)}),(51,{(25),(26),(27)})})");
-
+
test.runScript();
-
+
assertOutput(test, "data4",
"(10,{(1),(2),(3)},0)",
"(20,{(4),(5),(6)},1)",
@@ -574,46 +574,46 @@ public class BagTests extends PigTests
"(31,{(19),(20)},2)",
"(41,{(21),(22),(23),(24)},3)",
"(51,{(25),(26),(27)},4)");
- }
-
- /*
+ }
+
+ /*
* Testing "Accumulator" part of Enumeration by manually invoking accumulate(), getValue() and cleanup()
*/
@Test
public void enumerateAccumulatorTest() throws Exception
{
- Enumerate enumerate = new Enumerate();
-
+ Enumerate enumerate = new Enumerate();
+
Tuple tuple1 = TupleFactory.getInstance().newTuple(1);
tuple1.set(0, 10);
-
+
Tuple tuple2 = TupleFactory.getInstance().newTuple(1);
tuple2.set(0, 20);
-
+
Tuple tuple3 = TupleFactory.getInstance().newTuple(1);
tuple3.set(0, 30);
-
+
Tuple tuple4 = TupleFactory.getInstance().newTuple(1);
tuple4.set(0, 40);
-
+
Tuple tuple5 = TupleFactory.getInstance().newTuple(1);
tuple5.set(0, 50);
-
+
DataBag bag1 = BagFactory.getInstance().newDefaultBag();
bag1.add(tuple1);
bag1.add(tuple2);
bag1.add(tuple3);
-
+
DataBag bag2 = BagFactory.getInstance().newDefaultBag();
bag2.add(tuple4);
bag2.add(tuple5);
-
+
Tuple inputTuple1 = TupleFactory.getInstance().newTuple(1);
inputTuple1.set(0,bag1);
-
+
Tuple inputTuple2 = TupleFactory.getInstance().newTuple(1);
inputTuple2.set(0,bag2);
-
+
enumerate.accumulate(inputTuple1);
enumerate.accumulate(inputTuple2);
assertEquals(enumerate.getValue().toString(), "{(10,0),(20,1),(30,2),(40,3),(50,4)}");
@@ -622,50 +622,50 @@ public class BagTests extends PigTests
enumerate.cleanup();
enumerate.accumulate(inputTuple1);
enumerate.accumulate(inputTuple2);
- assertEquals(enumerate.getValue().toString(), "{(10,0),(20,1),(30,2),(40,3),(50,4)}");
+ assertEquals(enumerate.getValue().toString(), "{(10,0),(20,1),(30,2),(40,3),(50,4)}");
}
-
+
/**
-
+
define BagSplit datafu.pig.bags.BagSplit();
define Enumerate datafu.pig.bags.Enumerate('1');
-
+
data = LOAD 'input' AS (data: bag {T: tuple(name:CHARARRAY, score:double)});
-
+
data2 = FOREACH data GENERATE BagSplit(3,data) as the_bags;
-
+
--describe data2
-
+
data3 = FOREACH data2 GENERATE Enumerate(the_bags) as enumerated_bags;
-
+
--describe data3
-
+
data4 = FOREACH data3 GENERATE FLATTEN(enumerated_bags) as (data,i);
-
+
--describe data4
-
+
data5 = FOREACH data4 GENERATE data as the_data, i as the_key;
-
+
--describe data5
-
+
data_out = FOREACH data5 GENERATE FLATTEN(the_data), the_key;
-
+
--describe data_out
*/
@Multiline
private String comprehensiveBagSplitAndEnumerate;
-
+
@Test
public void comprehensiveBagSplitAndEnumerate() throws Exception
{
PigTest test = createPigTestFromString(comprehensiveBagSplitAndEnumerate);
-
+
writeLinesToFile("input",
"({(A,1.0),(B,2.0),(C,3.0),(D,4.0),(E,5.0)})");
-
+
test.runScript();
-
+
assertOutput(test, "data_out",
// bag #1
"(A,1.0,1)",
@@ -675,18 +675,18 @@ public class BagTests extends PigTests
"(D,4.0,2)",
"(E,5.0,2)");
}
-
+
/**
-
+
define DistinctBy datafu.pig.bags.DistinctBy('0');
-
+
data = LOAD 'input' AS (data: bag {T: tuple(a:CHARARRAY, b:INT, c:INT)});
-
+
data2 = FOREACH data GENERATE DistinctBy(data);
-
+
--describe data2;
-
+
STORE data2 INTO 'output';
*/
@@ -697,18 +697,18 @@ public class BagTests extends PigTests
public void distinctByTest() throws Exception
{
PigTest test = createPigTestFromString(distinctByTest);
-
+
writeLinesToFile("input",
"({(Z,1,0),(A,1,0),(A,1,0),(B,2,0),(B,22,1),(C,3,0),(D,4,0),(E,5,0)})",
"({(A,10,2),(M,50,3),(A,34,49), (A,24,42), (Z,49,22),(B,1,1)},(B,2,2))");
-
+
test.runScript();
-
+
assertOutput(test, "data2",
"({(Z,1,0),(A,1,0),(B,2,0),(C,3,0),(D,4,0),(E,5,0)})",
"({(A,10,2),(M,50,3),(Z,49,22),(B,1,1)})");
}
-
+
/**
define DistinctBy datafu.pig.bags.DistinctBy('1', '2');
@@ -732,9 +732,9 @@ public class BagTests extends PigTests
writeLinesToFile("input",
"({(a-b,[a#0,b#1],{(a-b,0),(a-b,1)}),(a-c,[b#1,a#0],{(a-b,0),(a-b,1)}),(a-d,[a#1,b#0],{(a-b,1),(a-b,2)})})");
-
+
test.runScript();
-
+
assertOutput(test, "data2",
"({(a-b,[b#1,a#0],{(a-b,0),(a-b,1)}),(a-d,[b#0,a#1],{(a-b,1),(a-b,2)})})");
}
@@ -759,104 +759,104 @@ public class BagTests extends PigTests
public void distinctByDelimTest() throws Exception
{
PigTest test = createPigTestFromString(distinctByDelimTest);
-
+
writeLinesToFile("input",
"({(a-b,c),(a-b,d)})");
-
+
test.runScript();
-
+
assertOutput(test, "data2",
"({(a-b,c),(a-b,d)})");
}
-
+
@Test
public void distinctByExecTest() throws Exception
{
DistinctBy distinct = new DistinctBy("0");
-
+
DataBag bag;
Tuple input;
Tuple data;
-
+
bag = BagFactory.getInstance().newDefaultBag();
input = TupleFactory.getInstance().newTuple(bag);
-
+
data = TupleFactory.getInstance().newTuple(2);
bag.add(data);
data.set(0, 10);
data.set(1, 20);
-
+
data = TupleFactory.getInstance().newTuple(2);
bag.add(data);
data.set(0, 11);
data.set(1, 50);
-
+
data = TupleFactory.getInstance().newTuple(2);
bag.add(data);
data.set(0, 10);
data.set(1, 22);
-
+
data = TupleFactory.getInstance().newTuple(2);
bag.add(data);
data.set(0, 12);
data.set(1, 40);
-
+
data = TupleFactory.getInstance().newTuple(2);
bag.add(data);
data.set(0, 11);
data.set(1, 50);
-
+
data = TupleFactory.getInstance().newTuple(2);
bag.add(data);
data.set(0, 11);
data.set(1, 51);
-
+
DataBag result = distinct.exec(input);
-
+
Assert.assertEquals(3, result.size());
-
+
Iterator<Tuple> iter = result.iterator();
Assert.assertEquals("(10,20)", iter.next().toString());
Assert.assertEquals("(11,50)", iter.next().toString());
Assert.assertEquals("(12,40)", iter.next().toString());
-
+
// do it again to test cleanup
bag = BagFactory.getInstance().newDefaultBag();
input = TupleFactory.getInstance().newTuple(bag);
-
+
data = TupleFactory.getInstance().newTuple(2);
bag.add(data);
data.set(0, 12);
data.set(1, 42);
-
+
data = TupleFactory.getInstance().newTuple(2);
bag.add(data);
data.set(0, 11);
data.set(1, 51);
-
+
data = TupleFactory.getInstance().newTuple(2);
bag.add(data);
data.set(0, 11);
data.set(1, 50);
-
+
result = distinct.exec(input);
-
+
Assert.assertEquals(2, result.size());
-
+
iter = result.iterator();
Assert.assertEquals("(12,42)", iter.next().toString());
Assert.assertEquals("(11,51)", iter.next().toString());
}
-
+
@Test
public void distinctByAccumulateTest() throws Exception
{
DistinctBy distinct = new DistinctBy("0");
-
+
DataBag bag;
Tuple input;
Tuple data;
-
+
data = TupleFactory.getInstance().newTuple(2);
bag = BagFactory.getInstance().newDefaultBag();
bag.add(data);
@@ -864,7 +864,7 @@ public class BagTests extends PigTests
data.set(0, 10);
data.set(1, 20);
distinct.accumulate(input);
-
+
data = TupleFactory.getInstance().newTuple(2);
bag = BagFactory.getInstance().newDefaultBag();
bag.add(data);
@@ -872,7 +872,7 @@ public class BagTests extends PigTests
data.set(0, 11);
data.set(1, 50);
distinct.accumulate(input);
-
+
data = TupleFactory.getInstance().newTuple(2);
bag = BagFactory.getInstance().newDefaultBag();
bag.add(data);
@@ -880,7 +880,7 @@ public class BagTests extends PigTests
data.set(0, 10);
data.set(1, 22);
distinct.accumulate(input);
-
+
data = TupleFactory.getInstance().newTuple(2);
bag = BagFactory.getInstance().newDefaultBag();
bag.add(data);
@@ -888,7 +888,7 @@ public class BagTests extends PigTests
data.set(0, 12);
data.set(1, 40);
distinct.accumulate(input);
-
+
data = TupleFactory.getInstance().newTuple(2);
bag = BagFactory.getInstance().newDefaultBag();
bag.add(data);
@@ -896,7 +896,7 @@ public class BagTests extends PigTests
data.set(0, 11);
data.set(1, 50);
distinct.accumulate(input);
-
+
data = TupleFactory.getInstance().newTuple(2);
bag = BagFactory.getInstance().newDefaultBag();
bag.add(data);
@@ -904,19 +904,19 @@ public class BagTests extends PigTests
data.set(0, 11);
data.set(1, 51);
distinct.accumulate(input);
-
+
DataBag result = distinct.getValue();
-
+
Assert.assertEquals(3, result.size());
-
+
Iterator<Tuple> iter = result.iterator();
Assert.assertEquals("(10,20)", iter.next().toString());
Assert.assertEquals("(11,50)", iter.next().toString());
Assert.assertEquals("(12,40)", iter.next().toString());
-
+
// do it again to test cleanup
distinct.cleanup();
-
+
data = TupleFactory.getInstance().newTuple(2);
bag = BagFactory.getInstance().newDefaultBag();
bag.add(data);
@@ -924,7 +924,7 @@ public class BagTests extends PigTests
data.set(0, 12);
data.set(1, 42);
distinct.accumulate(input);
-
+
data = TupleFactory.getInstance().newTuple(2);
bag = BagFactory.getInstance().newDefaultBag();
bag.add(data);
@@ -932,7 +932,7 @@ public class BagTests extends PigTests
data.set(0, 11);
data.set(1, 51);
distinct.accumulate(input);
-
+
data = TupleFactory.getInstance().newTuple(2);
bag = BagFactory.getInstance().newDefaultBag();
bag.add(data);
@@ -940,98 +940,98 @@ public class BagTests extends PigTests
data.set(0, 11);
data.set(1, 50);
distinct.accumulate(input);
-
+
result = distinct.getValue();
-
+
Assert.assertEquals(2, result.size());
-
+
iter = result.iterator();
Assert.assertEquals("(12,42)", iter.next().toString());
Assert.assertEquals("(11,51)", iter.next().toString());
}
-
+
/**
-
+
define CountEach datafu.pig.bags.CountEach();
-
+
data = LOAD 'input' AS (data: bag {T: tuple(v1:chararray)});
-
+
data2 = FOREACH data GENERATE CountEach(data) as counted;
--describe data2;
-
+
data3 = FOREACH data2 {
ordered = ORDER counted BY count DESC;
GENERATE ordered;
}
--describe data3
-
+
STORE data3 INTO 'output';
*/
@Multiline
private String countEachTest;
-
- @Test
+
+ @Test
public void countEachTest() throws Exception
{
PigTest test = createPigTestFromString(countEachTest);
- writeLinesToFile("input",
+ writeLinesToFile("input",
"({(A),(B),(A),(C),(A),(B)})");
-
+
test.runScript();
-
+
assertOutput(test, "data3",
"({((A),3),((B),2),((C),1)})");
}
-
- @Test
+
+ @Test
public void countEachExecAndAccumulateTest() throws Exception
- {
+ {
for (int c=0; c<2; c++)
{
CountEach countEach = new CountEach("flatten");
-
+
DataBag bag = BagFactory.getInstance().newDefaultBag();
- {
+ {
Tuple t = TupleFactory.getInstance().newTuple(1);
t.set(0, "A");
bag.add(t);
}
- {
+ {
Tuple t = TupleFactory.getInstance().newTuple(1);
t.set(0, "B");
bag.add(t);
}
- {
+ {
Tuple t = TupleFactory.getInstance().newTuple(1);
t.set(0, "B");
bag.add(t);
}
- {
+ {
Tuple t = TupleFactory.getInstance().newTuple(1);
t.set(0, "C");
bag.add(t);
}
- {
+ {
Tuple t = TupleFactory.getInstance().newTuple(1);
t.set(0, "A");
bag.add(t);
}
- {
+ {
Tuple t = TupleFactory.getInstance().newTuple(1);
t.set(0, "D");
bag.add(t);
}
-
+
DataBag output = null;
-
+
if (c == 0)
{
Tuple input = TupleFactory.getInstance().newTuple(1);
input.set(0, bag);
-
+
System.out.println("Testing exec");
output = countEach.exec(input);
}
@@ -1046,21 +1046,21 @@ public class BagTests extends PigTests
input.set(0, tb);
countEach.accumulate(input);
}
-
+
output = countEach.getValue();
-
- countEach.cleanup();
+
+ countEach.cleanup();
Assert.assertEquals(0, countEach.getValue().size());
}
-
+
System.out.println(output.toString());
-
+
Assert.assertEquals(4, output.size());
Set<String> found = new HashSet<String>();
for (Tuple t : output)
{
- String key = (String)t.get(0);
- found.add(key);
+ String key = (String)t.get(0);
+ found.add(key);
if (key == "A")
{
Assert.assertEquals(2, t.get(1));
@@ -1085,94 +1085,94 @@ public class BagTests extends PigTests
Assert.assertEquals(4, found.size());
}
}
-
+
/**
-
+
define CountEach datafu.pig.bags.CountEach('flatten');
-
+
data = LOAD 'input' AS (data: bag {T: tuple(v1:chararray)});
-
+
data2 = FOREACH data GENERATE CountEach(data) as counted;
--describe data2;
-
+
data3 = FOREACH data2 {
ordered = ORDER counted BY count DESC;
GENERATE ordered;
}
--describe data3
-
+
STORE data3 INTO 'output';
*/
@Multiline
private String countEachFlattenTest;
-
- @Test
+
+ @Test
public void countEachFlattenTest() throws Exception
{
PigTest test = createPigTestFromString(countEachFlattenTest);
- writeLinesToFile("input",
+ writeLinesToFile("input",
"({(A),(B),(A),(C),(A),(B)})");
-
+
test.runScript();
-
+
assertOutput(test, "data3",
"({(A,3),(B,2),(C,1)})");
}
-
+
/**
-
+
define BagLeftOuterJoin datafu.pig.bags.BagLeftOuterJoin();
-
+
data = LOAD 'input' AS (outer_key:chararray, bag1:bag{T:tuple(k:chararray,v:chararray)}, bag2:bag{T:tuple(k:chararray,v:chararray)}, bag3:bag{T:tuple(k3:chararray,v3:chararray)});
describe data;
-
- data2 = FOREACH data GENERATE
- outer_key,
+
+ data2 = FOREACH data GENERATE
+ outer_key,
BagLeftOuterJoin(bag1, 'k', bag2, 'k', bag3, 'k3') as joined1,
BagLeftOuterJoin(bag1, 'k', bag3, 'k3', bag2, 'k') as joined2; --this will break without UDF signature and pig < 0.11
describe data2;
-
+
STORE data2 INTO 'output';
*/
@Multiline
private String bagLeftOuterJoinTest;
-
- @Test
+
+ @Test
public void bagLeftOuterJoinTest() throws Exception
{
PigTest test = createPigTestFromString(bagLeftOuterJoinTest);
- writeLinesToFile("input",
+ writeLinesToFile("input",
"1\t{(K1,A1),(K2,B1),(K3,C1)}\t{(K1,A2),(K2,B2),(K2,B22)}\t{(K1,A3),(K3,C3),(K4,D3)}");
-
+
test.runScript();
-
+
assertOutput(test, "data2",
"(1,{(K1,A1,K1,A2,K1,A3),(K2,B1,K2,B2,,),(K2,B1,K2,B22,,),(K3,C1,,,K3,C3)},{(K1,A1,K1,A3,K1,A2),(K2,B1,,,K2,B2),(K2,B1,,,K2,B22),(K3,C1,K3,C3,,)})");
}
-
+
/**
-
+
define BagUnion datafu.pig.bags.BagConcat();
-
+
data = LOAD 'input' AS (input_bag: bag {T: tuple(inner_bag: bag {T2: tuple(k: int, v: chararray)})});
describe data;
-
+
data2 = FOREACH data GENERATE BagUnion(input_bag) as unioned;
describe data2;
-
+
STORE data INTO 'output';
*/
@Multiline
private String bagUnionTest;
-
+
@Test
public void bagUnionTest() throws Exception
{
@@ -1181,67 +1181,105 @@ public class BagTests extends PigTests
test.runScript();
assertOutput(test, "data2", "({(1,A),(1,B),(2,A),(2,B),(2,C),(3,A)})");
}
-
+
/**
-
+
define BagGroup datafu.pig.bags.BagGroup();
-
+
data = LOAD 'input' AS (input_bag: bag {T: tuple(k: chararray, v: chararray)});
describe data;
-
+
data2 = FOREACH data GENERATE BagGroup(input_bag, input_bag.k) as grouped;
describe data2;
-
+
data3 = FOREACH data2 {
ordered = ORDER grouped BY group;
GENERATE
ordered as grouped;
}
describe data3;
-
+
STORE data INTO 'output';
*/
@Multiline
private String bagGroupSingleTest;
-
+
@Test
public void bagGroupSingleTest() throws Exception
{
PigTest test = createPigTestFromString(bagGroupSingleTest);
writeLinesToFile("input", "({(1,A),(1,B),(2,A),(2,B),(2,C),(3,A)})",
"({(A,1),(B,1),(A,2),(B,2),(C,2),(A,3)})");
- test.runScript();
+ test.runScript();
getLinesForAlias(test, "data2", true);
assertOutput(test, "data3", "({(1,{(1,A),(1,B)}),(2,{(2,A),(2,B),(2,C)}),(3,{(3,A)})})",
"({(A,{(A,1),(A,2),(A,3)}),(B,{(B,1),(B,2)}),(C,{(C,2)})})");
}
-
+
/**
-
+
define BagGroup datafu.pig.bags.BagGroup();
-
+
+ data = LOAD 'input' AS (input_bag: bag {T: tuple(k: chararray, v: chararray)});
+ describe data;
+
+ data2 = FOREACH data GENERATE BagGroup(input_bag, input_bag.k) as grouped;
+ describe data2;
+
+ data3 = FOREACH data2 {
+ ordered = ORDER grouped BY group;
+ -- project only the value
+ projected = FOREACH ordered GENERATE group, input_bag.(v);
+ GENERATE
+ projected as grouped;
+ }
+ describe data3;
+
+ STORE data INTO 'output';
+
+ */
+ @Multiline
+ private String bagGroupProjectTest;
+
+ @Test(description="BagGroup with projection of the value from the bag")
+ public void bagGroupProjectTest() throws Exception
+ {
+ PigTest test = createPigTestFromString(bagGroupProjectTest);
+ writeLinesToFile("input", "({(1,A),(1,B),(2,A),(2,B),(2,C),(3,A)})",
+ "({(A,1),(B,1),(A,2),(B,2),(C,2),(A,3)})");
+ test.runScript();
+ getLinesForAlias(test, "data2", true);
+ assertOutput(test, "data3", "({(1,{(A),(B)}),(2,{(A),(B),(C)}),(3,{(A)})})",
+ "({(A,{(1),(2),(3)}),(B,{(1),(2)}),(C,{(2)})})");
+ }
+
+ /**
+
+
+ define BagGroup datafu.pig.bags.BagGroup();
+
data = LOAD 'input' AS (input_bag: bag {T: tuple(k: int, k2: chararray, v: int)});
describe data;
-
+
data2 = FOREACH data GENERATE BagGroup(input_bag, input_bag.(k, k2)) as grouped;
describe data2;
-
+
data3 = FOREACH data2 {
ordered = ORDER grouped BY group;
GENERATE
ordered as grouped;
}
describe data3;
-
+
STORE data INTO 'output';
*/
@Multiline
private String bagGroupMultipleTest;
-
+
@Test
public void bagGroupMultipleTest() throws Exception
{
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/75162f64/site/source/docs/datafu/1.2.0/datafu/pig/bags/BagGroup.html
----------------------------------------------------------------------
diff --git a/site/source/docs/datafu/1.2.0/datafu/pig/bags/BagGroup.html b/site/source/docs/datafu/1.2.0/datafu/pig/bags/BagGroup.html
index 2f27539..35c2557 100644
--- a/site/source/docs/datafu/1.2.0/datafu/pig/bags/BagGroup.html
+++ b/site/source/docs/datafu/1.2.0/datafu/pig/bags/BagGroup.html
@@ -2,12 +2,12 @@
<!--NewPage-->
<HTML>
<HEAD>
-<!-- Generated by javadoc (build 1.6.0_27) on Thu Dec 12 19:05:53 PST 2013 -->
+<!-- Generated by javadoc (build 1.6.0_27) on Mon Apr 28 10:08:50 PDT 2014 -->
<TITLE>
BagGroup (DataFu 1.2.0)
</TITLE>
-<META NAME="date" CONTENT="2013-12-12">
+<META NAME="date" CONTENT="2014-04-28">
<LINK REL ="stylesheet" TYPE="text/css" HREF="../../../stylesheet.css" TITLE="Style">
@@ -40,7 +40,6 @@ function windowTitle()
<TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../overview-summary.html"><FONT CLASS="NavBarFont1"><B>Overview</B></FONT></A> </TD>
<TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="package-summary.html"><FONT CLASS="NavBarFont1"><B>Package</B></FONT></A> </TD>
<TD BGCOLOR="#FFFFFF" CLASS="NavBarCell1Rev"> <FONT CLASS="NavBarFont1Rev"><B>Class</B></FONT> </TD>
- <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="class-use/BagGroup.html"><FONT CLASS="NavBarFont1"><B>Use</B></FONT></A> </TD>
<TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="package-tree.html"><FONT CLASS="NavBarFont1"><B>Tree</B></FONT></A> </TD>
<TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../deprecated-list.html"><FONT CLASS="NavBarFont1"><B>Deprecated</B></FONT></A> </TD>
<TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../index-all.html"><FONT CLASS="NavBarFont1"><B>Index</B></FONT></A> </TD>
@@ -105,28 +104,44 @@ java.lang.Object
<P>
Performs an in-memory group operation on a bag. The first argument is the bag.
- The second argument is a projection of that bag to the group keys.
+ The second argument is a projection of that bag to the keys to group by.
<p>
- Example:
- <code>
- define BagGroup datafu.pig.bags.BagGroup();
-
+ The following example groups input_bag by k. The output is a bag having tuples
+ consisting of the group key k and a bag with the corresponding (k,v) tuples from input_bag
+ for that key.
+ <pre>
+ <code>define BagGroup datafu.pig.bags.BagGroup();
+
data = LOAD 'input' AS (input_bag: bag {T: tuple(k: int, v: chararray)});
-- ({(1,A),(1,B),(2,A),(2,B),(2,C),(3,A)})
-
+
+ -- Group input_bag by k
data2 = FOREACH data GENERATE BagGroup(input_bag, input_bag.(k)) as grouped;
-- data2: {grouped: {(group: int,input_bag: {T: (k: int,v: chararray)})}}
-- ({(1,{(1,A),(1,B)}),(2,{(2,A),(2,B),(2,C)}),(3,{(3,A)})})
</code>
+ </pre>
+ </p>
+
+ <p>
+ If the key k is not needed within the input_bag for the output, it can be projected
+ out like so:
+ <pre>
+ <code>data3 = FOREACH data2 {
+ -- project only the value
+ projected = FOREACH grouped GENERATE group, input_bag.(v);
+ GENERATE projected as grouped;
+ }
+
+ -- data3: {grouped: {(group: int,input_bag: {T: (v: chararray)})}}
+ -- ({(1,{(A),(B)}),(2,{(A),(B),(C)}),(3,{(A)})})
+ </code>
+ </pre>
</p>
<P>
<P>
-<DL>
-<DT><B>Author:</B></DT>
- <DD>wvaughan</DD>
-</DL>
<HR>
<P>
@@ -305,7 +320,6 @@ public org.apache.pig.data.DataBag <B>exec</B>(org.apache.pig.data.Tuple in
<TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../overview-summary.html"><FONT CLASS="NavBarFont1"><B>Overview</B></FONT></A> </TD>
<TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="package-summary.html"><FONT CLASS="NavBarFont1"><B>Package</B></FONT></A> </TD>
<TD BGCOLOR="#FFFFFF" CLASS="NavBarCell1Rev"> <FONT CLASS="NavBarFont1Rev"><B>Class</B></FONT> </TD>
- <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="class-use/BagGroup.html"><FONT CLASS="NavBarFont1"><B>Use</B></FONT></A> </TD>
<TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="package-tree.html"><FONT CLASS="NavBarFont1"><B>Tree</B></FONT></A> </TD>
<TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../deprecated-list.html"><FONT CLASS="NavBarFont1"><B>Deprecated</B></FONT></A> </TD>
<TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../index-all.html"><FONT CLASS="NavBarFont1"><B>Index</B></FONT></A> </TD>
@@ -350,6 +364,6 @@ DETAIL: FIELD | <A HREF="#constructor_detail">CONSTR</A> |&n
<!-- ======== END OF BOTTOM NAVBAR ======= -->
<HR>
-Matthew Hayes, Sam Shah
+
</BODY>
</HTML>
http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/75162f64/site/source/docs/datafu/guide/bag-operations.html.markdown.erb
----------------------------------------------------------------------
diff --git a/site/source/docs/datafu/guide/bag-operations.html.markdown.erb b/site/source/docs/datafu/guide/bag-operations.html.markdown.erb
index 70d73c6..799f438 100644
--- a/site/source/docs/datafu/guide/bag-operations.html.markdown.erb
+++ b/site/source/docs/datafu/guide/bag-operations.html.markdown.erb
@@ -71,6 +71,47 @@ input = LOAD 'input' AS (A: bag{T: tuple(bag{T2: tuple(v:INT)})});
output = FOREACH input GENERATE BagConcat(A);
```
+### Grouping Within a Bag
+
+Pig has a `GROUP` operation that can be applied to a relation. It produces a new relation where the input
+tuples are grouped by a particular key. A bag in the relation contains the grouped tuples for that key. The key
+is exposed as a field named `group`.
+
+[BagGroup](/docs/datafu/<%= current_page.data.version %>/datafu/pig/bags/BagGroup.html) mimics the `GROUP`
+operation from Pig. The difference between them is that `BagGroup` operates within a bag, rather than on a relation.
+This can be useful when operating on bags that are not very large in size. We can operate on the tuples within
+this bag without involving `GROUP`, which would result in another MapReduce job. With `BagGroup` we can avoid this.
+
+In the following example we take an `input_bag` consisting of key-value pairs `(k,v)` and group the tuples by `k`.
+This produces a new bag having tuples consisting of `group` and `input_bag`. The `group` corresponds to the grouping
+key `k`. The `input_bag` is a bag containing the tuples from the original `input_bag` that have the same `k` as `group`.
+
+```pig
+ define BagGroup datafu.pig.bags.BagGroup();
+
+ data = LOAD 'input' AS (input_bag: bag {T: tuple(k: int, v: chararray)});
+ -- ({(1,A),(1,B),(2,A),(2,B),(2,C),(3,A)})
+
+ -- Group input_bag by k
+ data2 = FOREACH data GENERATE BagGroup(input_bag, input_bag.(k)) as grouped;
+ -- data2: {grouped: {(group: int,input_bag: {T: (k: int,v: chararray)})}}
+ -- ({(1,{(1,A),(1,B)}),(2,{(2,A),(2,B),(2,C)}),(3,{(3,A)})})
+```
+
+We could also project out the key from the final `input_bag` using a nested `FOREACH` so that the bag only
+consists of the value `v`:
+
+```pig
+data3 = FOREACH data2 {
+ -- project only the value
+ projected = FOREACH grouped GENERATE group, input_bag.(v);
+ GENERATE projected as grouped;
+}
+
+-- data3: {grouped: {(group: int,input_bag: {T: (v: chararray)})}}
+-- ({(1,{(A),(B)}),(2,{(A),(B),(C)}),(3,{(A)})})
+```
+
### Append to Bag
[AppendToBag](/docs/datafu/<%= current_page.data.version %>/datafu/pig/bags/AppendToBag.html) can be