You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datafu.apache.org by ja...@apache.org on 2014/05/16 21:53:21 UTC

git commit: DATAFU-42 Simplify BagGroup output

Repository: incubator-datafu
Updated Branches:
  refs/heads/master 544e5af08 -> 75162f64d


DATAFU-42 Simplify BagGroup output

(Matthew Hayes via Jarek Jarcec Cecho)


Project: http://git-wip-us.apache.org/repos/asf/incubator-datafu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-datafu/commit/75162f64
Tree: http://git-wip-us.apache.org/repos/asf/incubator-datafu/tree/75162f64
Diff: http://git-wip-us.apache.org/repos/asf/incubator-datafu/diff/75162f64

Branch: refs/heads/master
Commit: 75162f64ddb60cc879b1ed67fd03bada5b9b5ae4
Parents: 544e5af
Author: Jarek Jarcec Cecho <ja...@apache.org>
Authored: Fri May 16 12:51:58 2014 -0700
Committer: Jarek Jarcec Cecho <ja...@apache.org>
Committed: Fri May 16 12:51:58 2014 -0700

----------------------------------------------------------------------
 .../src/main/java/datafu/pig/bags/BagGroup.java |  73 +-
 .../java/datafu/test/pig/bags/BagTests.java     | 666 ++++++++++---------
 .../datafu/1.2.0/datafu/pig/bags/BagGroup.html  |  44 +-
 .../guide/bag-operations.html.markdown.erb      |  41 ++
 4 files changed, 470 insertions(+), 354 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/75162f64/datafu-pig/src/main/java/datafu/pig/bags/BagGroup.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/main/java/datafu/pig/bags/BagGroup.java b/datafu-pig/src/main/java/datafu/pig/bags/BagGroup.java
index 2359d6f..409709c 100644
--- a/datafu-pig/src/main/java/datafu/pig/bags/BagGroup.java
+++ b/datafu-pig/src/main/java/datafu/pig/bags/BagGroup.java
@@ -40,22 +40,45 @@ import datafu.pig.util.AliasableEvalFunc;
 
 /**
  * Performs an in-memory group operation on a bag.  The first argument is the bag.
- * The second argument is a projection of that bag to the group keys.
+ * The second argument is a projection of that bag to the keys to group by.
  *
  * <p>
- * Example:
- * <code>
+ * The following example groups input_bag by k.  The output is a bag having tuples
+ * consisting of the group key k and a bag with the corresponding (k,v) tuples from input_bag
+ * for that key.
+ * <pre>
+ * {@code
  * define BagGroup datafu.pig.bags.BagGroup();
- * 
+ *
  * data = LOAD 'input' AS (input_bag: bag {T: tuple(k: int, v: chararray)});
  * -- ({(1,A),(1,B),(2,A),(2,B),(2,C),(3,A)})
- * 
+ *
+ * -- Group input_bag by k
  * data2 = FOREACH data GENERATE BagGroup(input_bag, input_bag.(k)) as grouped;
  * -- data2: {grouped: {(group: int,input_bag: {T: (k: int,v: chararray)})}}
  * -- ({(1,{(1,A),(1,B)}),(2,{(2,A),(2,B),(2,C)}),(3,{(3,A)})})
- * </code>
+ * }
+ * </pre>
+ * </p>
+ *
+ * <p>
+ * If the key k is not needed within the input_bag for the output, it can be projected
+ * out like so:
+ * <pre>
+ * {@code
+ *
+ * data3 = FOREACH data2 {
+ *   -- project only the value
+ *   projected = FOREACH grouped GENERATE group, input_bag.(v);
+ *   GENERATE projected as grouped;
+ * }
+ *
+ * -- data3: {grouped: {(group: int,input_bag: {(v: chararray)})}}
+ * -- ({(1,{(A),(B)}),(2,{(A),(B),(C)}),(3,{(A)})})
+ * }
+ * </pre>
  * </p>
- * 
+ *
  * @author wvaughan
  *
  */
@@ -63,14 +86,14 @@ public class BagGroup extends AliasableEvalFunc<DataBag>
 {
   private final String FIELD_NAMES_PROPERTY = "FIELD_NAMES";
   private List<String> fieldNames;
-  
+
   @Override
   public Schema getOutputSchema(Schema input)
   {
     try {
       if (input.size() != 2) {
         throw new RuntimeException(String.format("Expected input of format (BAG, PROJECTED_BAG...). Got %d field.", input.size()));
-      }      
+      }
       // Expect the first field to be a bag
       FieldSchema bagFieldSchema = input.getField(0);
       if (bagFieldSchema.type != DataType.BAG) {
@@ -81,7 +104,7 @@ public class BagGroup extends AliasableEvalFunc<DataBag>
       if (projectedBagFieldSchema.type != DataType.BAG) {
         throw new RuntimeException(String.format("Expected input of format (BAG, PROJECTED_BAG...). Got %s as second field.", DataType.findTypeName(projectedBagFieldSchema.type)));
       }
-      
+
       String bagName = bagFieldSchema.alias;
       // handle named tuples
       if (bagFieldSchema.schema.size() == 1) {
@@ -89,14 +112,14 @@ public class BagGroup extends AliasableEvalFunc<DataBag>
         if (bagTupleFieldSchema.type == DataType.TUPLE && bagTupleFieldSchema.alias != null) {
           bagName = getPrefixedAliasName(bagName, bagTupleFieldSchema.alias);
         }
-      }      
+      }
       if (projectedBagFieldSchema.schema.size() == 1) {
         FieldSchema projectedBagTupleFieldSchema = projectedBagFieldSchema.schema.getField(0);
         if (projectedBagTupleFieldSchema.type == DataType.TUPLE && projectedBagTupleFieldSchema.schema != null) {
           projectedBagFieldSchema = projectedBagTupleFieldSchema;
         }
       }
-      
+
       // create the output schema for the 'group'
       // store the field names for the group keys
       Schema groupTupleSchema = new Schema();
@@ -108,7 +131,7 @@ public class BagGroup extends AliasableEvalFunc<DataBag>
         groupTupleSchema.add(new FieldSchema(fieldSchema.alias, fieldSchema.type));
       }
       getInstanceProperties().put(FIELD_NAMES_PROPERTY, fieldNames);
-      
+
       Schema outputTupleSchema = new Schema();
       if (projectedBagFieldSchema.schema.size() > 1) {
         // multiple group keys
@@ -117,17 +140,17 @@ public class BagGroup extends AliasableEvalFunc<DataBag>
         // single group key
         outputTupleSchema.add(new FieldSchema("group", groupTupleSchema.getField(0).type));
       }
-      outputTupleSchema.add(bagFieldSchema);      
-      
+      outputTupleSchema.add(bagFieldSchema);
+
       return new Schema(new Schema.FieldSchema(
             getSchemaName(this.getClass().getName().toLowerCase(), input),
-            outputTupleSchema, 
+            outputTupleSchema,
             DataType.BAG));
     } catch (FrontendException e) {
       throw new RuntimeException(e);
     }
   }
-  
+
   TupleFactory tupleFactory = TupleFactory.getInstance();
   BagFactory bagFactory = BagFactory.getInstance();
 
@@ -137,20 +160,20 @@ public class BagGroup extends AliasableEvalFunc<DataBag>
   {
     Map<Tuple, List<Tuple>> groups = new HashMap<Tuple, List<Tuple>>();
     fieldNames = (List<String>)getInstanceProperties().get(FIELD_NAMES_PROPERTY);
-    
-    DataBag inputBag = (DataBag)input.get(0);    
-    
+
+    DataBag inputBag = (DataBag)input.get(0);
+
     for (Tuple tuple : inputBag) {
       Tuple key = extractKey(tuple);
       addGroup(groups, key, tuple);
     }
-    
+
     DataBag outputBag = bagFactory.newDefaultBag();
     for (Tuple key : groups.keySet()) {
       Tuple outputTuple = tupleFactory.newTuple();
       if (fieldNames.size() > 1) {
         outputTuple.append(key);
-      } else {        
+      } else {
         outputTuple.append(key.get(0));
       }
       DataBag groupBag = bagFactory.newDefaultBag();
@@ -160,10 +183,10 @@ public class BagGroup extends AliasableEvalFunc<DataBag>
       outputTuple.append(groupBag);
       outputBag.add(outputTuple);
     }
-    
+
     return outputBag;
   }
-  
+
   private Tuple extractKey(Tuple tuple) throws ExecException {
     Tuple key = tupleFactory.newTuple();
     for (String field : fieldNames) {
@@ -171,7 +194,7 @@ public class BagGroup extends AliasableEvalFunc<DataBag>
     }
     return key;
   }
-  
+
   private void addGroup (Map<Tuple, List<Tuple>> groups, Tuple key, Tuple value) {
     if (!groups.containsKey(key)) {
       groups.put(key, new LinkedList<Tuple>());

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/75162f64/datafu-pig/src/test/java/datafu/test/pig/bags/BagTests.java
----------------------------------------------------------------------
diff --git a/datafu-pig/src/test/java/datafu/test/pig/bags/BagTests.java b/datafu-pig/src/test/java/datafu/test/pig/bags/BagTests.java
index a8e0d32..6d21dbe 100644
--- a/datafu-pig/src/test/java/datafu/test/pig/bags/BagTests.java
+++ b/datafu-pig/src/test/java/datafu/test/pig/bags/BagTests.java
@@ -44,155 +44,155 @@ import datafu.test.pig.PigTests;
 public class BagTests extends PigTests
 {
   /**
-  
+
 
   define NullToEmptyBag datafu.pig.bags.NullToEmptyBag();
-  
+
   data = LOAD 'input' AS (B: bag {T: tuple(v:INT)});
-  
+
   dump data;
-  
+
   data2 = FOREACH data GENERATE NullToEmptyBag(B) as P;
-  
+
   dump data2;
-  
+
   STORE data2 INTO 'output';
    */
   @Multiline
   private String nullToEmptyBag;
-  
+
   @Test
   public void nullToEmptyBagTest() throws Exception
   {
     PigTest test = createPigTestFromString(nullToEmptyBag);
-            
-    writeLinesToFile("input", 
+
+    writeLinesToFile("input",
                      "({(1),(2),(3),(4),(5)})",
                      "()",
                      "{(4),(5)})");
-            
+
     test.runScript();
-        
+
     assertOutput(test, "data2",
                  "({(1),(2),(3),(4),(5)})",
                  "({})",
                  "({(4),(5)})");
   }
-  
+
   /**
-  
+
 
   define EmptyBagToNull datafu.pig.bags.EmptyBagToNull();
-  
+
   data = LOAD 'input' AS (B: bag {T: tuple(v:INT)});
-  
+
   dump data;
-  
+
   data2 = FOREACH data GENERATE EmptyBagToNull(B) as P;
-  
+
   dump data2;
-  
+
   STORE data2 INTO 'output';
    */
   @Multiline
   private String emptyBagToNullTest;
-  
+
   @Test
   public void emptyBagToNullTest() throws Exception
   {
     PigTest test = createPigTestFromString(emptyBagToNullTest);
-            
-    writeLinesToFile("input", 
+
+    writeLinesToFile("input",
                      "({(1),(2),(3),(4),(5)})",
                      "()",
                      "({})",
                      "{(4),(5)})");
-            
+
     test.runScript();
-        
+
     assertOutput(test, "data2",
                  "({(1),(2),(3),(4),(5)})",
                  "()",
                  "()",
                  "({(4),(5)})");
   }
-  
+
   /**
-  
+
 
   define EmptyBagToNullFields datafu.pig.bags.EmptyBagToNullFields();
-  
+
   data = LOAD 'input' AS (B: bag {T: tuple(v1:INT,v2:INT)});
-  
+
   dump data;
-  
+
   data2 = FOREACH data GENERATE EmptyBagToNullFields(B) as P;
-  
+
   dump data2;
-  
+
   STORE data2 INTO 'output';
    */
   @Multiline
   private String emptyBagToNullFieldsTest;
-  
+
   @Test
   public void emptyBagToNullFieldsTest() throws Exception
   {
     PigTest test = createPigTestFromString(emptyBagToNullFieldsTest);
-            
-    writeLinesToFile("input", 
+
+    writeLinesToFile("input",
                      "({(1,1),(2,2),(3,3),(4,4),(5,5)})",
                      "({})",
                      "{(4,4),(5,5)})");
-            
+
     test.runScript();
-        
+
     assertOutput(test, "data2",
                  "({(1,1),(2,2),(3,3),(4,4),(5,5)})",
                  "({(,)})",
                  "({(4,4),(5,5)})");
   }
-  
+
   /**
-  
+
 
   define AppendToBag datafu.pig.bags.AppendToBag();
-  
+
   data = LOAD 'input' AS (key:INT, B: bag{T: tuple(v:INT)}, T: tuple(v:INT));
-  
+
   data2 = FOREACH data GENERATE key, AppendToBag(B,T) as B;
-  
+
   STORE data2 INTO 'output';
 
    */
   @Multiline
   private String appendToBagTest;
-  
+
   @Test
   public void appendToBagTest() throws Exception
   {
     PigTest test = createPigTestFromString(appendToBagTest);
-    
-    writeLinesToFile("input", 
+
+    writeLinesToFile("input",
                      "1\t{(1),(2),(3)}\t(4)",
                      "2\t{(10),(20),(30),(40),(50)}\t(60)");
-                  
+
     test.runScript();
-            
+
     assertOutput(test, "data2",
                  "(1,{(1),(2),(3),(4)})",
                  "(2,{(10),(20),(30),(40),(50),(60)})");
   }
-  
+
   /**
-  
+
 
   define FirstTupleFromBag datafu.pig.bags.FirstTupleFromBag();
-  
+
   data = LOAD 'input' AS (key:INT, B: bag{T: tuple(v:INT)});
-  
+
   data2 = FOREACH data GENERATE key, FirstTupleFromBag(B, null) as B;
-  
+
   STORE data2 INTO 'output';
 
    */
@@ -212,101 +212,101 @@ public class BagTests extends PigTests
   }
 
   /**
-  
+
 
   define PrependToBag datafu.pig.bags.PrependToBag();
-  
+
   data = LOAD 'input' AS (key:INT, B: bag{T: tuple(v:INT)}, T: tuple(v:INT));
-  
+
   data2 = FOREACH data GENERATE key, PrependToBag(B,T) as B;
-  
+
   STORE data2 INTO 'output';
 
    */
   @Multiline
   private String prependToBagTest;
-  
+
   @Test
   public void prependToBagTest() throws Exception
   {
     PigTest test = createPigTestFromString(prependToBagTest);
-    
-    writeLinesToFile("input", 
+
+    writeLinesToFile("input",
                      "1\t{(1),(2),(3)}\t(4)",
                      "2\t{(10),(20),(30),(40),(50)}\t(60)");
-                  
+
     test.runScript();
-            
+
     assertOutput(test, "data2",
                  "(1,{(4),(1),(2),(3)})",
                  "(2,{(60),(10),(20),(30),(40),(50)})");
   }
-  
+
   /**
-  
+
 
   define BagConcat datafu.pig.bags.BagConcat();
-  
+
   data = LOAD 'input' AS (A: bag{T: tuple(v:INT)}, B: bag{T: tuple(v:INT)}, C: bag{T: tuple(v:INT)});
-  
+
   describe data;
-  
+
   data2 = FOREACH data GENERATE BagConcat(A,B,C);
-  
+
   describe data2;
-  
+
   STORE data2 INTO 'output';
    */
   @Multiline
   private String bagConcatTest;
-  
+
   @Test
   public void bagConcatTest() throws Exception
   {
     PigTest test = createPigTestFromString(bagConcatTest);
 
-    writeLinesToFile("input", 
+    writeLinesToFile("input",
                      "({(1),(2),(3)}\t{(3),(5),(6)}\t{(10),(13)})",
                      "({(2),(3),(4)}\t{(5),(5)}\t{(20)})");
-                  
+
     test.runScript();
-            
+
     assertOutput(test, "data2",
                  "({(1),(2),(3),(3),(5),(6),(10),(13)})",
                  "({(2),(3),(4),(5),(5),(20)})");
   }
-  
+
   /**
-  
+
 
   define UnorderedPairs datafu.pig.bags.UnorderedPairs();
-  
+
   data = LOAD 'input' AS (B: bag {T: tuple(v:INT)});
-  
+
   data2 = FOREACH data GENERATE UnorderedPairs(B) as P;
-  
+
   data3 = FOREACH data2 GENERATE FLATTEN(P);
-  
+
   data4 = FOREACH data3 GENERATE FLATTEN(elem1), FLATTEN(elem2);
-  
+
   data5 = ORDER data4 BY $0, $1;
-  
+
   STORE data5 INTO 'output';
 
 
    */
   @Multiline
   private String unorderedPairsTest;
-  
+
   @Test
   public void unorderedPairsTest() throws Exception
   {
     PigTest test = createPigTestFromString(unorderedPairsTest);
-    
+
     String[] input = {
       "{(1),(2),(3),(4),(5)}"
     };
-    
+
     String[] output = {
         "(1,2)",
         "(1,3)",
@@ -319,37 +319,37 @@ public class BagTests extends PigTests
         "(3,5)",
         "(4,5)"
       };
-    
+
     test.assertOutput("data",input,"data4",output);
   }
-  
+
   /**
-  
+
 
   define UnorderedPairs datafu.pig.bags.UnorderedPairs();
-  
+
   data = LOAD 'input' AS (A:int, B: bag {T: tuple(v:INT)});
-  
+
   data2 = FOREACH data GENERATE A, UnorderedPairs(B) as P;
-  
+
   data3 = FOREACH data2 GENERATE A, FLATTEN(P);
-  
+
   STORE data3 INTO 'output';
 
    */
   @Multiline
   private String unorderedPairsTest2;
-  
+
   @Test
   public void unorderedPairsTest2() throws Exception
   {
     PigTest test = createPigTestFromString(unorderedPairsTest2);
-        
+
     this.writeLinesToFile("input", "1\t{(1),(2),(3),(4),(5)}");
-    
+
     test.runScript();
     this.getLinesForAlias(test, "data3");
-    
+
     this.assertOutput(test, "data3",
                       "(1,(1),(2))",
                       "(1,(1),(3))",
@@ -360,110 +360,110 @@ public class BagTests extends PigTests
                       "(1,(2),(5))",
                       "(1,(3),(4))",
                       "(1,(3),(5))",
-                      "(1,(4),(5))");    
+                      "(1,(4),(5))");
   }
- 
+
   /**
-  
+
 
   define BagSplit datafu.pig.bags.BagSplit();
-  
+
   data = LOAD 'input' AS (B:bag{T:tuple(val1:INT,val2:INT)});
-  
+
   data2 = FOREACH data GENERATE BagSplit($MAX,B);
   --describe data2;
-  
+
   data3 = FOREACH data2 GENERATE FLATTEN($0);
-  
+
   --describe data3
-  
+
   STORE data3 INTO 'output';
 
    */
   @Multiline
   private String bagSplitTest;
-  
+
   @Test
   public void bagSplitTest() throws Exception
   {
     PigTest test = createPigTestFromString(bagSplitTest,
                                  "MAX=5");
-    
-    writeLinesToFile("input", 
+
+    writeLinesToFile("input",
                      "{(1,11),(2,22),(3,33),(4,44),(5,55),(6,66),(7,77),(8,88),(9,99),(10,1010),(11,1111),(12,1212)}");
-    
+
     test.runScript();
-    
+
     assertOutput(test, "data3",
                  "({(1,11),(2,22),(3,33),(4,44),(5,55)})",
                  "({(6,66),(7,77),(8,88),(9,99),(10,1010)})",
                  "({(11,1111),(12,1212)})");
   }
-  
+
   /**
-  
+
 
   define BagSplit datafu.pig.bags.BagSplit('true');
-  
+
   data = LOAD 'input' AS (B:bag{T:tuple(val1:INT,val2:INT)});
-  
+
   data2 = FOREACH data GENERATE BagSplit($MAX,B);
-  
+
   data3 = FOREACH data2 GENERATE FLATTEN($0);
-  
+
   STORE data3 INTO 'output';
    */
   @Multiline
   private String bagSplitWithBagNumTest;
-  
+
   @Test
   public void bagSplitWithBagNumTest() throws Exception
   {
     PigTest test = createPigTestFromString(bagSplitWithBagNumTest,
                                  "MAX=10");
-    
-    writeLinesToFile("input", 
+
+    writeLinesToFile("input",
                      "{(1,11),(2,22),(3,33),(4,44),(5,55),(6,66),(7,77),(8,88),(9,99),(10,1010),(11,1111),(12,1212)}");
-    
+
     test.runScript();
-    
+
     assertOutput(test, "data3",
                  "({(1,11),(2,22),(3,33),(4,44),(5,55),(6,66),(7,77),(8,88),(9,99),(10,1010)},0)",
                  "({(11,1111),(12,1212)},1)");
   }
-  
+
   /**
-  
+
 
   define Enumerate datafu.pig.bags.ReverseEnumerate('1');
-  
+
   data = LOAD 'input' AS (data: bag {T: tuple(v1:INT,B: bag{T: tuple(v2:INT)})});
-  
+
   data2 = FOREACH data GENERATE Enumerate(data);
   --describe data2;
-  
+
   data3 = FOREACH data2 GENERATE FLATTEN($0);
   --describe data3;
-  
+
   data4 = FOREACH data3 GENERATE $0 as v1, $1 as B, $2 as i;
   --describe data4;
-  
+
   STORE data4 INTO 'output';
 
    */
   @Multiline
   private String enumerateWithReverseTest;
-  
+
   @Test
   public void enumerateWithReverseTest() throws Exception
   {
     PigTest test = createPigTestFromString(enumerateWithReverseTest);
-       
-    writeLinesToFile("input", 
+
+    writeLinesToFile("input",
                      "({(10,{(1),(2),(3)}),(20,{(4),(5),(6)}),(30,{(7),(8)}),(40,{(9),(10),(11)}),(50,{(12),(13),(14),(15)})})");
-    
+
     test.runScript();
-    
+
     assertOutput(test, "data4",
                  "(10,{(1),(2),(3)},5)",
                  "(20,{(4),(5),(6)},4)",
@@ -471,39 +471,39 @@ public class BagTests extends PigTests
                  "(40,{(9),(10),(11)},2)",
                  "(50,{(12),(13),(14),(15)},1)");
   }
-  
+
   /**
-  
+
 
   define Enumerate datafu.pig.bags.Enumerate('1');
-  
+
   data = LOAD 'input' AS (data: bag {T: tuple(v1:INT,B: bag{T: tuple(v2:INT)})});
-  
+
   data2 = FOREACH data GENERATE Enumerate(data);
   --describe data2;
-  
+
   data3 = FOREACH data2 GENERATE FLATTEN($0);
   --describe data3;
-  
+
   data4 = FOREACH data3 GENERATE $0 as v1, $1 as B, $2 as i;
   --describe data4;
-  
+
   STORE data4 INTO 'output';
 
    */
   @Multiline
   private String enumerateWithStartTest;
-  
+
   @Test
   public void enumerateWithStartTest() throws Exception
   {
     PigTest test = createPigTestFromString(enumerateWithStartTest);
-       
-    writeLinesToFile("input", 
+
+    writeLinesToFile("input",
                      "({(10,{(1),(2),(3)}),(20,{(4),(5),(6)}),(30,{(7),(8)}),(40,{(9),(10),(11)}),(50,{(12),(13),(14),(15)})})");
-    
+
     test.runScript();
-    
+
     assertOutput(test, "data4",
                  "(10,{(1),(2),(3)},1)",
                  "(20,{(4),(5),(6)},2)",
@@ -511,39 +511,39 @@ public class BagTests extends PigTests
                  "(40,{(9),(10),(11)},4)",
                  "(50,{(12),(13),(14),(15)},5)");
   }
-  
+
   /**
-  
+
 
   define Enumerate datafu.pig.bags.Enumerate();
-  
+
   data = LOAD 'input' AS (data: bag {T: tuple(v1:INT,B: bag{T: tuple(v2:INT)})});
-  
+
   data2 = FOREACH data GENERATE Enumerate(data);
   --describe data2;
-  
+
   data3 = FOREACH data2 GENERATE FLATTEN($0);
   --describe data3;
-  
+
   data4 = FOREACH data3 GENERATE $0 as v1, $1 as B, $2 as i;
   --describe data4;
-  
+
   STORE data4 INTO 'output';
 
    */
   @Multiline
   private String enumerateTest;
-  
+
   @Test
   public void enumerateTest() throws Exception
   {
     PigTest test = createPigTestFromString(enumerateTest);
-       
+
     writeLinesToFile("input",
                      "({(10,{(1),(2),(3)}),(20,{(4),(5),(6)}),(30,{(7),(8)}),(40,{(9),(10),(11)}),(50,{(12),(13),(14),(15)})})");
-    
+
     test.runScript();
-    
+
     assertOutput(test, "data4",
                  "(10,{(1),(2),(3)},0)",
                  "(20,{(4),(5),(6)},1)",
@@ -551,18 +551,18 @@ public class BagTests extends PigTests
                  "(40,{(9),(10),(11)},3)",
                  "(50,{(12),(13),(14),(15)},4)");
   }
-  
+
   @Test
   public void enumerateTest2() throws Exception
   {
     PigTest test = createPigTestFromString(enumerateTest);
-      
+
     writeLinesToFile("input",
                      "({(10,{(1),(2),(3)}),(20,{(4),(5),(6)}),(30,{(7),(8)}),(40,{(9),(10),(11)}),(50,{(12),(13),(14),(15)})})",
                      "({(11,{(11),(12),(13),(14)}),(21,{(15),(16),(17),(18)}),(31,{(19),(20)}),(41,{(21),(22),(23),(24)}),(51,{(25),(26),(27)})})");
-   
+
     test.runScript();
-   
+
     assertOutput(test, "data4",
                  "(10,{(1),(2),(3)},0)",
                  "(20,{(4),(5),(6)},1)",
@@ -574,46 +574,46 @@ public class BagTests extends PigTests
                  "(31,{(19),(20)},2)",
                  "(41,{(21),(22),(23),(24)},3)",
                  "(51,{(25),(26),(27)},4)");
-  }  
-  
-  /* 
+  }
+
+  /*
    * Testing "Accumulator" part of Enumeration by manually invoking accumulate(), getValue() and cleanup()
    */
   @Test
   public void enumerateAccumulatorTest() throws Exception
   {
-    Enumerate enumerate = new Enumerate(); 
-    
+    Enumerate enumerate = new Enumerate();
+
     Tuple tuple1 = TupleFactory.getInstance().newTuple(1);
     tuple1.set(0, 10);
-    
+
     Tuple tuple2 = TupleFactory.getInstance().newTuple(1);
     tuple2.set(0, 20);
-    
+
     Tuple tuple3 = TupleFactory.getInstance().newTuple(1);
     tuple3.set(0, 30);
-    
+
     Tuple tuple4 = TupleFactory.getInstance().newTuple(1);
     tuple4.set(0, 40);
-    
+
     Tuple tuple5 = TupleFactory.getInstance().newTuple(1);
     tuple5.set(0, 50);
-    
+
     DataBag bag1 = BagFactory.getInstance().newDefaultBag();
     bag1.add(tuple1);
     bag1.add(tuple2);
     bag1.add(tuple3);
-    
+
     DataBag bag2 = BagFactory.getInstance().newDefaultBag();
     bag2.add(tuple4);
     bag2.add(tuple5);
-    
+
     Tuple inputTuple1 = TupleFactory.getInstance().newTuple(1);
     inputTuple1.set(0,bag1);
-    
+
     Tuple inputTuple2 = TupleFactory.getInstance().newTuple(1);
     inputTuple2.set(0,bag2);
-    
+
     enumerate.accumulate(inputTuple1);
     enumerate.accumulate(inputTuple2);
     assertEquals(enumerate.getValue().toString(), "{(10,0),(20,1),(30,2),(40,3),(50,4)}");
@@ -622,50 +622,50 @@ public class BagTests extends PigTests
     enumerate.cleanup();
     enumerate.accumulate(inputTuple1);
     enumerate.accumulate(inputTuple2);
-    assertEquals(enumerate.getValue().toString(), "{(10,0),(20,1),(30,2),(40,3),(50,4)}");     
+    assertEquals(enumerate.getValue().toString(), "{(10,0),(20,1),(30,2),(40,3),(50,4)}");
   }
-  
+
   /**
-  
+
 
   define BagSplit datafu.pig.bags.BagSplit();
   define Enumerate datafu.pig.bags.Enumerate('1');
-  
+
   data = LOAD 'input' AS (data: bag {T: tuple(name:CHARARRAY, score:double)});
-  
+
   data2 = FOREACH data GENERATE BagSplit(3,data) as the_bags;
-  
+
   --describe data2
-  
+
   data3 = FOREACH data2 GENERATE Enumerate(the_bags) as enumerated_bags;
-  
+
   --describe data3
-  
+
   data4 = FOREACH data3 GENERATE FLATTEN(enumerated_bags) as (data,i);
-  
+
   --describe data4
-  
+
   data5 = FOREACH data4 GENERATE data as the_data, i as the_key;
-  
+
   --describe data5
-  
+
   data_out = FOREACH data5 GENERATE FLATTEN(the_data), the_key;
-  
+
   --describe data_out
    */
   @Multiline
   private String comprehensiveBagSplitAndEnumerate;
-  
+
   @Test
   public void comprehensiveBagSplitAndEnumerate() throws Exception
   {
     PigTest test = createPigTestFromString(comprehensiveBagSplitAndEnumerate);
-    
+
     writeLinesToFile("input",
                      "({(A,1.0),(B,2.0),(C,3.0),(D,4.0),(E,5.0)})");
-    
+
     test.runScript();
-    
+
     assertOutput(test, "data_out",
                  // bag #1
                  "(A,1.0,1)",
@@ -675,18 +675,18 @@ public class BagTests extends PigTests
                  "(D,4.0,2)",
                  "(E,5.0,2)");
   }
-  
+
   /**
-  
+
 
   define DistinctBy datafu.pig.bags.DistinctBy('0');
-  
+
   data = LOAD 'input' AS (data: bag {T: tuple(a:CHARARRAY, b:INT, c:INT)});
-  
+
   data2 = FOREACH data GENERATE DistinctBy(data);
-  
+
   --describe data2;
-  
+
   STORE data2 INTO 'output';
 
    */
@@ -697,18 +697,18 @@ public class BagTests extends PigTests
   public void distinctByTest() throws Exception
   {
     PigTest test = createPigTestFromString(distinctByTest);
-    
+
     writeLinesToFile("input",
                      "({(Z,1,0),(A,1,0),(A,1,0),(B,2,0),(B,22,1),(C,3,0),(D,4,0),(E,5,0)})",
                      "({(A,10,2),(M,50,3),(A,34,49), (A,24,42), (Z,49,22),(B,1,1)},(B,2,2))");
-    
+
     test.runScript();
-    
+
     assertOutput(test, "data2",
                  "({(Z,1,0),(A,1,0),(B,2,0),(C,3,0),(D,4,0),(E,5,0)})",
                  "({(A,10,2),(M,50,3),(Z,49,22),(B,1,1)})");
   }
- 
+
   /**
 
   define DistinctBy datafu.pig.bags.DistinctBy('1', '2');
@@ -732,9 +732,9 @@ public class BagTests extends PigTests
 
     writeLinesToFile("input",
                      "({(a-b,[a#0,b#1],{(a-b,0),(a-b,1)}),(a-c,[b#1,a#0],{(a-b,0),(a-b,1)}),(a-d,[a#1,b#0],{(a-b,1),(a-b,2)})})");
-    
+
     test.runScript();
- 
+
     assertOutput(test, "data2",
                  "({(a-b,[b#1,a#0],{(a-b,0),(a-b,1)}),(a-d,[b#0,a#1],{(a-b,1),(a-b,2)})})");
   }
@@ -759,104 +759,104 @@ public class BagTests extends PigTests
   public void distinctByDelimTest() throws Exception
   {
     PigTest test = createPigTestFromString(distinctByDelimTest);
-    
+
     writeLinesToFile("input",
                      "({(a-b,c),(a-b,d)})");
-    
+
     test.runScript();
-    
+
     assertOutput(test, "data2",
                  "({(a-b,c),(a-b,d)})");
   }
-  
+
   @Test
   public void distinctByExecTest() throws Exception
   {
     DistinctBy distinct = new DistinctBy("0");
-    
+
     DataBag bag;
     Tuple input;
     Tuple data;
-   
+
     bag = BagFactory.getInstance().newDefaultBag();
     input = TupleFactory.getInstance().newTuple(bag);
-    
+
     data = TupleFactory.getInstance().newTuple(2);
     bag.add(data);
     data.set(0, 10);
     data.set(1, 20);
-    
+
     data = TupleFactory.getInstance().newTuple(2);
     bag.add(data);
     data.set(0, 11);
     data.set(1, 50);
-    
+
     data = TupleFactory.getInstance().newTuple(2);
     bag.add(data);
     data.set(0, 10);
     data.set(1, 22);
-    
+
     data = TupleFactory.getInstance().newTuple(2);
     bag.add(data);
     data.set(0, 12);
     data.set(1, 40);
-    
+
     data = TupleFactory.getInstance().newTuple(2);
     bag.add(data);
     data.set(0, 11);
     data.set(1, 50);
-    
+
     data = TupleFactory.getInstance().newTuple(2);
     bag.add(data);
     data.set(0, 11);
     data.set(1, 51);
-        
+
     DataBag result = distinct.exec(input);
-    
+
     Assert.assertEquals(3, result.size());
-    
+
     Iterator<Tuple> iter = result.iterator();
     Assert.assertEquals("(10,20)", iter.next().toString());
     Assert.assertEquals("(11,50)", iter.next().toString());
     Assert.assertEquals("(12,40)", iter.next().toString());
-    
+
     // do it again to test cleanup
     bag = BagFactory.getInstance().newDefaultBag();
     input = TupleFactory.getInstance().newTuple(bag);
-    
+
     data = TupleFactory.getInstance().newTuple(2);
     bag.add(data);
     data.set(0, 12);
     data.set(1, 42);
-    
+
     data = TupleFactory.getInstance().newTuple(2);
     bag.add(data);
     data.set(0, 11);
     data.set(1, 51);
-    
+
     data = TupleFactory.getInstance().newTuple(2);
     bag.add(data);
     data.set(0, 11);
     data.set(1, 50);
-    
+
     result = distinct.exec(input);
-    
+
     Assert.assertEquals(2, result.size());
-    
+
     iter = result.iterator();
     Assert.assertEquals("(12,42)", iter.next().toString());
     Assert.assertEquals("(11,51)", iter.next().toString());
   }
-  
+
   @Test
   public void distinctByAccumulateTest() throws Exception
   {
     DistinctBy distinct = new DistinctBy("0");
-    
+
     DataBag bag;
     Tuple input;
     Tuple data;
-    
+
     data = TupleFactory.getInstance().newTuple(2);
     bag = BagFactory.getInstance().newDefaultBag();
     bag.add(data);
@@ -864,7 +864,7 @@ public class BagTests extends PigTests
     data.set(0, 10);
     data.set(1, 20);
     distinct.accumulate(input);
-    
+
     data = TupleFactory.getInstance().newTuple(2);
     bag = BagFactory.getInstance().newDefaultBag();
     bag.add(data);
@@ -872,7 +872,7 @@ public class BagTests extends PigTests
     data.set(0, 11);
     data.set(1, 50);
     distinct.accumulate(input);
-    
+
     data = TupleFactory.getInstance().newTuple(2);
     bag = BagFactory.getInstance().newDefaultBag();
     bag.add(data);
@@ -880,7 +880,7 @@ public class BagTests extends PigTests
     data.set(0, 10);
     data.set(1, 22);
     distinct.accumulate(input);
-    
+
     data = TupleFactory.getInstance().newTuple(2);
     bag = BagFactory.getInstance().newDefaultBag();
     bag.add(data);
@@ -888,7 +888,7 @@ public class BagTests extends PigTests
     data.set(0, 12);
     data.set(1, 40);
     distinct.accumulate(input);
-    
+
     data = TupleFactory.getInstance().newTuple(2);
     bag = BagFactory.getInstance().newDefaultBag();
     bag.add(data);
@@ -896,7 +896,7 @@ public class BagTests extends PigTests
     data.set(0, 11);
     data.set(1, 50);
     distinct.accumulate(input);
-    
+
     data = TupleFactory.getInstance().newTuple(2);
     bag = BagFactory.getInstance().newDefaultBag();
     bag.add(data);
@@ -904,19 +904,19 @@ public class BagTests extends PigTests
     data.set(0, 11);
     data.set(1, 51);
     distinct.accumulate(input);
-    
+
     DataBag result = distinct.getValue();
-    
+
     Assert.assertEquals(3, result.size());
-    
+
     Iterator<Tuple> iter = result.iterator();
     Assert.assertEquals("(10,20)", iter.next().toString());
     Assert.assertEquals("(11,50)", iter.next().toString());
     Assert.assertEquals("(12,40)", iter.next().toString());
-    
+
     // do it again to test cleanup
     distinct.cleanup();
-    
+
     data = TupleFactory.getInstance().newTuple(2);
     bag = BagFactory.getInstance().newDefaultBag();
     bag.add(data);
@@ -924,7 +924,7 @@ public class BagTests extends PigTests
     data.set(0, 12);
     data.set(1, 42);
     distinct.accumulate(input);
-    
+
     data = TupleFactory.getInstance().newTuple(2);
     bag = BagFactory.getInstance().newDefaultBag();
     bag.add(data);
@@ -932,7 +932,7 @@ public class BagTests extends PigTests
     data.set(0, 11);
     data.set(1, 51);
     distinct.accumulate(input);
-    
+
     data = TupleFactory.getInstance().newTuple(2);
     bag = BagFactory.getInstance().newDefaultBag();
     bag.add(data);
@@ -940,98 +940,98 @@ public class BagTests extends PigTests
     data.set(0, 11);
     data.set(1, 50);
     distinct.accumulate(input);
-    
+
     result = distinct.getValue();
-    
+
     Assert.assertEquals(2, result.size());
-    
+
     iter = result.iterator();
     Assert.assertEquals("(12,42)", iter.next().toString());
     Assert.assertEquals("(11,51)", iter.next().toString());
   }
-  
+
   /**
-  
+
 
   define CountEach datafu.pig.bags.CountEach();
-  
+
   data = LOAD 'input' AS (data: bag {T: tuple(v1:chararray)});
-  
+
   data2 = FOREACH data GENERATE CountEach(data) as counted;
   --describe data2;
-  
+
   data3 = FOREACH data2 {
     ordered = ORDER counted BY count DESC;
     GENERATE ordered;
   }
   --describe data3
-  
+
   STORE data3 INTO 'output';
 
    */
   @Multiline
   private String countEachTest;
- 
-  @Test 
+
+  @Test
   public void countEachTest() throws Exception
   {
     PigTest test = createPigTestFromString(countEachTest);
 
-    writeLinesToFile("input", 
+    writeLinesToFile("input",
                      "({(A),(B),(A),(C),(A),(B)})");
-                  
+
     test.runScript();
-            
+
     assertOutput(test, "data3",
         "({((A),3),((B),2),((C),1)})");
   }
-  
-  @Test 
+
+  @Test
   public void countEachExecAndAccumulateTest() throws Exception
-  {    
+  {
     for (int c=0; c<2; c++)
     {
       CountEach countEach = new CountEach("flatten");
-      
+
       DataBag bag = BagFactory.getInstance().newDefaultBag();
-      { 
+      {
         Tuple t = TupleFactory.getInstance().newTuple(1);
         t.set(0, "A");
         bag.add(t);
       }
-      { 
+      {
         Tuple t = TupleFactory.getInstance().newTuple(1);
         t.set(0, "B");
         bag.add(t);
       }
-      { 
+      {
         Tuple t = TupleFactory.getInstance().newTuple(1);
         t.set(0, "B");
         bag.add(t);
       }
-      { 
+      {
         Tuple t = TupleFactory.getInstance().newTuple(1);
         t.set(0, "C");
         bag.add(t);
       }
-      { 
+      {
         Tuple t = TupleFactory.getInstance().newTuple(1);
         t.set(0, "A");
         bag.add(t);
       }
-      { 
+      {
         Tuple t = TupleFactory.getInstance().newTuple(1);
         t.set(0, "D");
         bag.add(t);
       }
-      
+
       DataBag output = null;
-      
+
       if (c == 0)
       {
         Tuple input = TupleFactory.getInstance().newTuple(1);
         input.set(0, bag);
-        
+
         System.out.println("Testing exec");
         output = countEach.exec(input);
       }
@@ -1046,21 +1046,21 @@ public class BagTests extends PigTests
           input.set(0, tb);
           countEach.accumulate(input);
         }
-        
+
         output = countEach.getValue();
-        
-        countEach.cleanup();        
+
+        countEach.cleanup();
         Assert.assertEquals(0, countEach.getValue().size());
       }
-      
+
       System.out.println(output.toString());
-      
+
       Assert.assertEquals(4, output.size());
       Set<String> found = new HashSet<String>();
       for (Tuple t : output)
       {
-        String key = (String)t.get(0);    
-        found.add(key);  
+        String key = (String)t.get(0);
+        found.add(key);
         if (key == "A")
         {
           Assert.assertEquals(2, t.get(1));
@@ -1085,94 +1085,94 @@ public class BagTests extends PigTests
       Assert.assertEquals(4, found.size());
     }
   }
-  
+
   /**
-  
+
 
   define CountEach datafu.pig.bags.CountEach('flatten');
-  
+
   data = LOAD 'input' AS (data: bag {T: tuple(v1:chararray)});
-  
+
   data2 = FOREACH data GENERATE CountEach(data) as counted;
   --describe data2;
-  
+
   data3 = FOREACH data2 {
     ordered = ORDER counted BY count DESC;
     GENERATE ordered;
   }
   --describe data3
-  
+
   STORE data3 INTO 'output';
 
    */
   @Multiline
   private String countEachFlattenTest;
-  
-  @Test 
+
+  @Test
   public void countEachFlattenTest() throws Exception
   {
     PigTest test = createPigTestFromString(countEachFlattenTest);
 
-    writeLinesToFile("input", 
+    writeLinesToFile("input",
                      "({(A),(B),(A),(C),(A),(B)})");
-                  
+
     test.runScript();
-            
+
     assertOutput(test, "data3",
         "({(A,3),(B,2),(C,1)})");
   }
-  
+
   /**
-  
+
 
   define BagLeftOuterJoin datafu.pig.bags.BagLeftOuterJoin();
-  
+
   data = LOAD 'input' AS (outer_key:chararray, bag1:bag{T:tuple(k:chararray,v:chararray)}, bag2:bag{T:tuple(k:chararray,v:chararray)}, bag3:bag{T:tuple(k3:chararray,v3:chararray)});
   describe data;
-  
-  data2 = FOREACH data GENERATE 
-    outer_key, 
+
+  data2 = FOREACH data GENERATE
+    outer_key,
     BagLeftOuterJoin(bag1, 'k', bag2, 'k', bag3, 'k3') as joined1,
     BagLeftOuterJoin(bag1, 'k', bag3, 'k3', bag2, 'k') as joined2; --this will break without UDF signature and pig < 0.11
   describe data2;
-  
+
   STORE data2 INTO 'output';
 
    */
   @Multiline
   private String bagLeftOuterJoinTest;
-  
-  @Test 
+
+  @Test
   public void bagLeftOuterJoinTest() throws Exception
   {
     PigTest test = createPigTestFromString(bagLeftOuterJoinTest);
 
-    writeLinesToFile("input", 
+    writeLinesToFile("input",
                      "1\t{(K1,A1),(K2,B1),(K3,C1)}\t{(K1,A2),(K2,B2),(K2,B22)}\t{(K1,A3),(K3,C3),(K4,D3)}");
-                  
+
     test.runScript();
-    
+
     assertOutput(test, "data2",
         "(1,{(K1,A1,K1,A2,K1,A3),(K2,B1,K2,B2,,),(K2,B1,K2,B22,,),(K3,C1,,,K3,C3)},{(K1,A1,K1,A3,K1,A2),(K2,B1,,,K2,B2),(K2,B1,,,K2,B22),(K3,C1,K3,C3,,)})");
   }
-  
+
   /**
-  
+
 
   define BagUnion datafu.pig.bags.BagConcat();
-  
+
   data = LOAD 'input' AS (input_bag: bag {T: tuple(inner_bag: bag {T2: tuple(k: int, v: chararray)})});
   describe data;
-  
+
   data2 = FOREACH data GENERATE BagUnion(input_bag) as unioned;
   describe data2;
-  
+
   STORE data INTO 'output';
 
    */
   @Multiline
   private String bagUnionTest;
-  
+
   @Test
   public void bagUnionTest() throws Exception
   {
@@ -1181,67 +1181,105 @@ public class BagTests extends PigTests
     test.runScript();
     assertOutput(test, "data2", "({(1,A),(1,B),(2,A),(2,B),(2,C),(3,A)})");
   }
- 
+
   /**
-  
+
 
   define BagGroup datafu.pig.bags.BagGroup();
-  
+
   data = LOAD 'input' AS (input_bag: bag {T: tuple(k: chararray, v: chararray)});
   describe data;
-  
+
   data2 = FOREACH data GENERATE BagGroup(input_bag, input_bag.k) as grouped;
   describe data2;
-  
+
   data3 = FOREACH data2 {
     ordered = ORDER grouped BY group;
     GENERATE
       ordered as grouped;
   }
   describe data3;
-  
+
   STORE data INTO 'output';
 
    */
   @Multiline
   private String bagGroupSingleTest;
-  
+
   @Test
   public void bagGroupSingleTest() throws Exception
   {
     PigTest test = createPigTestFromString(bagGroupSingleTest);
     writeLinesToFile("input", "({(1,A),(1,B),(2,A),(2,B),(2,C),(3,A)})",
                      "({(A,1),(B,1),(A,2),(B,2),(C,2),(A,3)})");
-    test.runScript(); 
+    test.runScript();
     getLinesForAlias(test, "data2", true);
     assertOutput(test, "data3", "({(1,{(1,A),(1,B)}),(2,{(2,A),(2,B),(2,C)}),(3,{(3,A)})})",
                  "({(A,{(A,1),(A,2),(A,3)}),(B,{(B,1),(B,2)}),(C,{(C,2)})})");
   }
-  
+
   /**
-  
+
 
   define BagGroup datafu.pig.bags.BagGroup();
-  
+
+  data = LOAD 'input' AS (input_bag: bag {T: tuple(k: chararray, v: chararray)});
+  describe data;
+
+  data2 = FOREACH data GENERATE BagGroup(input_bag, input_bag.k) as grouped;
+  describe data2;
+
+  data3 = FOREACH data2 {
+    ordered = ORDER grouped BY group;
+    -- project only the value
+    projected = FOREACH ordered GENERATE group, input_bag.(v);
+    GENERATE
+      projected as grouped;
+  }
+  describe data3;
+
+  STORE data INTO 'output';
+
+   */
+  @Multiline
+  private String bagGroupProjectTest;
+
+  @Test(description="BagGroup with projection of the value from the bag")
+  public void bagGroupProjectTest() throws Exception
+  {
+    PigTest test = createPigTestFromString(bagGroupProjectTest);
+    writeLinesToFile("input", "({(1,A),(1,B),(2,A),(2,B),(2,C),(3,A)})",
+                     "({(A,1),(B,1),(A,2),(B,2),(C,2),(A,3)})");
+    test.runScript();
+    getLinesForAlias(test, "data2", true);
+    assertOutput(test, "data3", "({(1,{(A),(B)}),(2,{(A),(B),(C)}),(3,{(A)})})",
+                 "({(A,{(1),(2),(3)}),(B,{(1),(2)}),(C,{(2)})})");
+  }
+
+  /**
+
+
+  define BagGroup datafu.pig.bags.BagGroup();
+
   data = LOAD 'input' AS (input_bag: bag {T: tuple(k: int, k2: chararray, v: int)});
   describe data;
-  
+
   data2 = FOREACH data GENERATE BagGroup(input_bag, input_bag.(k, k2)) as grouped;
   describe data2;
-  
+
   data3 = FOREACH data2 {
     ordered = ORDER grouped BY group;
     GENERATE
       ordered as grouped;
   }
   describe data3;
-  
+
   STORE data INTO 'output';
 
    */
   @Multiline
   private String bagGroupMultipleTest;
-  
+
   @Test
   public void bagGroupMultipleTest() throws Exception
   {

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/75162f64/site/source/docs/datafu/1.2.0/datafu/pig/bags/BagGroup.html
----------------------------------------------------------------------
diff --git a/site/source/docs/datafu/1.2.0/datafu/pig/bags/BagGroup.html b/site/source/docs/datafu/1.2.0/datafu/pig/bags/BagGroup.html
index 2f27539..35c2557 100644
--- a/site/source/docs/datafu/1.2.0/datafu/pig/bags/BagGroup.html
+++ b/site/source/docs/datafu/1.2.0/datafu/pig/bags/BagGroup.html
@@ -2,12 +2,12 @@
 <!--NewPage-->
 <HTML>
 <HEAD>
-<!-- Generated by javadoc (build 1.6.0_27) on Thu Dec 12 19:05:53 PST 2013 -->
+<!-- Generated by javadoc (build 1.6.0_27) on Mon Apr 28 10:08:50 PDT 2014 -->
 <TITLE>
 BagGroup (DataFu 1.2.0)
 </TITLE>
 
-<META NAME="date" CONTENT="2013-12-12">
+<META NAME="date" CONTENT="2014-04-28">
 
 <LINK REL ="stylesheet" TYPE="text/css" HREF="../../../stylesheet.css" TITLE="Style">
 
@@ -40,7 +40,6 @@ function windowTitle()
   <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../overview-summary.html"><FONT CLASS="NavBarFont1"><B>Overview</B></FONT></A>&nbsp;</TD>
   <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="package-summary.html"><FONT CLASS="NavBarFont1"><B>Package</B></FONT></A>&nbsp;</TD>
   <TD BGCOLOR="#FFFFFF" CLASS="NavBarCell1Rev"> &nbsp;<FONT CLASS="NavBarFont1Rev"><B>Class</B></FONT>&nbsp;</TD>
-  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="class-use/BagGroup.html"><FONT CLASS="NavBarFont1"><B>Use</B></FONT></A>&nbsp;</TD>
   <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="package-tree.html"><FONT CLASS="NavBarFont1"><B>Tree</B></FONT></A>&nbsp;</TD>
   <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../deprecated-list.html"><FONT CLASS="NavBarFont1"><B>Deprecated</B></FONT></A>&nbsp;</TD>
   <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../index-all.html"><FONT CLASS="NavBarFont1"><B>Index</B></FONT></A>&nbsp;</TD>
@@ -105,28 +104,44 @@ java.lang.Object
 
 <P>
 Performs an in-memory group operation on a bag.  The first argument is the bag.
- The second argument is a projection of that bag to the group keys.
+ The second argument is a projection of that bag to the keys to group by.
 
  <p>
- Example:
- <code>
- define BagGroup datafu.pig.bags.BagGroup();
- 
+ The following example groups input_bag by k.  The output is a bag having tuples
+ consisting of the group key k and a bag with the corresponding (k,v) tuples from input_bag
+ for that key.
+ <pre>
+ <code>define BagGroup datafu.pig.bags.BagGroup();
+
  data = LOAD 'input' AS (input_bag: bag {T: tuple(k: int, v: chararray)});
  -- ({(1,A),(1,B),(2,A),(2,B),(2,C),(3,A)})
- 
+
+ -- Group input_bag by k
  data2 = FOREACH data GENERATE BagGroup(input_bag, input_bag.(k)) as grouped;
  -- data2: {grouped: {(group: int,input_bag: {T: (k: int,v: chararray)})}}
  -- ({(1,{(1,A),(1,B)}),(2,{(2,A),(2,B),(2,C)}),(3,{(3,A)})})
  </code>
+ </pre>
+ </p>
+
+ <p>
+ If the key k is not needed within the input_bag for the output, it can be projected
+ out like so:
+ <pre>
+ <code>data3 = FOREACH data2 {
+   -- project only the value
+   projected = FOREACH grouped GENERATE group, input_bag.(v);
+   GENERATE projected as grouped;
+ }
+
+ -- data3: {grouped: {(group: int,input_bag: {(v: chararray)})}}
+ -- ({(1,{(A),(B)}),(2,{(A),(B),(C)}),(3,{(A)})})
+ </code>
+ </pre>
  </p>
 <P>
 
 <P>
-<DL>
-<DT><B>Author:</B></DT>
-  <DD>wvaughan</DD>
-</DL>
 <HR>
 
 <P>
@@ -305,7 +320,6 @@ public org.apache.pig.data.DataBag <B>exec</B>(org.apache.pig.data.Tuple&nbsp;in
   <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../overview-summary.html"><FONT CLASS="NavBarFont1"><B>Overview</B></FONT></A>&nbsp;</TD>
   <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="package-summary.html"><FONT CLASS="NavBarFont1"><B>Package</B></FONT></A>&nbsp;</TD>
   <TD BGCOLOR="#FFFFFF" CLASS="NavBarCell1Rev"> &nbsp;<FONT CLASS="NavBarFont1Rev"><B>Class</B></FONT>&nbsp;</TD>
-  <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="class-use/BagGroup.html"><FONT CLASS="NavBarFont1"><B>Use</B></FONT></A>&nbsp;</TD>
   <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="package-tree.html"><FONT CLASS="NavBarFont1"><B>Tree</B></FONT></A>&nbsp;</TD>
   <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../deprecated-list.html"><FONT CLASS="NavBarFont1"><B>Deprecated</B></FONT></A>&nbsp;</TD>
   <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1">    <A HREF="../../../index-all.html"><FONT CLASS="NavBarFont1"><B>Index</B></FONT></A>&nbsp;</TD>
@@ -350,6 +364,6 @@ DETAIL:&nbsp;FIELD&nbsp;|&nbsp;<A HREF="#constructor_detail">CONSTR</A>&nbsp;|&n
 <!-- ======== END OF BOTTOM NAVBAR ======= -->
 
 <HR>
-Matthew Hayes, Sam Shah
+
 </BODY>
 </HTML>

http://git-wip-us.apache.org/repos/asf/incubator-datafu/blob/75162f64/site/source/docs/datafu/guide/bag-operations.html.markdown.erb
----------------------------------------------------------------------
diff --git a/site/source/docs/datafu/guide/bag-operations.html.markdown.erb b/site/source/docs/datafu/guide/bag-operations.html.markdown.erb
index 70d73c6..799f438 100644
--- a/site/source/docs/datafu/guide/bag-operations.html.markdown.erb
+++ b/site/source/docs/datafu/guide/bag-operations.html.markdown.erb
@@ -71,6 +71,47 @@ input = LOAD 'input' AS (A: bag{T: tuple(bag{T2: tuple(v:INT)})});
 output = FOREACH input GENERATE BagConcat(A);
 ```
 
+### Grouping Within a Bag
+
+Pig has a `GROUP` operation that can be applied to a relation.  It produces a new relation where the input
+tuples are grouped by a particular key.  Each output tuple holds the key in a field named `group`, together
+with a bag containing the input tuples that share that key.
+
+[BagGroup](/docs/datafu/<%= current_page.data.version %>/datafu/pig/bags/BagGroup.html) mimics the `GROUP`
+operation from Pig.  The difference between them is that `BagGroup` operates within a bag, rather than on a relation.
+This is useful for bags that are small enough to be grouped in memory: the tuples within the bag can be grouped
+directly, without a `GROUP` on the relation, which would result in another MapReduce job.
+
+In the following example we take an `input_bag` consisting of key-value pairs `(k,v)` and group the tuples by `k`.
+This produces a new bag having tuples consisting of `group` and `input_bag`.  The `group` corresponds to the grouping
+key `k`.  The `input_bag` is a bag containing the tuples from the original `input_bag` that have the same `k` as `group`.
+
+```pig
+ define BagGroup datafu.pig.bags.BagGroup();
+ 
+ data = LOAD 'input' AS (input_bag: bag {T: tuple(k: int, v: chararray)});
+ -- ({(1,A),(1,B),(2,A),(2,B),(2,C),(3,A)})
+ 
+ -- Group input_bag by k
+ data2 = FOREACH data GENERATE BagGroup(input_bag, input_bag.(k)) as grouped;
+ -- data2: {grouped: {(group: int,input_bag: {T: (k: int,v: chararray)})}}
+ -- ({(1,{(1,A),(1,B)}),(2,{(2,A),(2,B),(2,C)}),(3,{(3,A)})})
+```
+
+We could also project out the key from the final `input_bag` using a nested `FOREACH` so that the bag only
+consists of the value `v`:
+
+```pig
+data3 = FOREACH data2 {
+  -- project only the value
+  projected = FOREACH grouped GENERATE group, input_bag.(v);
+  GENERATE projected as grouped;
+}
+
+-- data3: {grouped: {(group: int,input_bag: {(v: chararray)})}}
+-- ({(1,{(A),(B)}),(2,{(A),(B),(C)}),(3,{(A)})})
+```
+
 ### Append to Bag
 
 [AppendToBag](/docs/datafu/<%= current_page.data.version %>/datafu/pig/bags/AppendToBag.html) can be