You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datafu.apache.org by mh...@apache.org on 2015/10/21 19:00:40 UTC

svn commit: r1709884 [5/8] - in /incubator/datafu/site: ./ blog/ blog/2012/01/10/ blog/2013/01/24/ blog/2013/09/04/ blog/2013/10/03/ blog/2014/04/27/ community/ docs/ docs/datafu/ docs/datafu/guide/ docs/hourglass/ javascripts/ stylesheets/

Modified: incubator/datafu/site/docs/datafu/guide/more-tips-and-tricks.html
URL: http://svn.apache.org/viewvc/incubator/datafu/site/docs/datafu/guide/more-tips-and-tricks.html?rev=1709884&r1=1709883&r2=1709884&view=diff
==============================================================================
--- incubator/datafu/site/docs/datafu/guide/more-tips-and-tricks.html (original)
+++ incubator/datafu/site/docs/datafu/guide/more-tips-and-tricks.html Wed Oct 21 17:00:40 2015
@@ -1,3 +1,5 @@
+
+
 <!doctype html>
 <html>
   <head>
@@ -10,11 +12,9 @@
     <!-- Use title if it's in the page YAML frontmatter -->
     <title>More Tips and Tricks - Guide - Apache DataFu Pig</title>
     
-    <link href="/stylesheets/all.css" media="screen" rel="stylesheet" type="text/css" />
-<link href="/stylesheets/highlight.css" media="screen" rel="stylesheet" type="text/css" />
-    <script src="/javascripts/all.js" type="text/javascript"></script>
+    <link href="/stylesheets/all.css" rel="stylesheet" /><link href="/stylesheets/highlight.css" rel="stylesheet" />
+    <script src="/javascripts/all.js"></script>
 
-    
     <script type="text/javascript">
       var _gaq = _gaq || [];
       _gaq.push(['_setAccount', 'UA-30533336-2']);
@@ -26,14 +26,14 @@
         var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
       })();
     </script>
-    
   </head>
   
   <body class="docs docs_datafu docs_datafu_guide docs_datafu_guide_more-tips-and-tricks">
 
     <div class="container">
 
-      <div class="header">
+      
+<div class="header">
 
   <ul class="nav nav-pills pull-right">
     <li><a href="/">Home</a></li>
@@ -48,12 +48,18 @@
       
   <div class="row">
     <div class="col-md-3">
-      <h4>Apache DataFu Pig</h4>
+      
+<h4>Apache DataFu</h4>
+<ul class="nav nav-pills nav-stacked">
+  <li><a href="/">Home</a></li>
+  <li><a href="/docs/quick-start.html">Quick Start</a></li>
+</ul>
+
+<h4>Apache DataFu Pig</h4>
 <ul class="nav nav-pills nav-stacked">
   <li><a href="/docs/datafu/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/datafu/guide.html">Guide</a></li>
   <li><a href="/docs/datafu/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/datafu/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Apache DataFu Hourglass</h4>
@@ -61,31 +67,33 @@
   <li><a href="/docs/hourglass/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/hourglass/concepts.html">Concepts</a></li>
   <li><a href="/docs/hourglass/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/hourglass/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Community</h4>
 <ul class="nav nav-pills nav-stacked">
+  <li><a href="/community/contributing.html">Contributing</a></li>
   <li><a href="/community/mailing-lists.html">Mailing Lists</a></li>
   <li><a href="https://issues.apache.org/jira/browse/DATAFU">Bugs</a></li>
 </ul>
     </div>
     <div class="col-md-7">
       <h4 class="text-muted">Apache DataFu Pig - Guide</h4>
-      <h2 id="toc_0">More Tips and Tricks</h2>
+      <h2 id="more-tips-and-tricks">More Tips and Tricks</h2>
 
-<h3 id="toc_1">Coalesce</h3>
+<h3 id="coalesce">Coalesce</h3>
 
 <p>Using ternary operators is fairly common in Pig.  For example, often you want to replace null
 values with zero:</p>
-<pre class="highlight pig"><span class="n">data</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">data</span> <span class="k">GENERATE</span> <span class="p">(</span><span class="n">val</span> <span class="k">IS</span> <span class="k">NOT</span> <span class="n">NULL</span> <span class="o">?</span> <span class="n">val</span> <span class="p">:</span> <span class="mi">0</span><span class="p">)</span> <span class="k">as</span> <span class="n">result</span><span class="p">;</span>
-</pre>
+<pre class="highlight pig"><code><span class="n">data</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">data</span> <span class="k">GENERATE</span> <span class="p">(</span><span class="n">val</span> <span class="k">IS</span> <span class="k">NOT</span> <span class="n">NULL</span> <span class="o">?</span> <span class="n">val</span> <span class="p">:</span> <span class="mi">0</span><span class="p">)</span> <span class="k">as</span> <span class="n">result</span><span class="p">;</span>
+</code></pre>
+
 <p>Or, sometimes you want to return the first non-null value among several fields:</p>
-<pre class="highlight pig"><span class="n">data</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">data</span> <span class="k">GENERATE</span> <span class="p">(</span><span class="n">val1</span> <span class="k">IS</span> <span class="k">NOT</span> <span class="n">NULL</span> <span class="o">?</span> <span class="n">val1</span> <span class="p">:</span> 
+<pre class="highlight pig"><code><span class="n">data</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">data</span> <span class="k">GENERATE</span> <span class="p">(</span><span class="n">val1</span> <span class="k">IS</span> <span class="k">NOT</span> <span class="n">NULL</span> <span class="o">?</span> <span class="n">val1</span> <span class="p">:</span> 
                <span class="p">(</span><span class="n">val2</span> <span class="k">IS</span> <span class="k">NOT</span> <span class="n">NULL</span> <span class="o">?</span> <span class="n">val2</span> <span class="p">:</span>
                <span class="p">(</span><span class="n">val3</span> <span class="k">IS</span> <span class="k">NOT</span> <span class="n">NULL</span> <span class="o">?</span> <span class="n">val3</span> <span class="p">:</span>
                <span class="n">NULL</span><span class="p">)))</span> <span class="k">as</span> <span class="n">result</span><span class="p">;</span>
-</pre>
+</code></pre>
+
 <p>The above code is very hard to follow, and it is very cumbersome to write.  To solve this problem,
 Apache DataFu provides the useful
 <a href="/docs/datafu/1.2.0/datafu/pig/util/Coalesce.html">Coalesce</a>.  This is
@@ -94,30 +102,35 @@ in some SQL implementations.  It simply
 With <code>Coalesce</code> we can clean up the code above.</p>
 
 <p>To replace any null value with 0:</p>
-<pre class="highlight pig"><span class="k">DEFINE</span> <span class="n">EmptyBagToNullFields</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">util</span><span class="p">.</span><span class="n">Coalesce</span><span class="p">();</span>
+<pre class="highlight pig"><code><span class="k">DEFINE</span> <span class="n">Coalesce</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">util</span><span class="p">.</span><span class="n">Coalesce</span><span class="p">();</span>
 
 <span class="n">data</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">data</span> <span class="k">GENERATE</span> <span class="n">Coalesce</span><span class="p">(</span><span class="n">val</span><span class="p">,</span><span class="mi">0</span><span class="p">)</span> <span class="k">as</span> <span class="n">result</span><span class="p">;</span>
-</pre>
+</code></pre>
+
 <p>To return the first non-null value:</p>
-<pre class="highlight pig"><span class="n">data</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">data</span> <span class="k">GENERATE</span> <span class="n">Coalesce</span><span class="p">(</span><span class="n">val1</span><span class="p">,</span><span class="n">val2</span><span class="p">,</span><span class="n">val3</span><span class="p">)</span> <span class="k">as</span> <span class="n">result</span><span class="p">;</span>
-</pre>
-<h3 id="toc_2">Left Joining Multiple Relations</h3>
+<pre class="highlight pig"><code><span class="n">data</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">data</span> <span class="k">GENERATE</span> <span class="n">Coalesce</span><span class="p">(</span><span class="n">val1</span><span class="p">,</span><span class="n">val2</span><span class="p">,</span><span class="n">val3</span><span class="p">)</span> <span class="k">as</span> <span class="n">result</span><span class="p">;</span>
+</code></pre>
+
+<h3 id="left-joining-multiple-relations">Left Joining Multiple Relations</h3>
 
 <p>Suppose we have three data sets:</p>
-<pre class="highlight pig"><span class="n">input1</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input1'</span> <span class="k">using</span> <span class="n">PigStorage</span><span class="p">(</span><span class="s1">','</span><span class="p">)</span> <span class="k">AS</span> <span class="p">(</span><span class="n">key</span><span class="p">:</span><span class="n">INT</span><span class="p">,</span><span class="n">val</span><span class="p">:</span><span class="n">INT</span><span class="p">);</span>
+<pre class="highlight pig"><code><span class="n">input1</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input1'</span> <span class="k">using</span> <span class="n">PigStorage</span><span class="p">(</span><span class="s1">','</span><span class="p">)</span> <span class="k">AS</span> <span class="p">(</span><span class="n">key</span><span class="p">:</span><span class="n">INT</span><span class="p">,</span><span class="n">val</span><span class="p">:</span><span class="n">INT</span><span class="p">);</span>
 <span class="n">input2</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input2'</span> <span class="k">using</span> <span class="n">PigStorage</span><span class="p">(</span><span class="s1">','</span><span class="p">)</span> <span class="k">AS</span> <span class="p">(</span><span class="n">key</span><span class="p">:</span><span class="n">INT</span><span class="p">,</span><span class="n">val</span><span class="p">:</span><span class="n">INT</span><span class="p">);</span>
 <span class="n">input3</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input3'</span> <span class="k">using</span> <span class="n">PigStorage</span><span class="p">(</span><span class="s1">','</span><span class="p">)</span> <span class="k">AS</span> <span class="p">(</span><span class="n">key</span><span class="p">:</span><span class="n">INT</span><span class="p">,</span><span class="n">val</span><span class="p">:</span><span class="n">INT</span><span class="p">);</span>
-</pre>
+</code></pre>
+
 <p>Let&#39;s say we want to left join <code>input1</code> with <code>input2</code> and <code>input3</code>.  You can do this in
 SQL.  Unfortunately Pig does not support outer joins on more than two relations.</p>
-<pre class="highlight pig"><span class="c1">-- DOES NOT WORK
+<pre class="highlight pig"><code><span class="c1">-- DOES NOT WORK
 </span><span class="n">joined</span> <span class="o">=</span> <span class="k">JOIN</span> <span class="n">input1</span> <span class="k">BY</span> <span class="n">key</span> <span class="k">LEFT</span><span class="p">,</span>
          <span class="n">input2</span> <span class="k">BY</span> <span class="n">key</span><span class="p">,</span> <span class="n">input3</span> <span class="k">BY</span> <span class="n">key</span><span class="p">;</span>
-</pre>
+</code></pre>
+
 <p>Instead, you have to join twice, which means two MapReduce jobs:</p>
-<pre class="highlight pig"><span class="n">data1</span> <span class="o">=</span> <span class="k">JOIN</span> <span class="n">input1</span> <span class="k">BY</span> <span class="n">key</span> <span class="k">LEFT</span><span class="p">,</span> <span class="n">input2</span> <span class="k">BY</span> <span class="n">key</span><span class="p">;</span>
+<pre class="highlight pig"><code><span class="n">data1</span> <span class="o">=</span> <span class="k">JOIN</span> <span class="n">input1</span> <span class="k">BY</span> <span class="n">key</span> <span class="k">LEFT</span><span class="p">,</span> <span class="n">input2</span> <span class="k">BY</span> <span class="n">key</span><span class="p">;</span>
 <span class="n">data2</span> <span class="o">=</span> <span class="k">JOIN</span> <span class="n">data1</span> <span class="k">BY</span> <span class="n">input1</span><span class="p">::</span><span class="n">key</span> <span class="k">LEFT</span><span class="p">,</span> <span class="n">input3</span> <span class="k">BY</span> <span class="n">key</span><span class="p">;</span>
-</pre>
+</code></pre>
+
 <p>This is unfortunate, as left joins are very common, and for some applications it is common
 to need to left join multiple relations.  Take a recommendation system for example: you
 start with a set of candidates to score and you join in multiple sets of features.  Each
@@ -126,14 +139,15 @@ set of features requires another join.</
 <p>You can, however, perform a left join effectively by using <code>COGROUP</code> with multiple relations
 and applying clever use of <code>FLATTEN</code>.  Then only a single MapReduce job is required.
 But this gets pretty messy:</p>
-<pre class="highlight pig"><span class="n">data1</span> <span class="o">=</span> <span class="k">COGROUP</span> <span class="n">input1</span> <span class="k">BY</span> <span class="n">key</span><span class="p">,</span> <span class="n">input2</span> <span class="k">BY</span> <span class="n">key</span><span class="p">,</span> <span class="n">input3</span> <span class="k">BY</span> <span class="n">key</span><span class="p">;</span>
+<pre class="highlight pig"><code><span class="n">data1</span> <span class="o">=</span> <span class="k">COGROUP</span> <span class="n">input1</span> <span class="k">BY</span> <span class="n">key</span><span class="p">,</span> <span class="n">input2</span> <span class="k">BY</span> <span class="n">key</span><span class="p">,</span> <span class="n">input3</span> <span class="k">BY</span> <span class="n">key</span><span class="p">;</span>
 <span class="n">data2</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">data1</span> <span class="k">GENERATE</span>
  <span class="k">FLATTEN</span><span class="p">(</span><span class="n">input1</span><span class="p">),</span> <span class="c1">-- left join on this
 </span> <span class="k">FLATTEN</span><span class="p">((</span><span class="n">IsEmpty</span><span class="p">(</span><span class="n">input2</span><span class="p">)</span> <span class="o">?</span> <span class="n">TOBAG</span><span class="p">(</span><span class="n">TOTUPLE</span><span class="p">((</span><span class="n">int</span><span class="p">)</span><span class="n">null</span><span class="p">,(</span><span class="n">int</span><span class="p">)</span><span class="n">null</span><span class="p">))</span> <span class="p">:</span> <span class="n">input2</span><span class="p">))</span>
    <span class="k">AS</span> <span class="p">(</span><span class="n">input2</span><span class="p">::</span><span class="n">key</span><span class="p">,</span><span class="n">input2</span><span class="p">::</span><span class="n">val</span><span class="p">),</span>
 <span class="k">FLATTEN</span><span class="p">((</span><span class="n">IsEmpty</span><span class="p">(</span><span class="n">input3</span><span class="p">)</span> <span class="o">?</span> <span class="n">TOBAG</span><span class="p">(</span><span class="n">TOTUPLE</span><span class="p">((</span><span class="n">int</span><span class="p">)</span><span class="n">null</span><span class="p">,(</span><span class="n">int</span><span class="p">)</span><span class="n">null</span><span class="p">))</span> <span class="p">:</span> <span class="n">input3</span><span class="p">))</span>
    <span class="k">AS</span> <span class="p">(</span><span class="n">input3</span><span class="p">::</span><span class="n">key</span><span class="p">,</span><span class="n">input3</span><span class="p">::</span><span class="n">val</span><span class="p">);</span>
-</pre>
+</code></pre>
+
 <p>As messy as this looks, it does work.  It relies on the fact that flattening an empty bag produces
 no output.  Therefore any records not appearing in <code>input1</code> will be removed.  Since we don&#39;t want
 the lack of records in <code>input2</code> or <code>input3</code> to cause records to be removed, we replace the empty
@@ -143,32 +157,36 @@ simulate a left join on multiple relatio
 <p>To clean up this code, DataFu provides the
 <a href="/docs/datafu/1.2.0/datafu/pig/bags/EmptyBagToNullFields.html">EmptyBagToNullFields</a>
 UDF.  This performs the same logic above and makes the code much easier to write and understand:</p>
-<pre class="highlight pig"><span class="k">DEFINE</span> <span class="n">EmptyBagToNullFields</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">bags</span><span class="p">.</span><span class="n">EmptyBagToNullFields</span><span class="p">();</span>
+<pre class="highlight pig"><code><span class="k">DEFINE</span> <span class="n">EmptyBagToNullFields</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">bags</span><span class="p">.</span><span class="n">EmptyBagToNullFields</span><span class="p">();</span>
 
 <span class="n">data</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">COGROUP</span> <span class="n">input1</span> <span class="k">BY</span> <span class="n">key</span><span class="p">,</span> <span class="n">input2</span> <span class="k">BY</span> <span class="n">key</span><span class="p">,</span> <span class="n">input3</span> <span class="k">BY</span> <span class="n">key</span><span class="p">)</span> <span class="k">GENERATE</span>
   <span class="k">FLATTEN</span><span class="p">(</span><span class="n">input1</span><span class="p">),</span> <span class="c1">-- left join on this
 </span>  <span class="k">FLATTEN</span><span class="p">(</span><span class="n">EmptyBagToNullFields</span><span class="p">(</span><span class="n">input2</span><span class="p">)),</span>  
   <span class="k">FLATTEN</span><span class="p">(</span><span class="n">EmptyBagToNullFields</span><span class="p">(</span><span class="n">input3</span><span class="p">));</span>
-</pre>
+</code></pre>
+
 <p>While you&#39;re at it, why not create a macro:</p>
-<pre class="highlight pig"><span class="k">DEFINE</span> <span class="n">left_outer_join</span><span class="p">(</span><span class="n">relation1</span><span class="p">,</span> <span class="n">key1</span><span class="p">,</span> <span class="n">relation2</span><span class="p">,</span> <span class="n">key2</span><span class="p">,</span> <span class="n">relation3</span><span class="p">,</span> <span class="n">key3</span><span class="p">)</span> <span class="n">returns</span> <span class="n">joined</span> <span class="p">{</span>
+<pre class="highlight pig"><code><span class="k">DEFINE</span> <span class="n">left_outer_join</span><span class="p">(</span><span class="n">relation1</span><span class="p">,</span> <span class="n">key1</span><span class="p">,</span> <span class="n">relation2</span><span class="p">,</span> <span class="n">key2</span><span class="p">,</span> <span class="n">relation3</span><span class="p">,</span> <span class="n">key3</span><span class="p">)</span> <span class="n">returns</span> <span class="n">joined</span> <span class="p">{</span>
   <span class="n">cogrouped</span> <span class="o">=</span> <span class="k">COGROUP</span> <span class="n">$relation1</span> <span class="k">BY</span> <span class="n">$key1</span><span class="p">,</span> <span class="n">$relation2</span> <span class="k">BY</span> <span class="n">$key2</span><span class="p">,</span> <span class="n">$relation3</span> <span class="k">BY</span> <span class="n">$key3</span><span class="p">;</span>
   <span class="n">$joined</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">cogrouped</span> <span class="k">GENERATE</span> 
     <span class="k">FLATTEN</span><span class="p">(</span><span class="n">$relation1</span><span class="p">),</span> 
     <span class="k">FLATTEN</span><span class="p">(</span><span class="n">EmptyBagToNullFields</span><span class="p">(</span><span class="n">$relation2</span><span class="p">)),</span> 
     <span class="k">FLATTEN</span><span class="p">(</span><span class="n">EmptyBagToNullFields</span><span class="p">(</span><span class="n">$relation3</span><span class="p">));</span>
 <span class="p">}</span>
-</pre>
+</code></pre>
+
 <p>Now you can simply apply your left join macro:</p>
-<pre class="highlight pig"><span class="n">features</span> <span class="o">=</span> <span class="n">left_outer_join</span><span class="p">(</span><span class="n">input1</span><span class="p">,</span> <span class="n">val1</span><span class="p">,</span> <span class="n">input2</span><span class="p">,</span> <span class="n">val2</span><span class="p">,</span> <span class="n">input3</span><span class="p">,</span> <span class="n">val3</span><span class="p">);</span>
-</pre>
+<pre class="highlight pig"><code><span class="n">features</span> <span class="o">=</span> <span class="n">left_outer_join</span><span class="p">(</span><span class="n">input1</span><span class="p">,</span> <span class="n">val1</span><span class="p">,</span> <span class="n">input2</span><span class="p">,</span> <span class="n">val2</span><span class="p">,</span> <span class="n">input3</span><span class="p">,</span> <span class="n">val3</span><span class="p">);</span>
+</code></pre>
+
     </div>
   </div>
 
 
     
-      <div class="footer">
-Copyright &copy; 2011-2014 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
+      
+<div class="footer">
+Copyright &copy; 2011-2015 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
 Apache DataFu, DataFu, Apache Pig, Apache Hadoop, Hadoop, Apache, and the Apache feather logo are either registered trademarks or trademarks of the Apache Software Foundation in the United States and other countries.
 </div>
 

Modified: incubator/datafu/site/docs/datafu/guide/sampling.html
URL: http://svn.apache.org/viewvc/incubator/datafu/site/docs/datafu/guide/sampling.html?rev=1709884&r1=1709883&r2=1709884&view=diff
==============================================================================
--- incubator/datafu/site/docs/datafu/guide/sampling.html (original)
+++ incubator/datafu/site/docs/datafu/guide/sampling.html Wed Oct 21 17:00:40 2015
@@ -1,3 +1,5 @@
+
+
 <!doctype html>
 <html>
   <head>
@@ -10,11 +12,9 @@
     <!-- Use title if it's in the page YAML frontmatter -->
     <title>Sampling - Guide - Apache DataFu Pig</title>
     
-    <link href="/stylesheets/all.css" media="screen" rel="stylesheet" type="text/css" />
-<link href="/stylesheets/highlight.css" media="screen" rel="stylesheet" type="text/css" />
-    <script src="/javascripts/all.js" type="text/javascript"></script>
+    <link href="/stylesheets/all.css" rel="stylesheet" /><link href="/stylesheets/highlight.css" rel="stylesheet" />
+    <script src="/javascripts/all.js"></script>
 
-    
     <script type="text/javascript">
       var _gaq = _gaq || [];
       _gaq.push(['_setAccount', 'UA-30533336-2']);
@@ -26,14 +26,14 @@
         var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
       })();
     </script>
-    
   </head>
   
   <body class="docs docs_datafu docs_datafu_guide docs_datafu_guide_sampling">
 
     <div class="container">
 
-      <div class="header">
+      
+<div class="header">
 
   <ul class="nav nav-pills pull-right">
     <li><a href="/">Home</a></li>
@@ -48,12 +48,18 @@
       
   <div class="row">
     <div class="col-md-3">
-      <h4>Apache DataFu Pig</h4>
+      
+<h4>Apache DataFu</h4>
+<ul class="nav nav-pills nav-stacked">
+  <li><a href="/">Home</a></li>
+  <li><a href="/docs/quick-start.html">Quick Start</a></li>
+</ul>
+
+<h4>Apache DataFu Pig</h4>
 <ul class="nav nav-pills nav-stacked">
   <li><a href="/docs/datafu/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/datafu/guide.html">Guide</a></li>
   <li><a href="/docs/datafu/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/datafu/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Apache DataFu Hourglass</h4>
@@ -61,23 +67,23 @@
   <li><a href="/docs/hourglass/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/hourglass/concepts.html">Concepts</a></li>
   <li><a href="/docs/hourglass/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/hourglass/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Community</h4>
 <ul class="nav nav-pills nav-stacked">
+  <li><a href="/community/contributing.html">Contributing</a></li>
   <li><a href="/community/mailing-lists.html">Mailing Lists</a></li>
   <li><a href="https://issues.apache.org/jira/browse/DATAFU">Bugs</a></li>
 </ul>
     </div>
     <div class="col-md-7">
       <h4 class="text-muted">Apache DataFu Pig - Guide</h4>
-      <h2 id="toc_0">Sampling</h2>
+      <h2 id="sampling">Sampling</h2>
 
 <p>Pig has a built-in <code>SAMPLE</code> operator that performs <a href="http://en.wikipedia.org/wiki/Bernoulli_sampling">Bernoulli sampling</a>
 on a relation.  Apache DataFu Pig provides additional sampling techniques for when Bernoulli sampling is not applicable.</p>
 
-<h3 id="toc_1">Simple Random Sampling</h3>
+<h3 id="simple-random-sampling">Simple Random Sampling</h3>
 
 <p><a href="http://en.wikipedia.org/wiki/Simple_random_sampling">Simple Random Sampling</a> produces samples of a specific size,
 where each item has the same probability of being chosen.  DataFu has scalable implementations of this that will
@@ -86,7 +92,7 @@ Pig&#39;s <code>SAMPLE</code>, on the ot
 where <code>p</code> is the sampling probability and <code>n</code> is the sample size.  With <code>SAMPLE</code> there are no guarantees on the size of the
 generated sample.</p>
 
-<h4 id="toc_2">Simple Random Sample Without Replacement</h4>
+<h4 id="simple-random-sample-without-replacement">Simple Random Sample Without Replacement</h4>
 
 <p><a href="/docs/datafu/1.2.0/datafu/pig/sampling/SimpleRandomSample.html">SimpleRandomSample</a>
 implements scalable simple random sampling.
@@ -97,19 +103,21 @@ means that no item will appear more than
 
 <p>To use it simply pass in the sampling probability into the UDF&#39;s constructor and then pass in a bag to be sampled.
 For example, the following will produce a 1% sample:</p>
-<pre class="highlight pig"><span class="k">DEFINE</span> <span class="n">SRS</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">sampling</span><span class="p">.</span><span class="n">SimpleRandomSample</span><span class="p">(</span><span class="s1">'0.01'</span><span class="p">);</span>
+<pre class="highlight pig"><code><span class="k">DEFINE</span> <span class="n">SRS</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">sampling</span><span class="p">.</span><span class="n">SimpleRandomSample</span><span class="p">(</span><span class="s1">'0.01'</span><span class="p">);</span>
 
 <span class="n">input</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">x</span><span class="p">:</span><span class="n">double</span><span class="p">);</span>
 <span class="n">sampled</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">GROUP</span> <span class="n">input</span> <span class="k">ALL</span><span class="p">)</span> <span class="k">GENERATE</span> <span class="k">FLATTEN</span><span class="p">(</span><span class="n">SRS</span><span class="p">(</span><span class="n">input</span><span class="p">));</span>
-</pre>
+</code></pre>
+
 <p>This UDF can also be used to perform <a href="http://en.wikipedia.org/wiki/Stratified_sampling">stratified sampling</a>.
 For example, the following takes a 1% stratified sample using a label and a proportional allocation strategy:</p>
-<pre class="highlight pig"> <span class="k">DEFINE</span> <span class="n">SRS</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">sampling</span><span class="p">.</span><span class="n">SimpleRandomSample</span><span class="p">(</span><span class="s1">'0.01'</span><span class="p">);</span>
+<pre class="highlight pig"><code> <span class="k">DEFINE</span> <span class="n">SRS</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">sampling</span><span class="p">.</span><span class="n">SimpleRandomSample</span><span class="p">(</span><span class="s1">'0.01'</span><span class="p">);</span>
  <span class="n">examples</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">x</span><span class="p">:</span><span class="n">double</span><span class="p">,</span><span class="n">label</span><span class="p">:</span><span class="n">chararray</span><span class="p">);</span>
  <span class="n">grouped</span> <span class="o">=</span> <span class="k">GROUP</span> <span class="n">examples</span> <span class="k">BY</span> <span class="n">label</span><span class="p">;</span>
  <span class="n">sampled</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">grouped</span> <span class="k">GENERATE</span> <span class="k">FLATTEN</span><span class="p">(</span><span class="n">SRS</span><span class="p">(</span><span class="n">examples</span><span class="p">));</span>
-</pre>
-<h4 id="toc_3">Simple Random Sample With Replacement</h4>
+</code></pre>
+
+<h4 id="simple-random-sample-with-replacement">Simple Random Sample With Replacement</h4>
 
 <p><a href="/docs/datafu/1.2.0/datafu/pig/sampling/SimpleRandomSampleWithReplacementVote.html">SimpleRandomSampleWithReplacementVote</a> and
 <a href="/docs/datafu/1.2.0/datafu/pig/sampling/SimpleRandomSampleWithReplacementElect.html">SimpleRandomSampleWithReplacementElect</a>
@@ -118,7 +126,7 @@ These can be used to generate a sample o
 
 <p>To use these UDFs, the user needs to provide the desired sample size and a good lower bound on the population
 size (or the exact size).  For example, to generate a sample of 100,000 with replacement:</p>
-<pre class="highlight pig"><span class="k">DEFINE</span> <span class="n">SRSWR_VOTE</span> 
+<pre class="highlight pig"><code><span class="k">DEFINE</span> <span class="n">SRSWR_VOTE</span> 
   <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">sampling</span><span class="p">.</span><span class="n">SimpleRandomSampleWithReplacementVote</span><span class="p">();</span>
 <span class="k">DEFINE</span> <span class="n">SRSWR_ELECT</span> 
   <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">sampling</span><span class="p">.</span><span class="n">SimpleRandomSampleWithReplacementElect</span><span class="p">();</span>
@@ -129,7 +137,8 @@ size (or the exact size).  For example,
   <span class="k">FLATTEN</span><span class="p">(</span><span class="n">SRSWR_VOTE</span><span class="p">(</span><span class="n">TOBAG</span><span class="p">(</span><span class="n">x</span><span class="p">),</span> <span class="mi">100000</span><span class="p">,</span> <span class="n">summary</span><span class="p">.</span><span class="k">count</span><span class="p">));</span>
 <span class="n">sampled</span>    <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">GROUP</span> <span class="n">candidates</span> <span class="k">BY</span> <span class="n">position</span> <span class="k">PARALLEL</span> <span class="mi">10</span><span class="p">)</span> <span class="k">GENERATE</span>
   <span class="k">FLATTEN</span><span class="p">(</span><span class="n">SRSWR_ELECT</span><span class="p">(</span><span class="n">candidates</span><span class="p">));</span>
-</pre>
+</code></pre>
+
 <p>Here we pass in the exact size for the lower bound.  Because of the way the algorithm works, we can use many reducers
 to generate the final set of sampled data.  This is why we use <code>PARALLEL 10</code>.  The parallel factor can be increased
 if necessary to distribute the work more.</p>
@@ -137,7 +146,7 @@ if necessary to distribute the work more
 <p>Sampling with replacement is used heavily in <a href="http://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29">bootstrapping</a>.
 For example, the following script generates 100 bootstrap samples, computes the mean value for each sample,
 and then outputs the bootstrap estimates.</p>
-<pre class="highlight pig"><span class="n">summary</span>    <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">GROUP</span> <span class="n">item</span> <span class="k">ALL</span><span class="p">)</span> <span class="k">GENERATE</span>
+<pre class="highlight pig"><code><span class="n">summary</span>    <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">GROUP</span> <span class="n">item</span> <span class="k">ALL</span><span class="p">)</span> <span class="k">GENERATE</span>
   <span class="k">AVG</span><span class="p">(</span><span class="n">item</span><span class="p">.</span><span class="n">x</span><span class="p">)</span> <span class="k">AS</span> <span class="n">mean</span><span class="p">,</span> <span class="k">COUNT</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> <span class="k">AS</span> <span class="k">count</span><span class="p">;</span>
 <span class="n">candidates</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">item</span> <span class="k">GENERATE</span> 
   <span class="k">FLATTEN</span><span class="p">(</span><span class="n">SRSWR_VOTE</span><span class="p">(</span><span class="n">TOBAG</span><span class="p">(</span><span class="n">x</span><span class="p">),</span> <span class="n">summary</span><span class="p">.</span><span class="k">count</span><span class="o">*</span><span class="mi">100</span><span class="p">,</span> <span class="n">summary</span><span class="p">.</span><span class="k">count</span><span class="p">));</span>
@@ -146,8 +155,9 @@ and then outputs the bootstrap estimates
   <span class="k">AVG</span><span class="p">(</span><span class="n">SRSWR_ELECT</span><span class="p">(</span><span class="n">candidates</span><span class="p">))</span> <span class="k">AS</span> <span class="n">mean</span><span class="p">;</span>
 <span class="n">bootstrap</span>  <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">GROUP</span> <span class="n">sampled</span> <span class="k">ALL</span><span class="p">)</span> <span class="k">GENERATE</span> 
   <span class="n">summary</span><span class="p">.</span><span class="n">mean</span> <span class="k">AS</span> <span class="n">mean</span><span class="p">,</span> <span class="n">sampled</span><span class="p">.</span><span class="n">mean</span> <span class="k">AS</span> <span class="n">bootstrapMeans</span><span class="p">;</span>
-</pre>
-<h3 id="toc_4">Weighted Random Sampling</h3>
+</code></pre>
+
+<h3 id="weighted-random-sampling">Weighted Random Sampling</h3>
 
 <p>A weighted sample is similar to a simple random sample without replacement in that it generates a sample
 with a specific size.  The difference is that the probability of selecting each item can be different.
@@ -162,45 +172,50 @@ items&#39; weights.</p>
 <p>For example, suppose
 that we have a bag of four items: <code>a</code>, <code>b</code>, <code>c</code>, <code>d</code>.  For this bag, <code>a</code> has a weight of 100 and the remaining have a weight
 of 1.</p>
-<pre class="highlight pig"><span class="n">input</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">A</span><span class="p">:</span> <span class="n">bag</span><span class="p">{</span><span class="n">T</span><span class="p">:</span> <span class="n">tuple</span><span class="p">(</span><span class="n">name</span><span class="p">:</span><span class="n">chararray</span><span class="p">,</span><span class="n">score</span><span class="p">:</span><span class="n">int</span><span class="p">)});</span>
+<pre class="highlight pig"><code><span class="n">input</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">A</span><span class="p">:</span> <span class="n">bag</span><span class="p">{</span><span class="n">T</span><span class="p">:</span> <span class="n">tuple</span><span class="p">(</span><span class="n">name</span><span class="p">:</span><span class="n">chararray</span><span class="p">,</span><span class="n">score</span><span class="p">:</span><span class="n">int</span><span class="p">)});</span>
 <span class="cm">/* Contains a single bag:
 {(a,100),(b,1),(c,1),(d,1)}
 */</span>
-</pre>
+</code></pre>
+
 <p>We expect a weighted sample of this bag to contain <code>a</code> with very high probability.
 Let&#39;s generate a sample of size 3 from this bag.  To do this we pass in the bag, with 1 to indicate the weight
 is at index 1, and the sample size of 3.</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">WeightedSample</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">sampling</span><span class="p">.</span><span class="n">WeightedSample</span><span class="p">()</span>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">WeightedSample</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">sampling</span><span class="p">.</span><span class="n">WeightedSample</span><span class="p">()</span>
 
 <span class="n">result</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">input</span> <span class="k">GENERATE</span> <span class="n">WeightedSample</span><span class="p">(</span><span class="n">A</span><span class="p">,</span><span class="mi">1</span><span class="p">,</span><span class="mi">3</span><span class="p">);</span>
-</pre>
+</code></pre>
+
 <p>This is likely to generate output like this, where <code>a</code> tends to be present due to its high weight.</p>
-<pre class="highlight pig"><span class="k">DUMP</span> <span class="n">result</span><span class="p">;</span>
+<pre class="highlight pig"><code><span class="k">DUMP</span> <span class="n">result</span><span class="p">;</span>
 <span class="cm">/*
 ({(a,100),(c,1),(b,1)})
 */</span>
-</pre>
+</code></pre>
+
 <p>Alternatively, if we don&#39;t pass in a sample size, <code>WeightedSample</code> will include all items, with the order being influenced
 by the item weights.</p>
-<pre class="highlight pig"><span class="n">result</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">input</span> <span class="k">GENERATE</span> <span class="n">WeightedSample</span><span class="p">(</span><span class="n">A</span><span class="p">,</span><span class="mi">1</span><span class="p">);</span>
+<pre class="highlight pig"><code><span class="n">result</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">input</span> <span class="k">GENERATE</span> <span class="n">WeightedSample</span><span class="p">(</span><span class="n">A</span><span class="p">,</span><span class="mi">1</span><span class="p">);</span>
 
 <span class="k">DUMP</span> <span class="n">result</span><span class="p">;</span>
 <span class="cm">/*
 ({(a,100),(c,1),(b,1),(d,1)})
 */</span>
-</pre>
+</code></pre>
+
 <p>One simple technique for generating weights that can be used with <code>WeightedSample</code> is to use DataFu&#39;s
 <a href="/docs/datafu/1.2.0/datafu/pig/bags/Enumerate.html">Enumerate</a> UDF, which can be used
 to append each item&#39;s tuple with its index within the bag.</p>
 
 <p>Again, suppose we have a bag with values <code>a</code>, <code>b</code>, <code>c</code>, <code>d</code>, but this time without weights.</p>
-<pre class="highlight pig"><span class="n">input</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">A</span><span class="p">:</span> <span class="n">bag</span><span class="p">{</span><span class="n">T</span><span class="p">:</span> <span class="n">tuple</span><span class="p">(</span><span class="n">name</span><span class="p">:</span><span class="n">chararray</span><span class="p">)});</span>
+<pre class="highlight pig"><code><span class="n">input</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">A</span><span class="p">:</span> <span class="n">bag</span><span class="p">{</span><span class="n">T</span><span class="p">:</span> <span class="n">tuple</span><span class="p">(</span><span class="n">name</span><span class="p">:</span><span class="n">chararray</span><span class="p">)});</span>
 <span class="cm">/* Contains a single bag:
 {(a),(b),(c),(d)}
 */</span>
-</pre>
+</code></pre>
+
 <p>Using <code>Enumerate</code>, we can append the index for each item and then compute a score from it.</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">Enumerate</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">bags</span><span class="p">.</span><span class="n">Enumerate</span><span class="p">();</span>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">Enumerate</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">bags</span><span class="p">.</span><span class="n">Enumerate</span><span class="p">();</span>
 
 <span class="n">data</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">data</span> <span class="k">GENERATE</span> <span class="n">Enumerate</span><span class="p">(</span><span class="n">A</span><span class="p">)</span> <span class="k">as</span> <span class="n">A</span><span class="p">;</span>
   <span class="n">data</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">data</span> <span class="p">{</span>
@@ -211,26 +226,29 @@ to append each item&#39;s tuple with its
 <span class="cm">/* Produces:
 ({(a,1.0),(b,0.5),(c,0.3333333333333333),(d,0.25)})
 */</span>
-</pre>
+</code></pre>
+
 <p>This bag can then be passed into <code>WeightedSample</code>.  This produces a simple random sample where the items in the
 beginning of the bag are more likely to be selected.</p>
 
-<h3 id="toc_5">Consistently Sampling By Key</h3>
+<h3 id="consistently-sampling-by-key">Consistently Sampling By Key</h3>
 
 <p>A common use case for sampling is selecting a set of training examples for building a prediction model.
 For example, suppose that we have a recommendation system where we have tracked when items have been
 impressed to users and when they have clicked on them:</p>
-<pre class="highlight pig"><span class="n">impressions</span> <span class="o">=</span> 
+<pre class="highlight pig"><code><span class="n">impressions</span> <span class="o">=</span> 
   <span class="k">LOAD</span> <span class="s1">'$impressions'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">user_id</span><span class="p">:</span><span class="n">int</span><span class="p">,</span> <span class="n">item_id</span><span class="p">:</span><span class="n">int</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">:</span><span class="n">long</span><span class="p">);</span>
 <span class="n">clicks</span> <span class="o">=</span> 
   <span class="k">LOAD</span> <span class="s1">'$accepts'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">user_id</span><span class="p">:</span><span class="n">int</span><span class="p">,</span> <span class="n">item_id</span><span class="p">:</span><span class="n">int</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">:</span><span class="n">long</span><span class="p">);</span>
-</pre>
+</code></pre>
+
 <p>Using this data we would like to build a model that can predict user behavior so that we can show items
 to users that they are more likely to click on.  Since the data may be very large, we need to take a
 sample that is easier to work with.  We basically want to join on <code>(user_id,item_id)</code>, sample the result
 and produce training data with the following format:</p>
-<pre class="highlight pig"><span class="p">{(</span><span class="n">user_id</span><span class="p">:</span><span class="n">int</span><span class="p">,</span> <span class="n">item_id</span><span class="p">:</span><span class="n">int</span><span class="p">,</span> <span class="n">is_impressed</span><span class="p">:</span><span class="n">int</span><span class="p">,</span> <span class="n">is_clicked</span><span class="p">:</span><span class="n">int</span><span class="p">}</span>
-</pre>
+<pre class="highlight pig"><code><span class="p">{(</span><span class="n">user_id</span><span class="p">:</span><span class="n">int</span><span class="p">,</span> <span class="n">item_id</span><span class="p">:</span><span class="n">int</span><span class="p">,</span> <span class="n">is_impressed</span><span class="p">:</span><span class="n">int</span><span class="p">,</span> <span class="n">is_clicked</span><span class="p">:</span><span class="n">int</span><span class="p">}</span>
+</code></pre>
+
 <p>The problem with this approach though is that the join can be very expensive if the data size is large.
 Sampling reduces the data size, but it has to be applied after the join because the same <code>(user_id,item_id)</code>
 pairs won&#39;t be selected from <code>impressions</code> and <code>clicks</code>.</p>
@@ -245,21 +263,24 @@ hash to the key, rather than by invoking
 
 <p>Let&#39;s see how we can apply this to our example above.  We want to take a 10% sample of the joined clicks
 and impressions.  We start by defining the UDF:</p>
-<pre class="highlight pig"><span class="k">DEFINE</span> <span class="n">SampleByKey</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">sampling</span><span class="p">.</span><span class="n">SampleByKey</span><span class="p">(</span><span class="s1">'0.1'</span><span class="p">);</span>
-</pre>
+<pre class="highlight pig"><code><span class="k">DEFINE</span> <span class="n">SampleByKey</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">sampling</span><span class="p">.</span><span class="n">SampleByKey</span><span class="p">(</span><span class="s1">'0.1'</span><span class="p">);</span>
+</code></pre>
+
 <p>Since we are going to be joining on <code>(user_id,item_id)</code>, we need to sample using this pair:</p>
-<pre class="highlight pig"><span class="n">impressions</span> <span class="o">=</span> <span class="k">FILTER</span> <span class="n">impressions</span> <span class="k">BY</span> <span class="n">SampleByKey</span><span class="p">(</span><span class="n">user_id</span><span class="p">,</span><span class="n">item_id</span><span class="p">);</span>
+<pre class="highlight pig"><code><span class="n">impressions</span> <span class="o">=</span> <span class="k">FILTER</span> <span class="n">impressions</span> <span class="k">BY</span> <span class="n">SampleByKey</span><span class="p">(</span><span class="n">user_id</span><span class="p">,</span><span class="n">item_id</span><span class="p">);</span>
 <span class="n">clicks</span> <span class="o">=</span> <span class="k">FILTER</span> <span class="n">clicks</span> <span class="k">BY</span> <span class="n">SampleByKey</span><span class="p">(</span><span class="n">user_id</span><span class="p">,</span><span class="n">item_id</span><span class="p">);</span>
-</pre>
+</code></pre>
+
 <p>We can now join the impressions and clicks, with the knowledge that the same <code>(user_id,item_id)</code>
 pairs will appear in both samples.</p>
-<pre class="highlight pig"><span class="n">joined_sample</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">COGROUP</span> <span class="n">impressions</span> <span class="k">BY</span> <span class="p">(</span><span class="n">user_id</span><span class="p">,</span><span class="n">item_id</span><span class="p">),</span>
+<pre class="highlight pig"><code><span class="n">joined_sample</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">COGROUP</span> <span class="n">impressions</span> <span class="k">BY</span> <span class="p">(</span><span class="n">user_id</span><span class="p">,</span><span class="n">item_id</span><span class="p">),</span>
                                  <span class="n">clicks</span> <span class="k">BY</span> <span class="p">(</span><span class="n">user_id</span><span class="p">,</span><span class="n">item_id</span><span class="p">))</span> <span class="k">GENERATE</span>
   <span class="k">group</span><span class="p">.</span><span class="n">user_id</span> <span class="k">as</span> <span class="n">user_id</span><span class="p">,</span>
   <span class="k">group</span><span class="p">.</span><span class="n">item_id</span> <span class="k">as</span> <span class="n">item_id</span><span class="p">,</span>
   <span class="p">((</span><span class="k">SIZE</span><span class="p">(</span><span class="n">impressions</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="o">?</span> <span class="mi">1</span> <span class="p">:</span> <span class="mi">0</span><span class="p">))</span> <span class="k">as</span> <span class="n">is_impressed</span><span class="p">,</span>  
   <span class="p">((</span><span class="k">SIZE</span><span class="p">(</span><span class="n">clicks</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="o">?</span> <span class="mi">1</span> <span class="p">:</span> <span class="mi">0</span><span class="p">))</span> <span class="k">as</span> <span class="n">is_clicked</span><span class="p">;</span>
-</pre>
+</code></pre>
+
 <p>Since we have sampled before joining the data, this should be much more efficient.</p>
 
     </div>
@@ -267,8 +288,9 @@ pairs will appear in both samples.</p>
 
 
     
-      <div class="footer">
-Copyright &copy; 2011-2014 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
+      
+<div class="footer">
+Copyright &copy; 2011-2015 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
 Apache DataFu, DataFu, Apache Pig, Apache Hadoop, Hadoop, Apache, and the Apache feather logo are either registered trademarks or trademarks of the Apache Software Foundation in the United States and other countries.
 </div>
 

Modified: incubator/datafu/site/docs/datafu/guide/sessions.html
URL: http://svn.apache.org/viewvc/incubator/datafu/site/docs/datafu/guide/sessions.html?rev=1709884&r1=1709883&r2=1709884&view=diff
==============================================================================
--- incubator/datafu/site/docs/datafu/guide/sessions.html (original)
+++ incubator/datafu/site/docs/datafu/guide/sessions.html Wed Oct 21 17:00:40 2015
@@ -1,3 +1,5 @@
+
+
 <!doctype html>
 <html>
   <head>
@@ -10,11 +12,9 @@
     <!-- Use title if it's in the page YAML frontmatter -->
     <title>Sessions - Guide - Apache DataFu Pig</title>
     
-    <link href="/stylesheets/all.css" media="screen" rel="stylesheet" type="text/css" />
-<link href="/stylesheets/highlight.css" media="screen" rel="stylesheet" type="text/css" />
-    <script src="/javascripts/all.js" type="text/javascript"></script>
+    <link href="/stylesheets/all.css" rel="stylesheet" /><link href="/stylesheets/highlight.css" rel="stylesheet" />
+    <script src="/javascripts/all.js"></script>
 
-    
     <script type="text/javascript">
       var _gaq = _gaq || [];
       _gaq.push(['_setAccount', 'UA-30533336-2']);
@@ -26,14 +26,14 @@
         var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
       })();
     </script>
-    
   </head>
   
   <body class="docs docs_datafu docs_datafu_guide docs_datafu_guide_sessions">
 
     <div class="container">
 
-      <div class="header">
+      
+<div class="header">
 
   <ul class="nav nav-pills pull-right">
     <li><a href="/">Home</a></li>
@@ -48,12 +48,18 @@
       
   <div class="row">
     <div class="col-md-3">
-      <h4>Apache DataFu Pig</h4>
+      
+<h4>Apache DataFu</h4>
+<ul class="nav nav-pills nav-stacked">
+  <li><a href="/">Home</a></li>
+  <li><a href="/docs/quick-start.html">Quick Start</a></li>
+</ul>
+
+<h4>Apache DataFu Pig</h4>
 <ul class="nav nav-pills nav-stacked">
   <li><a href="/docs/datafu/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/datafu/guide.html">Guide</a></li>
   <li><a href="/docs/datafu/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/datafu/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Apache DataFu Hourglass</h4>
@@ -61,18 +67,18 @@
   <li><a href="/docs/hourglass/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/hourglass/concepts.html">Concepts</a></li>
   <li><a href="/docs/hourglass/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/hourglass/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Community</h4>
 <ul class="nav nav-pills nav-stacked">
+  <li><a href="/community/contributing.html">Contributing</a></li>
   <li><a href="/community/mailing-lists.html">Mailing Lists</a></li>
   <li><a href="https://issues.apache.org/jira/browse/DATAFU">Bugs</a></li>
 </ul>
     </div>
     <div class="col-md-7">
       <h4 class="text-muted">Apache DataFu Pig - Guide</h4>
-      <h2 id="toc_0">Sessions</h2>
+      <h2 id="sessions">Sessions</h2>
 
 <p>A &#39;session&#39; is a useful concept when analyzing user activity on a website.  We essentially
 define a session as sustained user activity.  By assigning events to sessions we can perform
@@ -80,14 +86,15 @@ analysis on user sessions and draw usefu
 
 <p>For example, suppose that we have a stream of page views by user.  Each page view can be
 represented by a member ID, a timestamp, and a URL:</p>
-<pre class="highlight pig"><span class="n">pv</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'pageviews.csv'</span> <span class="k">USING</span> <span class="n">PigStorage</span><span class="p">(</span><span class="s1">','</span><span class="p">)</span>
+<pre class="highlight pig"><code><span class="n">pv</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'pageviews.csv'</span> <span class="k">USING</span> <span class="n">PigStorage</span><span class="p">(</span><span class="s1">','</span><span class="p">)</span>
      <span class="k">AS</span> <span class="p">(</span><span class="n">memberId</span><span class="p">:</span><span class="n">int</span><span class="p">,</span> <span class="n">time</span><span class="p">:</span><span class="n">long</span><span class="p">,</span> <span class="n">url</span><span class="p">:</span><span class="n">chararray</span><span class="p">);</span>
-</pre>
+</code></pre>
+
 <p>One statistic that may be useful to know is how long users tend to stay active on the website.
 When they visit, do they tend to stick around for a long time and view many pages?  Or is it
 typically a very brief session?  Apache DataFu provides UDFs that help in this sort of analysis.  </p>
 
-<h3 id="toc_1">Sessionization</h3>
+<h3 id="sessionization">Sessionization</h3>
 
 <p>The <a href="/docs/datafu/1.2.0/datafu/pig/sessions/Sessionize.html">Sessionize</a>
 UDF can be used to assign unique session IDs to events within a stream.  Events are passed to the
@@ -100,35 +107,39 @@ event of each session.  With the session
 
 <p>First we need to choose a threshold for the <code>Sessionize</code> UDF.  We&#39;ll consider &quot;10 minutes&quot; a sufficient
 amount of time:</p>
-<pre class="highlight pig"><span class="k">DEFINE</span> <span class="n">Sessionize</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">sessions</span><span class="p">.</span><span class="n">Sessionize</span><span class="p">(</span><span class="s1">'10m'</span><span class="p">);</span>
-</pre>
+<pre class="highlight pig"><code><span class="k">DEFINE</span> <span class="n">Sessionize</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">sessions</span><span class="p">.</span><span class="n">Sessionize</span><span class="p">(</span><span class="s1">'10m'</span><span class="p">);</span>
+</code></pre>
+
 <p>We&#39;ll also define functions to compute various statistics.  In this example we&#39;ll compute the median,
 90th and 95th percentiles, and variance of the session lengths.</p>
-<pre class="highlight pig"><span class="k">DEFINE</span> <span class="n">Median</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">stats</span><span class="p">.</span><span class="n">StreamingMedian</span><span class="p">();</span>
+<pre class="highlight pig"><code><span class="k">DEFINE</span> <span class="n">Median</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">stats</span><span class="p">.</span><span class="n">StreamingMedian</span><span class="p">();</span>
 <span class="k">DEFINE</span> <span class="n">Quantile</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">stats</span><span class="p">.</span><span class="n">StreamingQuantile</span><span class="p">(</span><span class="s1">'0.9'</span><span class="p">,</span><span class="s1">'0.95'</span><span class="p">);</span>
 <span class="k">DEFINE</span> <span class="n">VAR</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">VAR</span><span class="p">();</span>
-</pre>
+</code></pre>
+
 <p>Next we&#39;ll sessionize the data.  We group by member and sort the events by time.  <code>Sessionize</code> appends
 the session ID to each tuple.  Events for a member that are within 10 minutes of each other will be
 assigned to the same session.</p>
-<pre class="highlight pig"><span class="n">pv</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">pv</span> <span class="k">GENERATE</span> <span class="n">time</span><span class="p">,</span> <span class="n">memberId</span><span class="p">;</span>
+<pre class="highlight pig"><code><span class="n">pv</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">pv</span> <span class="k">GENERATE</span> <span class="n">time</span><span class="p">,</span> <span class="n">memberId</span><span class="p">;</span>
 <span class="n">pv_sessionized</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">GROUP</span> <span class="n">pv</span> <span class="k">BY</span> <span class="n">memberId</span><span class="p">)</span> <span class="p">{</span>
   <span class="n">ordered</span> <span class="o">=</span> <span class="k">ORDER</span> <span class="n">pv</span> <span class="k">BY</span> <span class="n">time</span><span class="p">;</span>
   <span class="k">GENERATE</span> <span class="k">FLATTEN</span><span class="p">(</span><span class="n">Sessionize</span><span class="p">(</span><span class="n">ordered</span><span class="p">))</span>
            <span class="k">AS</span> <span class="p">(</span><span class="n">time</span><span class="p">,</span><span class="n">memberId</span><span class="p">,</span><span class="n">sessionId</span><span class="p">);</span>
 <span class="p">}</span>
-</pre>
+</code></pre>
+
 <p>Now that the data is sessionized, we can compute the session lengths:</p>
-<pre class="highlight pig"><span class="n">session_times</span> <span class="o">=</span>
+<pre class="highlight pig"><code><span class="n">session_times</span> <span class="o">=</span>
   <span class="k">FOREACH</span> <span class="p">(</span><span class="k">GROUP</span> <span class="n">pv_sessionized</span> <span class="k">BY</span> <span class="p">(</span><span class="n">sessionId</span><span class="p">,</span><span class="n">memberId</span><span class="p">))</span> <span class="p">{</span>
     <span class="k">GENERATE</span> <span class="k">group</span><span class="p">.</span><span class="n">sessionId</span> <span class="k">as</span> <span class="n">sessionId</span><span class="p">,</span>
              <span class="k">group</span><span class="p">.</span><span class="n">memberId</span> <span class="k">as</span> <span class="n">memberId</span><span class="p">,</span>
              <span class="p">(</span><span class="k">MAX</span><span class="p">(</span><span class="n">pv_sessionized</span><span class="p">.</span><span class="n">time</span><span class="p">)</span> <span class="o">-</span> <span class="k">MIN</span><span class="p">(</span><span class="n">pv_sessionized</span><span class="p">.</span><span class="n">time</span><span class="p">))</span>
                <span class="o">/</span> <span class="mi">1000</span><span class="p">.</span><span class="mi">0</span> <span class="o">/</span> <span class="mi">60</span><span class="p">.</span><span class="mi">0</span> <span class="k">as</span> <span class="n">session_length</span><span class="p">;</span>
 <span class="p">}</span>
-</pre>
+</code></pre>
+
 <p>Finally let&#39;s compute our statistics:</p>
-<pre class="highlight pig"><span class="n">session_stats</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">GROUP</span> <span class="n">session_times</span> <span class="k">ALL</span><span class="p">)</span> <span class="p">{</span>
+<pre class="highlight pig"><code><span class="n">session_stats</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">GROUP</span> <span class="n">session_times</span> <span class="k">ALL</span><span class="p">)</span> <span class="p">{</span>
   <span class="k">GENERATE</span>
     <span class="k">AVG</span><span class="p">(</span><span class="n">session_times</span><span class="p">.</span><span class="n">session_length</span><span class="p">)</span> <span class="k">as</span> <span class="n">avg_session</span><span class="p">,</span>
     <span class="k">SQRT</span><span class="p">(</span><span class="n">VAR</span><span class="p">(</span><span class="n">session_times</span><span class="p">.</span><span class="n">session_length</span><span class="p">))</span> <span class="k">as</span> <span class="n">std_dev_session</span><span class="p">,</span>
@@ -137,18 +148,20 @@ assigned to the same session.</p>
 <span class="p">}</span>
 
 <span class="k">DUMP</span> <span class="n">session_stats</span><span class="p">;</span>
-</pre>
+</code></pre>
+
 <p>With the session statistics computed, we can now perform some interesting queries.  For example,
 let&#39;s get the list of users who had sessions in the upper 95th percentile.  These are the users
 who were most engaged in our website.</p>
-<pre class="highlight pig"><span class="n">long_sessions</span> <span class="o">=</span> <span class="k">FILTER</span> <span class="n">session_times</span> <span class="k">BY</span>
+<pre class="highlight pig"><code><span class="n">long_sessions</span> <span class="o">=</span> <span class="k">FILTER</span> <span class="n">session_times</span> <span class="k">BY</span>
   <span class="n">session_length</span> <span class="o">&gt;</span> <span class="n">session_stats</span><span class="p">.</span><span class="n">quantiles_session</span><span class="p">.</span><span class="n">quantile_0_95</span><span class="p">;</span>
 
 <span class="n">very_engaged_users</span> <span class="o">=</span> <span class="k">DISTINCT</span> <span class="p">(</span><span class="k">FOREACH</span> <span class="n">long_sessions</span> <span class="k">GENERATE</span> <span class="n">memberId</span><span class="p">);</span>
 
 <span class="k">DUMP</span> <span class="n">very_engaged_users</span><span class="p">;</span>
-</pre>
-<h3 id="toc_2">Counting Sessions</h3>
+</code></pre>
+
+<h3 id="counting-sessions">Counting Sessions</h3>
 
 <p><a href="/docs/datafu/1.2.0/datafu/pig/sessions/SessionCount.html">SessionCount</a>
 can be used to count sessions.  It works very similarly to
@@ -161,30 +174,34 @@ additional page views are significant an
 <code>SessionCount</code> can help with this.</p>
 
 <p>First we&#39;ll define the UDF and specify a 10 minute threshold:</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">SessionCount</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">sessions</span><span class="p">.</span><span class="n">SessionCount</span><span class="p">(</span><span class="s1">'10m'</span><span class="p">);</span>
-</pre>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">SessionCount</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">sessions</span><span class="p">.</span><span class="n">SessionCount</span><span class="p">(</span><span class="s1">'10m'</span><span class="p">);</span>
+</code></pre>
+
 <p>We then perform the same procedure as before, sorting the events by time and passing them into the UDF.
 This time we get a count as output instead of a bag of sessionized events.</p>
-<pre class="highlight pig"><span class="n">pv_sessionized</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">GROUP</span> <span class="n">pv</span> <span class="k">BY</span> <span class="p">(</span><span class="n">memberId</span><span class="p">,</span><span class="n">url</span><span class="p">))</span> <span class="p">{</span>
+<pre class="highlight pig"><code><span class="n">pv_sessionized</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">GROUP</span> <span class="n">pv</span> <span class="k">BY</span> <span class="p">(</span><span class="n">memberId</span><span class="p">,</span><span class="n">url</span><span class="p">))</span> <span class="p">{</span>
   <span class="n">ordered</span> <span class="o">=</span> <span class="k">ORDER</span> <span class="n">pv</span> <span class="k">BY</span> <span class="n">time</span><span class="p">;</span>
   <span class="k">GENERATE</span> <span class="k">group</span><span class="p">.</span><span class="n">memberId</span> <span class="k">as</span> <span class="n">memberId</span><span class="p">,</span>
            <span class="k">group</span><span class="p">.</span><span class="n">url</span> <span class="k">as</span> <span class="n">url</span><span class="p">,</span>
            <span class="k">FLATTEN</span><span class="p">(</span><span class="n">SessionCount</span><span class="p">(</span><span class="n">ordered</span><span class="p">.</span><span class="n">time</span><span class="p">))</span> <span class="k">as</span> <span class="k">count</span><span class="p">;</span>
 <span class="p">}</span>
-</pre>
+</code></pre>
+
 <p>We now have the page view counts grouped by member and URL.  Now we can perform one more group to get the
 total page views across all members and URLs.</p>
-<pre class="highlight pig"><span class="n">pv_sum</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">GROUP</span> <span class="n">pv_sessionized</span> <span class="k">ALL</span><span class="p">)</span> 
+<pre class="highlight pig"><code><span class="n">pv_sum</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">GROUP</span> <span class="n">pv_sessionized</span> <span class="k">ALL</span><span class="p">)</span> 
          <span class="k">GENERATE</span> <span class="k">SUM</span><span class="p">(</span><span class="n">pv_sessionized</span><span class="p">.</span><span class="k">count</span><span class="p">)</span> <span class="k">as</span> <span class="n">total_pvs</span><span class="p">;</span>
 <span class="k">DUMP</span> <span class="n">pv_sum</span><span class="p">;</span>
-</pre>
+</code></pre>
+
     </div>
   </div>
 
 
     
-      <div class="footer">
-Copyright &copy; 2011-2014 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
+      
+<div class="footer">
+Copyright &copy; 2011-2015 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
 Apache DataFu, DataFu, Apache Pig, Apache Hadoop, Hadoop, Apache, and the Apache feather logo are either registered trademarks or trademarks of the Apache Software Foundation in the United States and other countries.
 </div>
 

Modified: incubator/datafu/site/docs/datafu/guide/set-operations.html
URL: http://svn.apache.org/viewvc/incubator/datafu/site/docs/datafu/guide/set-operations.html?rev=1709884&r1=1709883&r2=1709884&view=diff
==============================================================================
--- incubator/datafu/site/docs/datafu/guide/set-operations.html (original)
+++ incubator/datafu/site/docs/datafu/guide/set-operations.html Wed Oct 21 17:00:40 2015
@@ -1,3 +1,5 @@
+
+
 <!doctype html>
 <html>
   <head>
@@ -10,11 +12,9 @@
     <!-- Use title if it's in the page YAML frontmatter -->
     <title>Set Operations - Guide - Apache DataFu Pig</title>
     
-    <link href="/stylesheets/all.css" media="screen" rel="stylesheet" type="text/css" />
-<link href="/stylesheets/highlight.css" media="screen" rel="stylesheet" type="text/css" />
-    <script src="/javascripts/all.js" type="text/javascript"></script>
+    <link href="/stylesheets/all.css" rel="stylesheet" /><link href="/stylesheets/highlight.css" rel="stylesheet" />
+    <script src="/javascripts/all.js"></script>
 
-    
     <script type="text/javascript">
       var _gaq = _gaq || [];
       _gaq.push(['_setAccount', 'UA-30533336-2']);
@@ -26,14 +26,14 @@
         var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
       })();
     </script>
-    
   </head>
   
   <body class="docs docs_datafu docs_datafu_guide docs_datafu_guide_set-operations">
 
     <div class="container">
 
-      <div class="header">
+      
+<div class="header">
 
   <ul class="nav nav-pills pull-right">
     <li><a href="/">Home</a></li>
@@ -48,12 +48,18 @@
       
   <div class="row">
     <div class="col-md-3">
-      <h4>Apache DataFu Pig</h4>
+      
+<h4>Apache DataFu</h4>
+<ul class="nav nav-pills nav-stacked">
+  <li><a href="/">Home</a></li>
+  <li><a href="/docs/quick-start.html">Quick Start</a></li>
+</ul>
+
+<h4>Apache DataFu Pig</h4>
 <ul class="nav nav-pills nav-stacked">
   <li><a href="/docs/datafu/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/datafu/guide.html">Guide</a></li>
   <li><a href="/docs/datafu/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/datafu/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Apache DataFu Hourglass</h4>
@@ -61,25 +67,25 @@
   <li><a href="/docs/hourglass/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/hourglass/concepts.html">Concepts</a></li>
   <li><a href="/docs/hourglass/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/hourglass/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Community</h4>
 <ul class="nav nav-pills nav-stacked">
+  <li><a href="/community/contributing.html">Contributing</a></li>
   <li><a href="/community/mailing-lists.html">Mailing Lists</a></li>
   <li><a href="https://issues.apache.org/jira/browse/DATAFU">Bugs</a></li>
 </ul>
     </div>
     <div class="col-md-7">
       <h4 class="text-muted">Apache DataFu Pig - Guide</h4>
-      <h2 id="toc_0">Set Operations</h2>
+      <h2 id="set-operations">Set Operations</h2>
 
 <p>Apache DataFu has several methods for performing <a href="http://en.wikipedia.org/wiki/Set_%28mathematics%29">set operations</a> on bags.</p>
 
-<h3 id="toc_1">Set Intersection</h3>
+<h3 id="set-intersection">Set Intersection</h3>
 
 <p>Compute the <a href="http://en.wikipedia.org/wiki/Set_%28mathematics%29#Intersections">set intersection</a> with <a href="/docs/datafu/1.2.0/datafu/pig/sets/SetIntersect.html">SetIntersect</a>:</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">SetIntersect</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">sets</span><span class="p">.</span><span class="n">SetIntersect</span><span class="p">();</span>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">SetIntersect</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">sets</span><span class="p">.</span><span class="n">SetIntersect</span><span class="p">();</span>
 
 <span class="c1">-- ({(3),(4),(1),(2),(7),(5),(6)},{(0),(5),(10),(1),(4)})
 </span><span class="n">input</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">B1</span><span class="p">:</span><span class="n">bag</span><span class="p">{</span><span class="n">T</span><span class="p">:</span><span class="n">tuple</span><span class="p">(</span><span class="n">val</span><span class="p">:</span><span class="n">int</span><span class="p">)},</span><span class="n">B2</span><span class="p">:</span><span class="n">bag</span><span class="p">{</span><span class="n">T</span><span class="p">:</span><span class="n">tuple</span><span class="p">(</span><span class="n">val</span><span class="p">:</span><span class="n">int</span><span class="p">)});</span>
@@ -92,11 +98,12 @@
 
 <span class="c1">-- produces: ({(1),(4),(5)})
 </span><span class="k">DUMP</span> <span class="n">intersected</span><span class="p">;</span>
-</pre>
-<h3 id="toc_2">Set Union</h3>
+</code></pre>
+
+<h3 id="set-union">Set Union</h3>
 
 <p>Compute the <a href="http://en.wikipedia.org/wiki/Set_%28mathematics%29#Unions">set union</a> with <a href="/docs/datafu/1.2.0/datafu/pig/sets/SetUnion.html">SetUnion</a>:</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">SetUnion</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">sets</span><span class="p">.</span><span class="n">SetUnion</span><span class="p">();</span>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">SetUnion</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">sets</span><span class="p">.</span><span class="n">SetUnion</span><span class="p">();</span>
 
 <span class="c1">-- ({(3),(4),(1),(2),(7),(5),(6)},{(0),(5),(10),(1),(4)})
 </span><span class="n">input</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">B1</span><span class="p">:</span><span class="n">bag</span><span class="p">{</span><span class="n">T</span><span class="p">:</span><span class="n">tuple</span><span class="p">(</span><span class="n">val</span><span class="p">:</span><span class="n">int</span><span class="p">)},</span><span class="n">B2</span><span class="p">:</span><span class="n">bag</span><span class="p">{</span><span class="n">T</span><span class="p">:</span><span class="n">tuple</span><span class="p">(</span><span class="n">val</span><span class="p">:</span><span class="n">int</span><span class="p">)});</span>
@@ -105,14 +112,16 @@
 
 <span class="c1">-- produces: ({(3),(4),(1),(2),(7),(5),(6),(0),(10)})
 </span><span class="k">DUMP</span> <span class="n">unioned</span><span class="p">;</span>
-</pre>
+</code></pre>
+
 <p>This can also operate on multiple bags:</p>
-<pre class="highlight pig"><span class="n">intersected</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">input</span> <span class="k">GENERATE</span> <span class="n">SetUnion</span><span class="p">(</span><span class="n">B1</span><span class="p">,</span><span class="n">B2</span><span class="p">,</span><span class="n">B3</span><span class="p">);</span>
-</pre>
-<h3 id="toc_3">Set Difference</h3>
+<pre class="highlight pig"><code><span class="n">intersected</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">input</span> <span class="k">GENERATE</span> <span class="n">SetUnion</span><span class="p">(</span><span class="n">B1</span><span class="p">,</span><span class="n">B2</span><span class="p">,</span><span class="n">B3</span><span class="p">);</span>
+</code></pre>
+
+<h3 id="set-difference">Set Difference</h3>
 
 <p>Compute the <a href="http://en.wikipedia.org/wiki/Set_%28mathematics%29#Complements">set difference</a> with <a href="/docs/datafu/1.2.0/datafu/pig/sets/SetDifference.html">SetDifference</a>:</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">SetDifference</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">sets</span><span class="p">.</span><span class="n">SetDifference</span><span class="p">();</span>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">SetDifference</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">sets</span><span class="p">.</span><span class="n">SetDifference</span><span class="p">();</span>
 
 <span class="c1">-- ({(3),(4),(1),(2),(7),(5),(6)},{(1),(3),(5),(12)})
 </span><span class="n">input</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">B1</span><span class="p">:</span><span class="n">bag</span><span class="p">{</span><span class="n">T</span><span class="p">:</span><span class="n">tuple</span><span class="p">(</span><span class="n">val</span><span class="p">:</span><span class="n">int</span><span class="p">)},</span><span class="n">B2</span><span class="p">:</span><span class="n">bag</span><span class="p">{</span><span class="n">T</span><span class="p">:</span><span class="n">tuple</span><span class="p">(</span><span class="n">val</span><span class="p">:</span><span class="n">int</span><span class="p">)});</span>
@@ -126,14 +135,16 @@
 
 <span class="c1">-- produces: ({(2),(4),(6),(7)})
 </span><span class="k">DUMP</span> <span class="n">differenced</span><span class="p">;</span>
-</pre>
+</code></pre>
+
     </div>
   </div>
 
 
     
-      <div class="footer">
-Copyright &copy; 2011-2014 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
+      
+<div class="footer">
+Copyright &copy; 2011-2015 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
 Apache DataFu, DataFu, Apache Pig, Apache Hadoop, Hadoop, Apache, and the Apache feather logo are either registered trademarks or trademarks of the Apache Software Foundation in the United States and other countries.
 </div>
 

Modified: incubator/datafu/site/docs/datafu/guide/statistics.html
URL: http://svn.apache.org/viewvc/incubator/datafu/site/docs/datafu/guide/statistics.html?rev=1709884&r1=1709883&r2=1709884&view=diff
==============================================================================
--- incubator/datafu/site/docs/datafu/guide/statistics.html (original)
+++ incubator/datafu/site/docs/datafu/guide/statistics.html Wed Oct 21 17:00:40 2015
@@ -1,3 +1,5 @@
+
+
 <!doctype html>
 <html>
   <head>
@@ -10,11 +12,9 @@
     <!-- Use title if it's in the page YAML frontmatter -->
     <title>Statistics - Guide - Apache DataFu Pig</title>
     
-    <link href="/stylesheets/all.css" media="screen" rel="stylesheet" type="text/css" />
-<link href="/stylesheets/highlight.css" media="screen" rel="stylesheet" type="text/css" />
-    <script src="/javascripts/all.js" type="text/javascript"></script>
+    <link href="/stylesheets/all.css" rel="stylesheet" /><link href="/stylesheets/highlight.css" rel="stylesheet" />
+    <script src="/javascripts/all.js"></script>
 
-    
     <script type="text/javascript">
       var _gaq = _gaq || [];
       _gaq.push(['_setAccount', 'UA-30533336-2']);
@@ -26,14 +26,14 @@
         var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
       })();
     </script>
-    
   </head>
   
   <body class="docs docs_datafu docs_datafu_guide docs_datafu_guide_statistics">
 
     <div class="container">
 
-      <div class="header">
+      
+<div class="header">
 
   <ul class="nav nav-pills pull-right">
     <li><a href="/">Home</a></li>
@@ -48,12 +48,18 @@
       
   <div class="row">
     <div class="col-md-3">
-      <h4>Apache DataFu Pig</h4>
+      
+<h4>Apache DataFu</h4>
+<ul class="nav nav-pills nav-stacked">
+  <li><a href="/">Home</a></li>
+  <li><a href="/docs/quick-start.html">Quick Start</a></li>
+</ul>
+
+<h4>Apache DataFu Pig</h4>
 <ul class="nav nav-pills nav-stacked">
   <li><a href="/docs/datafu/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/datafu/guide.html">Guide</a></li>
   <li><a href="/docs/datafu/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/datafu/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Apache DataFu Hourglass</h4>
@@ -61,20 +67,20 @@
   <li><a href="/docs/hourglass/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/hourglass/concepts.html">Concepts</a></li>
   <li><a href="/docs/hourglass/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/hourglass/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Community</h4>
 <ul class="nav nav-pills nav-stacked">
+  <li><a href="/community/contributing.html">Contributing</a></li>
   <li><a href="/community/mailing-lists.html">Mailing Lists</a></li>
   <li><a href="https://issues.apache.org/jira/browse/DATAFU">Bugs</a></li>
 </ul>
     </div>
     <div class="col-md-7">
       <h4 class="text-muted">Apache DataFu Pig - Guide</h4>
-      <h2 id="toc_0">Statistics</h2>
+      <h2 id="statistics">Statistics</h2>
 
-<h3 id="toc_1">Median</h3>
+<h3 id="median">Median</h3>
 
 <p>Apache DataFu has two UDFs that can be used to compute the <a href="http://en.wikipedia.org/wiki/Median">median</a> of a bag.
 <a href="/docs/datafu/1.2.0/datafu/pig/stats/Median.html">Median</a> computes the median exactly, but
@@ -83,15 +89,16 @@ on the other hand, does not require that
 the input bag to be sorted, it is more efficient.</p>
 
 <p>Let&#39;s take a look at computing the median using <code>StreamingMedian</code>:</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">Median</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">stats</span><span class="p">.</span><span class="n">StreamingMedian</span><span class="p">();</span>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">Median</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">stats</span><span class="p">.</span><span class="n">StreamingMedian</span><span class="p">();</span>
 
 <span class="c1">-- input: 3,5,4,1,2
 </span><span class="n">input</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">val</span><span class="p">:</span><span class="n">int</span><span class="p">);</span>
 
 <span class="c1">-- produces: 3
 </span><span class="n">medians</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">GROUP</span> <span class="n">input</span> <span class="k">ALL</span><span class="p">)</span> <span class="k">GENERATE</span> <span class="n">Median</span><span class="p">(</span><span class="n">input</span><span class="p">.</span><span class="n">val</span><span class="p">);</span>
-</pre>
-<h3 id="toc_2">Quantiles</h3>
+</code></pre>
+
+<h3 id="quantiles">Quantiles</h3>
 
 <p><a href="http://en.wikipedia.org/wiki/Quantile">Quantiles</a> are points at regular intervals within an ordered data set.  Essentially
 we divide an ordered data set into segments, and the quantiles are the values between the segments.  The quantiles people are probably
@@ -103,33 +110,36 @@ and <a href="/docs/datafu/1.2.0/datafu/p
 the quantiles of a bag that does not need to be sorted.</p>
 
 <p>Let&#39;s take a look at computing quantiles using <code>StreamingQuantile</code>:</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">Quantile</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">stats</span><span class="p">.</span><span class="n">StreamingQuantile</span><span class="p">(</span><span class="s1">'0.0'</span><span class="p">,</span><span class="s1">'0.5'</span><span class="p">,</span><span class="s1">'1.0'</span><span class="p">);</span>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">Quantile</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">stats</span><span class="p">.</span><span class="n">StreamingQuantile</span><span class="p">(</span><span class="s1">'0.0'</span><span class="p">,</span><span class="s1">'0.5'</span><span class="p">,</span><span class="s1">'1.0'</span><span class="p">);</span>
 
 <span class="c1">-- input: 9,10,2,3,5,8,1,4,6,7
 </span><span class="n">input</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">val</span><span class="p">:</span><span class="n">int</span><span class="p">);</span>
 
 <span class="c1">-- produces: (1,5.5,10)
 </span><span class="n">quantiles</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">GROUP</span> <span class="n">input</span> <span class="k">ALL</span><span class="p">)</span> <span class="k">GENERATE</span> <span class="n">Quantile</span><span class="p">(</span><span class="n">input</span><span class="p">.</span><span class="n">val</span><span class="p">);</span>
-</pre>
-<h3 id="toc_3">Variance</h3>
+</code></pre>
+
+<h3 id="variance">Variance</h3>
 
 <p><a href="http://en.wikipedia.org/wiki/Variance">Variance</a> can be computed using the <a href="/docs/datafu/1.2.0/datafu/pig/stats/VAR.html">VAR</a>
 UDF:</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">VAR</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">stats</span><span class="p">.</span><span class="n">VAR</span><span class="p">();</span>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">VAR</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">stats</span><span class="p">.</span><span class="n">VAR</span><span class="p">();</span>
 
 <span class="c1">-- input: 1,2,3,4,5,6,7,8,9
 </span><span class="n">input</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">val</span><span class="p">:</span><span class="n">int</span><span class="p">);</span>
 
 <span class="c1">-- produces: 6.666666666666668
 </span><span class="n">variance</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">GROUP</span> <span class="n">input</span> <span class="k">ALL</span><span class="p">)</span> <span class="k">GENERATE</span> <span class="n">VAR</span><span class="p">(</span><span class="n">input</span><span class="p">.</span><span class="n">val</span><span class="p">);</span>
-</pre>
+</code></pre>
+
     </div>
   </div>
 
 
     
-      <div class="footer">
-Copyright &copy; 2011-2014 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
+      
+<div class="footer">
+Copyright &copy; 2011-2015 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
 Apache DataFu, DataFu, Apache Pig, Apache Hadoop, Hadoop, Apache, and the Apache feather logo are either registered trademarks or trademarks of the Apache Software Foundation in the United States and other countries.
 </div>