You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datafu.apache.org by mh...@apache.org on 2015/10/21 19:00:40 UTC

svn commit: r1709884 [4/8] - in /incubator/datafu/site: ./ blog/ blog/2012/01/10/ blog/2013/01/24/ blog/2013/09/04/ blog/2013/10/03/ blog/2014/04/27/ community/ docs/ docs/datafu/ docs/datafu/guide/ docs/hourglass/ javascripts/ stylesheets/

Modified: incubator/datafu/site/community/mailing-lists.html
URL: http://svn.apache.org/viewvc/incubator/datafu/site/community/mailing-lists.html?rev=1709884&r1=1709883&r2=1709884&view=diff
==============================================================================
--- incubator/datafu/site/community/mailing-lists.html (original)
+++ incubator/datafu/site/community/mailing-lists.html Wed Oct 21 17:00:40 2015
@@ -1,3 +1,5 @@
+
+
 <!doctype html>
 <html>
   <head>
@@ -8,13 +10,11 @@
     <meta name="google-site-verification" content="9N7qTOUYyX4kYfXYc0OIomWJku3PVvGrf6oTNWg2CHI" />
     
     <!-- Use title if it's in the page YAML frontmatter -->
-    <title>Mailing Lists - Apache DataFu Community</title>
+    <title>Mailing Lists - Community</title>
     
-    <link href="/stylesheets/all.css" media="screen" rel="stylesheet" type="text/css" />
-<link href="/stylesheets/highlight.css" media="screen" rel="stylesheet" type="text/css" />
-    <script src="/javascripts/all.js" type="text/javascript"></script>
+    <link href="/stylesheets/all.css" rel="stylesheet" /><link href="/stylesheets/highlight.css" rel="stylesheet" />
+    <script src="/javascripts/all.js"></script>
 
-    
     <script type="text/javascript">
       var _gaq = _gaq || [];
       _gaq.push(['_setAccount', 'UA-30533336-2']);
@@ -26,14 +26,14 @@
         var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
       })();
     </script>
-    
   </head>
   
   <body class="community community_mailing-lists">
 
     <div class="container">
 
-      <div class="header">
+      
+<div class="header">
 
   <ul class="nav nav-pills pull-right">
     <li><a href="/">Home</a></li>
@@ -48,12 +48,18 @@
       
   <div class="row">
     <div class="col-md-3">
-      <h4>Apache DataFu Pig</h4>
+      
+<h4>Apache DataFu</h4>
+<ul class="nav nav-pills nav-stacked">
+  <li><a href="/">Home</a></li>
+  <li><a href="/docs/quick-start.html">Quick Start</a></li>
+</ul>
+
+<h4>Apache DataFu Pig</h4>
 <ul class="nav nav-pills nav-stacked">
   <li><a href="/docs/datafu/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/datafu/guide.html">Guide</a></li>
   <li><a href="/docs/datafu/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/datafu/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Apache DataFu Hourglass</h4>
@@ -61,18 +67,18 @@
   <li><a href="/docs/hourglass/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/hourglass/concepts.html">Concepts</a></li>
   <li><a href="/docs/hourglass/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/hourglass/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Community</h4>
 <ul class="nav nav-pills nav-stacked">
+  <li><a href="/community/contributing.html">Contributing</a></li>
   <li><a href="/community/mailing-lists.html">Mailing Lists</a></li>
   <li><a href="https://issues.apache.org/jira/browse/DATAFU">Bugs</a></li>
 </ul>
     </div>
     <div class="col-md-7">
-      
-      <h1 id="toc_0">Mailing Lists</h1>
+      <h4 class="text-muted">Community</h4>
+      <h1 id="mailing-lists">Mailing Lists</h1>
 
 <p>We have a few mailing lists hosted by Apache:</p>
 
@@ -88,8 +94,9 @@ A list to track DataFu commits.</p></li>
 
 
     
-      <div class="footer">
-Copyright &copy; 2011-2014 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
+      
+<div class="footer">
+Copyright &copy; 2011-2015 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
 Apache DataFu, DataFu, Apache Pig, Apache Hadoop, Hadoop, Apache, and the Apache feather logo are either registered trademarks or trademarks of the Apache Software Foundation in the United States and other countries.
 </div>
 

Modified: incubator/datafu/site/docs/datafu/getting-started.html
URL: http://svn.apache.org/viewvc/incubator/datafu/site/docs/datafu/getting-started.html?rev=1709884&r1=1709883&r2=1709884&view=diff
==============================================================================
--- incubator/datafu/site/docs/datafu/getting-started.html (original)
+++ incubator/datafu/site/docs/datafu/getting-started.html Wed Oct 21 17:00:40 2015
@@ -1,3 +1,5 @@
+
+
 <!doctype html>
 <html>
   <head>
@@ -10,11 +12,9 @@
     <!-- Use title if it's in the page YAML frontmatter -->
     <title>Getting Started - Apache DataFu Pig</title>
     
-    <link href="/stylesheets/all.css" media="screen" rel="stylesheet" type="text/css" />
-<link href="/stylesheets/highlight.css" media="screen" rel="stylesheet" type="text/css" />
-    <script src="/javascripts/all.js" type="text/javascript"></script>
+    <link href="/stylesheets/all.css" rel="stylesheet" /><link href="/stylesheets/highlight.css" rel="stylesheet" />
+    <script src="/javascripts/all.js"></script>
 
-    
     <script type="text/javascript">
       var _gaq = _gaq || [];
       _gaq.push(['_setAccount', 'UA-30533336-2']);
@@ -26,14 +26,14 @@
         var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
       })();
     </script>
-    
   </head>
   
   <body class="docs docs_datafu docs_datafu_getting-started">
 
     <div class="container">
 
-      <div class="header">
+      
+<div class="header">
 
   <ul class="nav nav-pills pull-right">
     <li><a href="/">Home</a></li>
@@ -48,12 +48,18 @@
       
   <div class="row">
     <div class="col-md-3">
-      <h4>Apache DataFu Pig</h4>
+      
+<h4>Apache DataFu</h4>
+<ul class="nav nav-pills nav-stacked">
+  <li><a href="/">Home</a></li>
+  <li><a href="/docs/quick-start.html">Quick Start</a></li>
+</ul>
+
+<h4>Apache DataFu Pig</h4>
 <ul class="nav nav-pills nav-stacked">
   <li><a href="/docs/datafu/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/datafu/guide.html">Guide</a></li>
   <li><a href="/docs/datafu/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/datafu/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Apache DataFu Hourglass</h4>
@@ -61,18 +67,18 @@
   <li><a href="/docs/hourglass/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/hourglass/concepts.html">Concepts</a></li>
   <li><a href="/docs/hourglass/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/hourglass/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Community</h4>
 <ul class="nav nav-pills nav-stacked">
+  <li><a href="/community/contributing.html">Contributing</a></li>
   <li><a href="/community/mailing-lists.html">Mailing Lists</a></li>
   <li><a href="https://issues.apache.org/jira/browse/DATAFU">Bugs</a></li>
 </ul>
     </div>
     <div class="col-md-7">
       <h4 class="text-muted">Apache DataFu Pig</h4>
-      <h1 id="toc_0">Getting Started</h1>
+      <h1 id="getting-started">Getting Started</h1>
 
 <p>Apache DataFu Pig is a collection of user-defined functions for working with large scale data in <a href="http://pig.apache.org/">Apache Pig</a>.  It has a number of useful functions available:</p>
 
@@ -129,31 +135,13 @@
 <p>If you&#39;d like to read more details about these functions, check out the <a href="/docs/datafu/guide.html">Guide</a>.  Otherwise if you are
 ready to get started using DataFu Pig, keep reading.</p>
 
-<h2 id="toc_1">Download</h2>
-
-<p>DataFu Pig is available as a JAR that can be downloaded and registered with Pig.  It can be found in the Maven central repository
-under the group ID <a href="http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22com.linkedin.datafu%22">com.linkedin.datafu</a> by the
-name <code>datafu</code>.</p>
-
-<p>If you are using Ivy, you can download <code>datafu</code> and its dependencies with:</p>
-<pre class="highlight xml"><span class="nt">&lt;dependency</span> <span class="na">org=</span><span class="s">&quot;com.linkedin.datafu&quot;</span> <span class="na">name=</span><span class="s">&quot;datafu&quot;</span> <span class="na">rev=</span><span class="s">&quot;1.2.0&quot;</span><span class="nt">/&gt;</span>
-</pre>
-<p>Or if you are using Maven:</p>
-<pre class="highlight xml"><span class="nt">&lt;dependency&gt;</span>
-  <span class="nt">&lt;groupId&gt;</span>com.linkedin.datafu<span class="nt">&lt;/groupId&gt;</span>
-  <span class="nt">&lt;artifactId&gt;</span>datafu<span class="nt">&lt;/artifactId&gt;</span>
-  <span class="nt">&lt;version&gt;</span>1.2.0<span class="nt">&lt;/version&gt;</span>
-<span class="nt">&lt;/dependency&gt;</span>
-</pre>
-<p>Your other option is to <a href="https://github.com/linkedin/datafu/archive/master.zip">download</a> the code and build the JAR yourself.
-After unzipping the archive you can build the JAR by running <code>ant jar</code>.  The dependencies will be 
-downloaded to <code>lib/common</code>.</p>
+<p>The rest of this page assumes you already have a built JAR available.  If this is not the case, please see <a href="/docs/quick-start.html">Quick Start</a>.</p>
 
-<h2 id="toc_2">Basic Example: Computing Median</h2>
+<h2 id="basic-example-computing-median">Basic Example: Computing Median</h2>
 
-<p>Now that we have downloaded DataFu, let&#39;s use it to perform a very basic task: computing the median of some data.
+<p>Let&#39;s use DataFu Pig to perform a very basic task: computing the median of some data.
 Suppose we have a file <code>input</code> in Hadoop with the following content:</p>
-<pre class="highlight text">1
+<pre class="highlight plaintext"><code>1
 2
 3
 2
@@ -163,34 +151,40 @@ Suppose we have a file <code>input</code
 2
 2
 1
-</pre>
+</code></pre>
+
 <p>We can clearly see that the median is 2 for this data set.  First we&#39;ll start up Pig&#39;s grunt shell by running <code>pig</code> and
 then register the DataFu JAR:</p>
-<pre class="highlight pig"><span class="k">register</span> <span class="n">datafu</span><span class="o">-</span><span class="mi">1</span><span class="p">.</span><span class="mi">2</span><span class="p">.</span><span class="mi">0</span><span class="p">.</span><span class="n">jar</span>
-</pre>
+<pre class="highlight pig"><code><span class="k">register</span> <span class="n">datafu</span><span class="o">-</span><span class="mi">1</span><span class="p">.</span><span class="mi">3</span><span class="p">.</span><span class="mi">0</span><span class="o">-</span><span class="n">SNAPSHOT</span><span class="p">.</span><span class="n">jar</span>
+</code></pre>
+
 <p>To compute the median we&#39;ll use DataFu&#39;s <code>StreamingMedian</code>, which computes an estimate of the median but has the benefit
 of not requiring the data to be sorted:</p>
-<pre class="highlight pig"><span class="k">DEFINE</span> <span class="n">Median</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">stats</span><span class="p">.</span><span class="n">StreamingMedian</span><span class="p">();</span>
-</pre>
+<pre class="highlight pig"><code><span class="k">DEFINE</span> <span class="n">Median</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">stats</span><span class="p">.</span><span class="n">StreamingMedian</span><span class="p">();</span>
+</code></pre>
+
 <p>Next we can load the data and pass it into the function to compute the median:</p>
-<pre class="highlight pig"><span class="n">data</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input'</span> <span class="k">using</span> <span class="n">PigStorage</span><span class="p">()</span> <span class="k">as</span> <span class="p">(</span><span class="n">val</span><span class="p">:</span><span class="n">int</span><span class="p">);</span>
+<pre class="highlight pig"><code><span class="n">data</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input'</span> <span class="k">using</span> <span class="n">PigStorage</span><span class="p">()</span> <span class="k">as</span> <span class="p">(</span><span class="n">val</span><span class="p">:</span><span class="n">int</span><span class="p">);</span>
 <span class="n">data</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">GROUP</span> <span class="n">data</span> <span class="k">ALL</span><span class="p">)</span> <span class="k">GENERATE</span> <span class="n">Median</span><span class="p">(</span><span class="n">data</span><span class="p">);</span>
 <span class="k">DUMP</span> <span class="n">data</span>
-</pre>
+</code></pre>
+
 <p>This produces the expected output:</p>
-<pre class="highlight text">((2.0))
-</pre>
-<h2 id="toc_3">Next Steps</h2>
+<pre class="highlight plaintext"><code>((2.0))
+</code></pre>
 
-<p>Check out the <a href="/docs/datafu/guide.html">Guide</a> for more information on what you can do with DataFu.</p>
+<h2 id="next-steps">Next Steps</h2>
+
+<p>Check out the <a href="/docs/datafu/guide.html">Guide</a> for more information on what you can do with DataFu Pig.</p>
 
     </div>
   </div>
 
 
     
-      <div class="footer">
-Copyright &copy; 2011-2014 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
+      
+<div class="footer">
+Copyright &copy; 2011-2015 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
 Apache DataFu, DataFu, Apache Pig, Apache Hadoop, Hadoop, Apache, and the Apache feather logo are either registered trademarks or trademarks of the Apache Software Foundation in the United States and other countries.
 </div>
 

Modified: incubator/datafu/site/docs/datafu/guide.html
URL: http://svn.apache.org/viewvc/incubator/datafu/site/docs/datafu/guide.html?rev=1709884&r1=1709883&r2=1709884&view=diff
==============================================================================
--- incubator/datafu/site/docs/datafu/guide.html (original)
+++ incubator/datafu/site/docs/datafu/guide.html Wed Oct 21 17:00:40 2015
@@ -1,3 +1,5 @@
+
+
 <!doctype html>
 <html>
   <head>
@@ -10,11 +12,9 @@
     <!-- Use title if it's in the page YAML frontmatter -->
     <title>Guide - Apache DataFu Pig</title>
     
-    <link href="/stylesheets/all.css" media="screen" rel="stylesheet" type="text/css" />
-<link href="/stylesheets/highlight.css" media="screen" rel="stylesheet" type="text/css" />
-    <script src="/javascripts/all.js" type="text/javascript"></script>
+    <link href="/stylesheets/all.css" rel="stylesheet" /><link href="/stylesheets/highlight.css" rel="stylesheet" />
+    <script src="/javascripts/all.js"></script>
 
-    
     <script type="text/javascript">
       var _gaq = _gaq || [];
       _gaq.push(['_setAccount', 'UA-30533336-2']);
@@ -26,14 +26,14 @@
         var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
       })();
     </script>
-    
   </head>
   
   <body class="docs docs_datafu docs_datafu_guide">
 
     <div class="container">
 
-      <div class="header">
+      
+<div class="header">
 
   <ul class="nav nav-pills pull-right">
     <li><a href="/">Home</a></li>
@@ -48,12 +48,18 @@
       
   <div class="row">
     <div class="col-md-3">
-      <h4>Apache DataFu Pig</h4>
+      
+<h4>Apache DataFu</h4>
+<ul class="nav nav-pills nav-stacked">
+  <li><a href="/">Home</a></li>
+  <li><a href="/docs/quick-start.html">Quick Start</a></li>
+</ul>
+
+<h4>Apache DataFu Pig</h4>
 <ul class="nav nav-pills nav-stacked">
   <li><a href="/docs/datafu/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/datafu/guide.html">Guide</a></li>
   <li><a href="/docs/datafu/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/datafu/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Apache DataFu Hourglass</h4>
@@ -61,18 +67,18 @@
   <li><a href="/docs/hourglass/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/hourglass/concepts.html">Concepts</a></li>
   <li><a href="/docs/hourglass/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/hourglass/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Community</h4>
 <ul class="nav nav-pills nav-stacked">
+  <li><a href="/community/contributing.html">Contributing</a></li>
   <li><a href="/community/mailing-lists.html">Mailing Lists</a></li>
   <li><a href="https://issues.apache.org/jira/browse/DATAFU">Bugs</a></li>
 </ul>
     </div>
     <div class="col-md-7">
       <h4 class="text-muted">Apache DataFu Pig</h4>
-      <h1 id="toc_0">Guide</h1>
+      <h1 id="guide">Guide</h1>
 
 <p>Apache DataFu Pig is a collection of user-defined functions for working with large scale data in <a href="https://pig.apache.org/">Apache Pig</a>.
 It has a number of useful functions available.  This guide provides examples of how to use these functions and serves as an overview for working with the library.</p>
@@ -90,15 +96,15 @@ It has a number of useful functions avai
 </ul>
 
 <p>There is also <a href="/docs/datafu/javadoc.html">Javadoc</a> available for all UDFs in the library.  We continue to add
-UDFs to the library.  If you are interested in helping out please follow the <a href="/docs/datafu/contributing.html">Contributing</a>
+UDFs to the library.  If you are interested in helping out please follow the <a href="/community/contributing.html">Contributing</a>
 guide.</p>
 
-<h2 id="toc_1">Pig Compatibility</h2>
+<h2 id="pig-compatibility">Pig Compatibility</h2>
 
 <p>The current version of DataFu has been tested against Pig 0.11.1 and 0.12.0.  DataFu should be compatible with some older versions of Pig, however we do not do any sort of testing with prior versions of Pig and do not guarantee compatibility.
-Our policy is to test against the most recent version of Pig whenever we release and make sure DataFu works with that version. </p>
+Our policy is to test against the most recent version of Pig whenever we release and make sure DataFu works with that version.</p>
 
-<h2 id="toc_2">Blog Posts</h2>
+<h2 id="blog-posts">Blog Posts</h2>
 
 <ul>
 <li><a href="/blog/2012/01/10/introducing-datafu.html">Introducing DataFu</a></li>
@@ -106,7 +112,7 @@ Our policy is to test against the most r
 <li><a href="/blog/2013/09/04/datafu-1-0.html">DataFu 1.0</a></li>
 </ul>
 
-<h2 id="toc_3">Slides</h2>
+<h2 id="slides">Slides</h2>
 
 <ul>
 <li><a href="http://www.slideshare.net/matthewterencehayes/datafu">A Brief Tour of DataFu</a></li>
@@ -114,7 +120,7 @@ Our policy is to test against the most r
 <li><a href="http://www.slideshare.net/williamgvaughan/datafu-apachecon-33420740">DataFu @ ApacheCon 2014</a></li>
 </ul>
 
-<h2 id="toc_4">Videos</h2>
+<h2 id="videos">Videos</h2>
 
 <ul>
 <li><a href="http://www.youtube.com/watch?v=JWI9tVsQ1cY">Introduction to Apache DataFu @ ApacheCon North America 2014</a></li>
@@ -125,8 +131,9 @@ Our policy is to test against the most r
 
 
     
-      <div class="footer">
-Copyright &copy; 2011-2014 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
+      
+<div class="footer">
+Copyright &copy; 2011-2015 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
 Apache DataFu, DataFu, Apache Pig, Apache Hadoop, Hadoop, Apache, and the Apache feather logo are either registered trademarks or trademarks of the Apache Software Foundation in the United States and other countries.
 </div>
 

Modified: incubator/datafu/site/docs/datafu/guide/bag-operations.html
URL: http://svn.apache.org/viewvc/incubator/datafu/site/docs/datafu/guide/bag-operations.html?rev=1709884&r1=1709883&r2=1709884&view=diff
==============================================================================
--- incubator/datafu/site/docs/datafu/guide/bag-operations.html (original)
+++ incubator/datafu/site/docs/datafu/guide/bag-operations.html Wed Oct 21 17:00:40 2015
@@ -1,3 +1,5 @@
+
+
 <!doctype html>
 <html>
   <head>
@@ -10,11 +12,9 @@
     <!-- Use title if it's in the page YAML frontmatter -->
     <title>Bag Operations - Guide - Apache DataFu Pig</title>
     
-    <link href="/stylesheets/all.css" media="screen" rel="stylesheet" type="text/css" />
-<link href="/stylesheets/highlight.css" media="screen" rel="stylesheet" type="text/css" />
-    <script src="/javascripts/all.js" type="text/javascript"></script>
+    <link href="/stylesheets/all.css" rel="stylesheet" /><link href="/stylesheets/highlight.css" rel="stylesheet" />
+    <script src="/javascripts/all.js"></script>
 
-    
     <script type="text/javascript">
       var _gaq = _gaq || [];
       _gaq.push(['_setAccount', 'UA-30533336-2']);
@@ -26,14 +26,14 @@
         var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
       })();
     </script>
-    
   </head>
   
   <body class="docs docs_datafu docs_datafu_guide docs_datafu_guide_bag-operations">
 
     <div class="container">
 
-      <div class="header">
+      
+<div class="header">
 
   <ul class="nav nav-pills pull-right">
     <li><a href="/">Home</a></li>
@@ -48,12 +48,18 @@
       
   <div class="row">
     <div class="col-md-3">
-      <h4>Apache DataFu Pig</h4>
+      
+<h4>Apache DataFu</h4>
+<ul class="nav nav-pills nav-stacked">
+  <li><a href="/">Home</a></li>
+  <li><a href="/docs/quick-start.html">Quick Start</a></li>
+</ul>
+
+<h4>Apache DataFu Pig</h4>
 <ul class="nav nav-pills nav-stacked">
   <li><a href="/docs/datafu/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/datafu/guide.html">Guide</a></li>
   <li><a href="/docs/datafu/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/datafu/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Apache DataFu Hourglass</h4>
@@ -61,25 +67,25 @@
   <li><a href="/docs/hourglass/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/hourglass/concepts.html">Concepts</a></li>
   <li><a href="/docs/hourglass/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/hourglass/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Community</h4>
 <ul class="nav nav-pills nav-stacked">
+  <li><a href="/community/contributing.html">Contributing</a></li>
   <li><a href="/community/mailing-lists.html">Mailing Lists</a></li>
   <li><a href="https://issues.apache.org/jira/browse/DATAFU">Bugs</a></li>
 </ul>
     </div>
     <div class="col-md-7">
       <h4 class="text-muted">Apache DataFu Pig - Guide</h4>
-      <h2 id="toc_0">Bag operations</h2>
+      <h2 id="bag-operations">Bag operations</h2>
 
 <p>Often when working with data in Pig, it makes sense to keep the data grouped by one or more fields, 
 which means you are working with bags.  Unfortunately there aren&#39;t many convenient ways to work 
 with bags in Pig out of the box.  For this reason Apache DataFu provides several UDFs for performing useful
 operations on bags that come up in practice.</p>
 
-<h3 id="toc_1">Counting Items in Bags</h3>
+<h3 id="counting-items-in-bags">Counting Items in Bags</h3>
 
 <p>The <a href="/docs/datafu/1.2.0/datafu/pig/bags/CountEach.html">CountEach</a> UDF
 can be used to count the number of instances of items within a bag.  It produces a new bag of the
@@ -87,45 +93,50 @@ distinct items with their respective cou
 
 <p>Let&#39;s take a look at an example where this might be useful.
 Suppose that we have a recommendation system, and we&#39;ve tracked what items have been recommended.</p>
-<pre class="highlight pig"><span class="n">items</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">items</span> <span class="k">GENERATE</span> <span class="n">memberId</span><span class="p">,</span> <span class="n">itemId</span><span class="p">;</span>
-</pre>
+<pre class="highlight pig"><code><span class="n">items</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">items</span> <span class="k">GENERATE</span> <span class="n">memberId</span><span class="p">,</span> <span class="n">itemId</span><span class="p">;</span>
+</code></pre>
+
 <p>Let&#39;s say that we want to compute the number of times an item has been shown to each user.
 Our output will have this schema:</p>
-<pre class="highlight text">{memberId:int, items: {{itemId:long, cnt:long}}}
-</pre>
+<pre class="highlight json"><code><span class="p">{</span><span class="err">memberId:int,</span><span class="w"> </span><span class="err">items:</span><span class="w"> </span><span class="err">{{itemId:long,</span><span class="w"> </span><span class="err">cnt:long</span><span class="p">}</span><span class="err">}}</span><span class="w">
+</span></code></pre>
+
 <p>Typically we would have to perform two <code>GROUP</code> operations to get this output.  First we group
 by <code>(memberId,itemId)</code>, count, and then group a second time.  This requires two MapReduce jobs.</p>
 
 <p>To make this case more efficient, we can use the <code>CountEach</code> UDF.
 It will produce the same output, but it only requires a single <code>GROUP</code> operation:</p>
-<pre class="highlight pig"><span class="k">DEFINE</span> <span class="n">CountEach</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">bags</span><span class="p">.</span><span class="n">CountEach</span><span class="p">();</span>
+<pre class="highlight pig"><code><span class="k">DEFINE</span> <span class="n">CountEach</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">bags</span><span class="p">.</span><span class="n">CountEach</span><span class="p">();</span>
 
 <span class="n">items</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">GROUP</span> <span class="n">items</span> <span class="k">BY</span> <span class="n">memberId</span><span class="p">)</span> <span class="k">GENERATE</span>
   <span class="k">group</span> <span class="k">as</span> <span class="n">memberId</span><span class="p">,</span>
   <span class="n">CountEach</span><span class="p">(</span><span class="n">items</span><span class="p">.(</span><span class="n">itemId</span><span class="p">))</span> <span class="k">as</span> <span class="n">items</span><span class="p">;</span>
-</pre>
-<h3 id="toc_2">Bag Concatenation</h3>
+</code></pre>
+
+<h3 id="bag-concatenation">Bag Concatenation</h3>
 
 <p><a href="/docs/datafu/1.2.0/datafu/pig/bags/BagConcat.html">BagConcat</a> can be used
 to concatenate the tuples from two or more bags into a single bag:</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">BagConcat</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">bags</span><span class="p">.</span><span class="n">BagConcat</span><span class="p">();</span>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">BagConcat</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">bags</span><span class="p">.</span><span class="n">BagConcat</span><span class="p">();</span>
 
 <span class="c1">-- ({(1),(2),(3)},{(4),(5)},{(6),(7)})
 </span><span class="n">input</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">B1</span><span class="p">:</span> <span class="n">bag</span><span class="p">{</span><span class="n">T</span><span class="p">:</span> <span class="n">tuple</span><span class="p">(</span><span class="n">v</span><span class="p">:</span><span class="n">INT</span><span class="p">)},</span> <span class="n">B2</span><span class="p">:</span> <span class="n">bag</span><span class="p">{</span><span class="n">T</span><span class="p">:</span> <span class="n">tuple</span><span class="p">(</span><span class="n">v</span><span class="p">:</span><span class="n">INT</span><span class="p">)},</span> <span class="n">B3</span><span class="p">:</span> <span class="n">bag</span><span class="p">{</span><span class="n">T</span><span class="p">:</span> <span class="n">tuple</span><span class="p">(</span><span class="n">v</span><span class="p">:</span><span class="n">INT</span><span class="p">)});</span>
 
 <span class="c1">-- ({(1),(2),(3),(4),(5),(6),(7)})
 </span><span class="n">output</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">input</span> <span class="k">GENERATE</span> <span class="n">BagConcat</span><span class="p">(</span><span class="n">B1</span><span class="p">,</span><span class="n">B2</span><span class="p">,</span><span class="n">B3</span><span class="p">);</span>
-</pre>
+</code></pre>
+
 <p><a href="/docs/datafu/1.2.0/datafu/pig/bags/BagConcat.html">BagConcat</a> can also be 
 used to concatenate all tuples present in a bag of bags.</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">BagConcat</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">bags</span><span class="p">.</span><span class="n">BagConcat</span><span class="p">();</span>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">BagConcat</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">bags</span><span class="p">.</span><span class="n">BagConcat</span><span class="p">();</span>
 <span class="c1">-- ({({(1),(2),(3)}),({(3),(4),(5)})})
 </span><span class="n">input</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">A</span><span class="p">:</span> <span class="n">bag</span><span class="p">{</span><span class="n">T</span><span class="p">:</span> <span class="n">tuple</span><span class="p">(</span><span class="n">bag</span><span class="p">{</span><span class="n">T2</span><span class="p">:</span> <span class="n">tuple</span><span class="p">(</span><span class="n">v</span><span class="p">:</span><span class="n">INT</span><span class="p">)})});</span>
 
 <span class="c1">-- ({(1),(2),(3),(3),(4),(5)})
 </span><span class="n">output</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">input</span> <span class="k">GENERATE</span> <span class="n">BagConcat</span><span class="p">(</span><span class="n">A</span><span class="p">);</span>
-</pre>
-<h3 id="toc_3">Grouping Within a Bag</h3>
+</code></pre>
+
+<h3 id="grouping-within-a-bag">Grouping Within a Bag</h3>
 
 <p>Pig has a <code>GROUP</code> operation that can be applied to a relation.  It produces a new relation where the input
 tuples are grouped by a particular key.  A bag in the relation contains the grouped tuples for that key.  The key
@@ -139,7 +150,7 @@ this bag without involving <code>GROUP</
 <p>In the following example we take an <code>input_bag</code> consisting of key-value pairs <code>(k,v)</code> and group the tuples by <code>k</code>.
 This produces a new bag having tuples consisting of <code>group</code> and <code>input_bag</code>.  The <code>group</code> corresponds to the grouping
 key <code>k</code>.  The <code>input_bag</code> is a bag containing the tuples from the original <code>input_bag</code> that have the same <code>k</code> as <code>group</code>.</p>
-<pre class="highlight pig"> <span class="k">define</span> <span class="n">BagGroup</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">bags</span><span class="p">.</span><span class="n">BagGroup</span><span class="p">();</span>
+<pre class="highlight pig"><code> <span class="k">define</span> <span class="n">BagGroup</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">bags</span><span class="p">.</span><span class="n">BagGroup</span><span class="p">();</span>
 
  <span class="n">data</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">input_bag</span><span class="p">:</span> <span class="n">bag</span> <span class="p">{</span><span class="n">T</span><span class="p">:</span> <span class="n">tuple</span><span class="p">(</span><span class="n">k</span><span class="p">:</span> <span class="n">int</span><span class="p">,</span> <span class="n">v</span><span class="p">:</span> <span class="n">chararray</span><span class="p">)});</span>
  <span class="c1">-- ({(1,A),(1,B),(2,A),(2,B),(2,C),(3,A)})
@@ -148,10 +159,11 @@ key <code>k</code>.  The <code>input_bag
 </span> <span class="n">data2</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">data</span> <span class="k">GENERATE</span> <span class="n">BagGroup</span><span class="p">(</span><span class="n">input_bag</span><span class="p">,</span> <span class="n">input_bag</span><span class="p">.(</span><span class="n">k</span><span class="p">))</span> <span class="k">as</span> <span class="n">grouped</span><span class="p">;</span>
  <span class="c1">-- data2: {grouped: {(group: int,input_bag: {T: (k: int,v: chararray)})}}
 </span> <span class="c1">-- ({(1,{(1,A),(1,B)}),(2,{(2,A),(2,B),(2,C)}),(3,{(3,A)})})
-</span></pre>
+</span></code></pre>
+
 <p>We could also project out the key from the final <code>input_bag</code> using a nested <code>FOREACH</code> so that the bag only
 consists of the value <code>v</code>:</p>
-<pre class="highlight pig"><span class="n">data3</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">data2</span> <span class="p">{</span>
+<pre class="highlight pig"><code><span class="n">data3</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">data2</span> <span class="p">{</span>
   <span class="c1">-- project only the value
 </span>  <span class="n">projected</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">grouped</span> <span class="k">GENERATE</span> <span class="k">group</span><span class="p">,</span> <span class="n">input_bag</span><span class="p">.(</span><span class="n">v</span><span class="p">);</span>
   <span class="k">GENERATE</span> <span class="n">projected</span> <span class="k">as</span> <span class="n">grouped</span><span class="p">;</span>
@@ -159,32 +171,35 @@ consists of the value <code>v</code>:</p
 
 <span class="c1">-- data3: {grouped: {(group: int,input_bag: {T: (k: int,v: chararray)})}}
 -- ({(1,{(A),(B)}),(2,{(A),(B),(C)}),(3,{(A)})})
-</span></pre>
-<h3 id="toc_4">Append to Bag</h3>
+</span></code></pre>
+
+<h3 id="append-to-bag">Append to Bag</h3>
 
 <p><a href="/docs/datafu/1.2.0/datafu/pig/bags/AppendToBag.html">AppendToBag</a> can be
 used to append a tuple to a bag:</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">AppendToBag</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">bags</span><span class="p">.</span><span class="n">AppendToBag</span><span class="p">();</span>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">AppendToBag</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">bags</span><span class="p">.</span><span class="n">AppendToBag</span><span class="p">();</span>
 
 <span class="c1">-- ({(1),(2),(3)},(4))
 </span><span class="n">input</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">B</span><span class="p">:</span> <span class="n">bag</span><span class="p">{</span><span class="n">T</span><span class="p">:</span> <span class="n">tuple</span><span class="p">(</span><span class="n">v</span><span class="p">:</span><span class="n">INT</span><span class="p">)},</span> <span class="n">T</span><span class="p">:</span> <span class="n">tuple</span><span class="p">(</span><span class="n">v</span><span class="p">:</span><span class="n">INT</span><span class="p">));</span>
 
 <span class="c1">-- ({(1),(2),(3),(4)})
 </span><span class="n">output</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">input</span> <span class="k">GENERATE</span> <span class="n">AppendToBag</span><span class="p">(</span><span class="n">B</span><span class="p">,</span><span class="n">T</span><span class="p">);</span>
-</pre>
-<h3 id="toc_5">Prepend to Bag</h3>
+</code></pre>
+
+<h3 id="prepend-to-bag">Prepend to Bag</h3>
 
 <p><a href="/docs/datafu/1.2.0/datafu/pig/bags/PrependToBag.html">PrependToBag</a> can be
 used to prepend a tuple to a bag:</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">PrependToBag</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">bags</span><span class="p">.</span><span class="n">PrependToBag</span><span class="p">();</span>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">PrependToBag</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">bags</span><span class="p">.</span><span class="n">PrependToBag</span><span class="p">();</span>
 
 <span class="c1">-- ({(1),(2),(3)},(4))
 </span><span class="n">input</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">B</span><span class="p">:</span> <span class="n">bag</span><span class="p">{</span><span class="n">T</span><span class="p">:</span> <span class="n">tuple</span><span class="p">(</span><span class="n">v</span><span class="p">:</span><span class="n">INT</span><span class="p">)},</span> <span class="n">T</span><span class="p">:</span> <span class="n">tuple</span><span class="p">(</span><span class="n">v</span><span class="p">:</span><span class="n">INT</span><span class="p">));</span>
 
 <span class="c1">-- ({(4),(1),(2),(3)})
 </span><span class="n">output</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">input</span> <span class="k">GENERATE</span> <span class="n">PrependToBag</span><span class="p">(</span><span class="n">B</span><span class="p">,</span><span class="n">T</span><span class="p">);</span>
-</pre>
-<h3 id="toc_6">Join Bags</h3>
+</code></pre>
+
+<h3 id="join-bags">Join Bags</h3>
 
 <p>Pig has a <code>JOIN</code> operator, but unfortunately it only operates on relations.  Thus, if you wish to join
 tuples from two bags, you must first flatten, then join, then re-group.  To make this process simpler DataFu
@@ -194,18 +209,20 @@ UDF.</p>
 <p>Let&#39;s walk through an example where this is useful.  Suppose that we are building a recommendation system.
 This system recommends items to users, and these recommendations may be ignored, accepted, or rejected.
 When analyzing this system, we have a stream of impression, accept, and reject events:</p>
-<pre class="highlight pig"><span class="n">impressions</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'$impressions'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">user_id</span><span class="p">:</span><span class="n">int</span><span class="p">,</span> <span class="n">item_id</span><span class="p">:</span><span class="n">int</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">:</span><span class="n">long</span><span class="p">);</span>
+<pre class="highlight pig"><code><span class="n">impressions</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'$impressions'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">user_id</span><span class="p">:</span><span class="n">int</span><span class="p">,</span> <span class="n">item_id</span><span class="p">:</span><span class="n">int</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">:</span><span class="n">long</span><span class="p">);</span>
 <span class="n">accepts</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'$accepts'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">user_id</span><span class="p">:</span><span class="n">int</span><span class="p">,</span> <span class="n">item_id</span><span class="p">:</span><span class="n">int</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">:</span><span class="n">long</span><span class="p">);</span>
 <span class="n">rejects</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'$rejects'</span> <span class="k">AS</span> <span class="p">(</span><span class="n">user_id</span><span class="p">:</span><span class="n">int</span><span class="p">,</span> <span class="n">item_id</span><span class="p">:</span><span class="n">int</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">:</span><span class="n">long</span><span class="p">);</span>
-</pre>
+</code></pre>
+
 <p>What we want to produce from this data is a bag of item counts per member:</p>
-<pre class="highlight text">features: {user_id:int, items:{(item_id:int, impression_count:int, accept_count:int, reject_count:int)}}
-</pre>
+<pre class="highlight plaintext"><code>features: {user_id:int, items:{(item_id:int, impression_count:int, accept_count:int, reject_count:int)}}
+</code></pre>
+
 <p>Using DataFu&#39;s
 <a href="/docs/datafu/1.2.0/datafu/pig/bags/CountEach.html">CountEach</a>
 we can efficiently produce the counts per item for impressions, accepts, and rejects as separate
 bags per member using a single MapReduce job:</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">CountEach</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">bags</span><span class="p">.</span><span class="n">CountEach</span><span class="p">();</span>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">CountEach</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">bags</span><span class="p">.</span><span class="n">CountEach</span><span class="p">();</span>
 
 <span class="n">features_counted</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">COGROUP</span> <span class="n">impressions</span> <span class="k">BY</span> <span class="n">user_id</span><span class="p">,</span> 
                                     <span class="n">accepts</span> <span class="k">BY</span> <span class="n">user_id</span><span class="p">,</span> 
@@ -214,10 +231,11 @@ bags per member using a single MapReduce
   <span class="n">CountEach</span><span class="p">(</span><span class="n">impressions</span><span class="p">.</span><span class="n">item_id</span><span class="p">)</span> <span class="k">as</span> <span class="n">impressions</span><span class="p">,</span>
   <span class="n">CountEach</span><span class="p">(</span><span class="n">accepts</span><span class="p">.</span><span class="n">item_id</span><span class="p">)</span> <span class="k">as</span> <span class="n">accepts</span><span class="p">,</span>
   <span class="n">CountEach</span><span class="p">(</span><span class="n">rejects</span><span class="p">.</span><span class="n">item_id</span><span class="p">)</span> <span class="k">as</span> <span class="n">rejects</span><span class="p">;</span>
-</pre>
+</code></pre>
+
 <p>This produces three bags, consisting of <code>(item_id,count)</code>.  We can now join these bags
 together using <code>BagLeftOuterJoin</code>:</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">BagLeftOuterJoin</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">bags</span><span class="p">.</span><span class="n">BagLeftOuterJoin</span><span class="p">();</span>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">BagLeftOuterJoin</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">bags</span><span class="p">.</span><span class="n">BagLeftOuterJoin</span><span class="p">();</span>
 
 <span class="n">features_joined</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">features_counted</span> <span class="k">GENERATE</span>
   <span class="n">user_id</span><span class="p">,</span>
@@ -226,11 +244,12 @@ together using <code>BagLeftOuterJoin</c
     <span class="n">accepts</span><span class="p">,</span> <span class="s1">'item_id'</span><span class="p">,</span>
     <span class="n">rejects</span><span class="p">,</span> <span class="s1">'item_id'</span>
   <span class="p">)</span> <span class="k">as</span> <span class="n">items</span><span class="p">;</span>
-</pre>
+</code></pre>
+
 <p>We left join in the impression here since the user cannot accept or reject an item that was not seen.
 The left join can of course produce null values for accepts and rejects that did not occur, so let&#39;s
 clean those up by replacing null values with counts of zero:</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">Coalesce</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">util</span><span class="p">.</span><span class="n">Coalesce</span><span class="p">();</span>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">Coalesce</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">util</span><span class="p">.</span><span class="n">Coalesce</span><span class="p">();</span>
 
 <span class="n">features</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">features_joined</span> <span class="p">{</span>
   <span class="n">projected</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">items</span> <span class="k">GENERATE</span>
@@ -240,14 +259,16 @@ clean those up by replacing null values
     <span class="n">Coalesce</span><span class="p">(</span><span class="n">rejects</span><span class="p">::</span><span class="k">count</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> <span class="k">as</span> <span class="n">reject_count</span><span class="p">;</span>
   <span class="k">GENERATE</span> <span class="n">user_id</span><span class="p">,</span> <span class="n">projected</span> <span class="k">as</span> <span class="n">items</span><span class="p">;</span>
 <span class="p">}</span>
-</pre>
+</code></pre>
+
     </div>
   </div>
 
 
     
-      <div class="footer">
-Copyright &copy; 2011-2014 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
+      
+<div class="footer">
+Copyright &copy; 2011-2015 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
 Apache DataFu, DataFu, Apache Pig, Apache Hadoop, Hadoop, Apache, and the Apache feather logo are either registered trademarks or trademarks of the Apache Software Foundation in the United States and other countries.
 </div>
 

Modified: incubator/datafu/site/docs/datafu/guide/estimation.html
URL: http://svn.apache.org/viewvc/incubator/datafu/site/docs/datafu/guide/estimation.html?rev=1709884&r1=1709883&r2=1709884&view=diff
==============================================================================
--- incubator/datafu/site/docs/datafu/guide/estimation.html (original)
+++ incubator/datafu/site/docs/datafu/guide/estimation.html Wed Oct 21 17:00:40 2015
@@ -1,3 +1,5 @@
+
+
 <!doctype html>
 <html>
   <head>
@@ -10,11 +12,9 @@
     <!-- Use title if it's in the page YAML frontmatter -->
     <title>Estimation - Guide - Apache DataFu Pig</title>
     
-    <link href="/stylesheets/all.css" media="screen" rel="stylesheet" type="text/css" />
-<link href="/stylesheets/highlight.css" media="screen" rel="stylesheet" type="text/css" />
-    <script src="/javascripts/all.js" type="text/javascript"></script>
+    <link href="/stylesheets/all.css" rel="stylesheet" /><link href="/stylesheets/highlight.css" rel="stylesheet" />
+    <script src="/javascripts/all.js"></script>
 
-    
     <script type="text/javascript">
       var _gaq = _gaq || [];
       _gaq.push(['_setAccount', 'UA-30533336-2']);
@@ -26,14 +26,14 @@
         var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
       })();
     </script>
-    
   </head>
   
   <body class="docs docs_datafu docs_datafu_guide docs_datafu_guide_estimation">
 
     <div class="container">
 
-      <div class="header">
+      
+<div class="header">
 
   <ul class="nav nav-pills pull-right">
     <li><a href="/">Home</a></li>
@@ -48,12 +48,18 @@
       
   <div class="row">
     <div class="col-md-3">
-      <h4>Apache DataFu Pig</h4>
+      
+<h4>Apache DataFu</h4>
+<ul class="nav nav-pills nav-stacked">
+  <li><a href="/">Home</a></li>
+  <li><a href="/docs/quick-start.html">Quick Start</a></li>
+</ul>
+
+<h4>Apache DataFu Pig</h4>
 <ul class="nav nav-pills nav-stacked">
   <li><a href="/docs/datafu/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/datafu/guide.html">Guide</a></li>
   <li><a href="/docs/datafu/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/datafu/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Apache DataFu Hourglass</h4>
@@ -61,23 +67,23 @@
   <li><a href="/docs/hourglass/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/hourglass/concepts.html">Concepts</a></li>
   <li><a href="/docs/hourglass/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/hourglass/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Community</h4>
 <ul class="nav nav-pills nav-stacked">
+  <li><a href="/community/contributing.html">Contributing</a></li>
   <li><a href="/community/mailing-lists.html">Mailing Lists</a></li>
   <li><a href="https://issues.apache.org/jira/browse/DATAFU">Bugs</a></li>
 </ul>
     </div>
     <div class="col-md-7">
       <h4 class="text-muted">Apache DataFu Pig - Guide</h4>
-      <h2 id="toc_0">Estimation</h2>
+      <h2 id="estimation">Estimation</h2>
 
 <p>In some cases you don&#39;t need exact results.  Estimates may be sufficient if it results in more efficient
 execution.  With this in mind Apache DataFu has UDFs for computing estimates of certain quantities.</p>
 
-<h3 id="toc_1">Median and Quantiles</h3>
+<h3 id="median-and-quantiles">Median and Quantiles</h3>
 
 <p><a href="/docs/datafu/1.2.0/datafu/pig/stats/StreamingMedian.html">StreamingMedian</a> and
 <a href="/docs/datafu/1.2.0/datafu/pig/stats/StreamingQuantile.html">StreamingQuantile</a> can
@@ -87,23 +93,25 @@ the input bags to be sorted.  </p>
 
 <p>See <a href="/docs/datafu/guide/statistics.html">Statistics</a> for more details.</p>
 
-<h3 id="toc_2">Cardinality</h3>
+<h3 id="cardinality">Cardinality</h3>
 
 <p>The <a href="http://en.wikipedia.org/wiki/Cardinality">cardinality</a> of a data set is the number of distinct elements
 within it.  </p>
 
 <p>Suppose we have data consisting of member IDs and we want to know the count of distinct member IDs.
 We can do this in Pig as follows:</p>
-<pre class="highlight pig"><span class="n">data</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">data</span> <span class="k">GENERATE</span> <span class="n">member_id</span><span class="p">;</span>
+<pre class="highlight pig"><code><span class="n">data</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">data</span> <span class="k">GENERATE</span> <span class="n">member_id</span><span class="p">;</span>
 <span class="n">data</span> <span class="o">=</span> <span class="k">DISTINCT</span> <span class="n">data</span><span class="p">;</span>
 <span class="n">data</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">GROUP</span> <span class="n">data</span> <span class="k">ALL</span><span class="p">)</span> <span class="k">GENERATE</span> <span class="k">SIZE</span><span class="p">(</span><span class="n">data</span><span class="p">);</span>
-</pre>
+</code></pre>
+
 <p>However, this requires two jobs.  Alternatively we can use the
 <a href="/docs/datafu/1.2.0/datafu/pig/stats/HyperLogLogPlusPlus.html">HyperLogLogPlusPlus</a> UDF
 to estimate the cardinality with good accuracy using only one job:</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">HyperLogLogPlusPlus</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">stats</span><span class="p">.</span><span class="n">HyperLogLogPlusPlus</span><span class="p">();</span>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">HyperLogLogPlusPlus</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">stats</span><span class="p">.</span><span class="n">HyperLogLogPlusPlus</span><span class="p">();</span>
 <span class="n">data</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="p">(</span><span class="k">GROUP</span> <span class="n">data</span> <span class="k">ALL</span><span class="p">)</span> <span class="k">GENERATE</span> <span class="n">HyperLogLogPlusPlus</span><span class="p">(</span><span class="n">data</span><span class="p">);</span>
-</pre>
+</code></pre>
+
 <p>The <code>HyperLogLogPlusPlus</code> UDF uses an implementation of the
 <a href="http://static.googleusercontent.com/external_content/untrusted_dlcp/research.google.com/en/us/pubs/archive/40671.pdf">HyperLogLog++</a>
 algorithm provided by the <a href="https://github.com/addthis/stream-lib">stream-lib</a> library.  HyperLogLog++ itself is an enhancement to
@@ -114,8 +122,9 @@ the well known <a href="http://algo.inri
 
 
     
-      <div class="footer">
-Copyright &copy; 2011-2014 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
+      
+<div class="footer">
+Copyright &copy; 2011-2015 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
 Apache DataFu, DataFu, Apache Pig, Apache Hadoop, Hadoop, Apache, and the Apache feather logo are either registered trademarks or trademarks of the Apache Software Foundation in the United States and other countries.
 </div>
 

Modified: incubator/datafu/site/docs/datafu/guide/hashing.html
URL: http://svn.apache.org/viewvc/incubator/datafu/site/docs/datafu/guide/hashing.html?rev=1709884&r1=1709883&r2=1709884&view=diff
==============================================================================
--- incubator/datafu/site/docs/datafu/guide/hashing.html (original)
+++ incubator/datafu/site/docs/datafu/guide/hashing.html Wed Oct 21 17:00:40 2015
@@ -1,3 +1,5 @@
+
+
 <!doctype html>
 <html>
   <head>
@@ -10,11 +12,9 @@
     <!-- Use title if it's in the page YAML frontmatter -->
     <title>Hashing - Guide - Apache DataFu Pig</title>
     
-    <link href="/stylesheets/all.css" media="screen" rel="stylesheet" type="text/css" />
-<link href="/stylesheets/highlight.css" media="screen" rel="stylesheet" type="text/css" />
-    <script src="/javascripts/all.js" type="text/javascript"></script>
+    <link href="/stylesheets/all.css" rel="stylesheet" /><link href="/stylesheets/highlight.css" rel="stylesheet" />
+    <script src="/javascripts/all.js"></script>
 
-    
     <script type="text/javascript">
       var _gaq = _gaq || [];
       _gaq.push(['_setAccount', 'UA-30533336-2']);
@@ -26,14 +26,14 @@
         var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
       })();
     </script>
-    
   </head>
   
   <body class="docs docs_datafu docs_datafu_guide docs_datafu_guide_hashing">
 
     <div class="container">
 
-      <div class="header">
+      
+<div class="header">
 
   <ul class="nav nav-pills pull-right">
     <li><a href="/">Home</a></li>
@@ -48,12 +48,18 @@
       
   <div class="row">
     <div class="col-md-3">
-      <h4>Apache DataFu Pig</h4>
+      
+<h4>Apache DataFu</h4>
+<ul class="nav nav-pills nav-stacked">
+  <li><a href="/">Home</a></li>
+  <li><a href="/docs/quick-start.html">Quick Start</a></li>
+</ul>
+
+<h4>Apache DataFu Pig</h4>
 <ul class="nav nav-pills nav-stacked">
   <li><a href="/docs/datafu/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/datafu/guide.html">Guide</a></li>
   <li><a href="/docs/datafu/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/datafu/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Apache DataFu Hourglass</h4>
@@ -61,65 +67,70 @@
   <li><a href="/docs/hourglass/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/hourglass/concepts.html">Concepts</a></li>
   <li><a href="/docs/hourglass/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/hourglass/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Community</h4>
 <ul class="nav nav-pills nav-stacked">
+  <li><a href="/community/contributing.html">Contributing</a></li>
   <li><a href="/community/mailing-lists.html">Mailing Lists</a></li>
   <li><a href="https://issues.apache.org/jira/browse/DATAFU">Bugs</a></li>
 </ul>
     </div>
     <div class="col-md-7">
       <h4 class="text-muted">Apache DataFu Pig - Guide</h4>
-      <h2 id="toc_0">Hashing</h2>
+      <h2 id="hashing">Hashing</h2>
 
-<h3 id="toc_1">MD5</h3>
+<h3 id="md5">MD5</h3>
 
 <p>The <a href="http://en.wikipedia.org/wiki/MD5">MD5 hash</a> of a string can be computed with the 
 <a href="/docs/datafu/1.2.0/datafu/pig/hash/MD5.html">MD5</a>
 UDF.</p>
 
 <p>For example:</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">MD5</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">hash</span><span class="p">.</span><span class="n">MD5</span><span class="p">();</span>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">MD5</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">hash</span><span class="p">.</span><span class="n">MD5</span><span class="p">();</span>
 
-<span class="c1">--input: &quot;hello, world!&quot;
+<span class="c1">--input: "hello, world!"
 </span><span class="n">data_in</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input'</span> <span class="k">as</span> <span class="p">(</span><span class="n">val</span><span class="p">:</span><span class="n">chararray</span><span class="p">);</span>
 <span class="n">data_out</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">data_in</span> <span class="k">GENERATE</span> <span class="n">MD5</span><span class="p">(</span><span class="n">val</span><span class="p">)</span> <span class="k">as</span> <span class="n">val</span><span class="p">;</span>
 
 <span class="c1">-- produces: (fc3ff98e8c6a0d3087d515c0473f8677)
 </span><span class="k">DUMP</span> <span class="n">data_out</span><span class="p">;</span>
-</pre>
+</code></pre>
+
 <p>The function can instead output base64 by passing &#39;base64&#39; to the constructor.
 The default is &#39;hex&#39; for hexadecimal.</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">MD5</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">hash</span><span class="p">.</span><span class="n">MD5</span><span class="p">(</span><span class="s1">'base64'</span><span class="p">);</span>
-</pre>
-<h3 id="toc_2">SHA</h3>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">MD5</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">hash</span><span class="p">.</span><span class="n">MD5</span><span class="p">(</span><span class="s1">'base64'</span><span class="p">);</span>
+</code></pre>
+
+<h3 id="sha">SHA</h3>
 
 <p>A <a href="http://en.wikipedia.org/wiki/Secure_Hash_Algorithm">SHA</a> hash can be computed with
 <a href="/docs/datafu/1.2.0/datafu/pig/hash/SHA.html">SHA</a>.
 The output will be in hexadecimal.</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">SHA</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">hash</span><span class="p">.</span><span class="n">SHA</span><span class="p">();</span>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">SHA</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">hash</span><span class="p">.</span><span class="n">SHA</span><span class="p">();</span>
 
-<span class="c1">--input: &quot;hello, world!&quot;
+<span class="c1">--input: "hello, world!"
 </span><span class="n">data_in</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input'</span> <span class="k">as</span> <span class="p">(</span><span class="n">val</span><span class="p">:</span><span class="n">chararray</span><span class="p">);</span>
 <span class="n">data_out</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">data_in</span> <span class="k">GENERATE</span> <span class="n">SHA</span><span class="p">(</span><span class="n">val</span><span class="p">)</span> <span class="k">as</span> <span class="n">val</span><span class="p">;</span>
 
 <span class="c1">-- produces: (7509e5bda0c762d2bac7f90d758b5b2263fa01ccbc542ab5e3df163be08e6ca9)
 </span><span class="k">DUMP</span> <span class="n">data_out</span><span class="p">;</span>
-</pre>
+</code></pre>
+
 <p>By default this uses SHA-256.
 The constructor also takes an optional parameter for the particular SHA algorithm to use.
 To use SHA-512 instead:</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">SHA512</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">hash</span><span class="p">.</span><span class="n">SHA</span><span class="p">(</span><span class="s1">'512'</span><span class="p">);</span>
-</pre>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">SHA512</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">hash</span><span class="p">.</span><span class="n">SHA</span><span class="p">(</span><span class="s1">'512'</span><span class="p">);</span>
+</code></pre>
+
     </div>
   </div>
 
 
     
-      <div class="footer">
-Copyright &copy; 2011-2014 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
+      
+<div class="footer">
+Copyright &copy; 2011-2015 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
 Apache DataFu, DataFu, Apache Pig, Apache Hadoop, Hadoop, Apache, and the Apache feather logo are either registered trademarks or trademarks of the Apache Software Foundation in the United States and other countries.
 </div>
 

Modified: incubator/datafu/site/docs/datafu/guide/link-analysis.html
URL: http://svn.apache.org/viewvc/incubator/datafu/site/docs/datafu/guide/link-analysis.html?rev=1709884&r1=1709883&r2=1709884&view=diff
==============================================================================
--- incubator/datafu/site/docs/datafu/guide/link-analysis.html (original)
+++ incubator/datafu/site/docs/datafu/guide/link-analysis.html Wed Oct 21 17:00:40 2015
@@ -1,3 +1,5 @@
+
+
 <!doctype html>
 <html>
   <head>
@@ -10,11 +12,9 @@
     <!-- Use title if it's in the page YAML frontmatter -->
     <title>Link Analysis - Guide - Apache DataFu Pig</title>
     
-    <link href="/stylesheets/all.css" media="screen" rel="stylesheet" type="text/css" />
-<link href="/stylesheets/highlight.css" media="screen" rel="stylesheet" type="text/css" />
-    <script src="/javascripts/all.js" type="text/javascript"></script>
+    <link href="/stylesheets/all.css" rel="stylesheet" /><link href="/stylesheets/highlight.css" rel="stylesheet" />
+    <script src="/javascripts/all.js"></script>
 
-    
     <script type="text/javascript">
       var _gaq = _gaq || [];
       _gaq.push(['_setAccount', 'UA-30533336-2']);
@@ -26,14 +26,14 @@
         var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
       })();
     </script>
-    
   </head>
   
   <body class="docs docs_datafu docs_datafu_guide docs_datafu_guide_link-analysis">
 
     <div class="container">
 
-      <div class="header">
+      
+<div class="header">
 
   <ul class="nav nav-pills pull-right">
     <li><a href="/">Home</a></li>
@@ -48,12 +48,18 @@
       
   <div class="row">
     <div class="col-md-3">
-      <h4>Apache DataFu Pig</h4>
+      
+<h4>Apache DataFu</h4>
+<ul class="nav nav-pills nav-stacked">
+  <li><a href="/">Home</a></li>
+  <li><a href="/docs/quick-start.html">Quick Start</a></li>
+</ul>
+
+<h4>Apache DataFu Pig</h4>
 <ul class="nav nav-pills nav-stacked">
   <li><a href="/docs/datafu/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/datafu/guide.html">Guide</a></li>
   <li><a href="/docs/datafu/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/datafu/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Apache DataFu Hourglass</h4>
@@ -61,23 +67,23 @@
   <li><a href="/docs/hourglass/getting-started.html">Getting Started</a></li>
   <li><a href="/docs/hourglass/concepts.html">Concepts</a></li>
   <li><a href="/docs/hourglass/javadoc.html">Javadoc</a></li>
-  <li><a href="/docs/hourglass/contributing.html">Contributing</a></li>
 </ul>
 
 <h4>Community</h4>
 <ul class="nav nav-pills nav-stacked">
+  <li><a href="/community/contributing.html">Contributing</a></li>
   <li><a href="/community/mailing-lists.html">Mailing Lists</a></li>
   <li><a href="https://issues.apache.org/jira/browse/DATAFU">Bugs</a></li>
 </ul>
     </div>
     <div class="col-md-7">
       <h4 class="text-muted">Apache DataFu Pig - Guide</h4>
-      <h2 id="toc_0">Link Analysis</h2>
+      <h2 id="link-analysis">Link Analysis</h2>
 
-<h3 id="toc_1">PageRank</h3>
+<h3 id="pagerank">PageRank</h3>
 
 <p>Run PageRank on a large number of independent graphs through the <a href="/docs/datafu/1.2.0/datafu/pig/linkanalysis/PageRank.html">PageRank UDF</a>:</p>
-<pre class="highlight pig"><span class="k">define</span> <span class="n">PageRank</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">linkanalysis</span><span class="p">.</span><span class="n">PageRank</span><span class="p">(</span><span class="s1">'dangling_nodes'</span><span class="p">,</span><span class="s1">'true'</span><span class="p">);</span>
+<pre class="highlight pig"><code><span class="k">define</span> <span class="n">PageRank</span> <span class="n">datafu</span><span class="p">.</span><span class="n">pig</span><span class="p">.</span><span class="n">linkanalysis</span><span class="p">.</span><span class="n">PageRank</span><span class="p">(</span><span class="s1">'dangling_nodes'</span><span class="p">,</span><span class="s1">'true'</span><span class="p">);</span>
 
 <span class="n">topic_edges</span> <span class="o">=</span> <span class="k">LOAD</span> <span class="s1">'input_edges'</span> <span class="k">as</span> <span class="p">(</span><span class="n">topic</span><span class="p">:</span><span class="n">INT</span><span class="p">,</span><span class="n">source</span><span class="p">:</span><span class="n">INT</span><span class="p">,</span><span class="n">dest</span><span class="p">:</span><span class="n">INT</span><span class="p">,</span><span class="n">weight</span><span class="p">:</span><span class="n">DOUBLE</span><span class="p">);</span>
 
@@ -95,7 +101,8 @@
 
 <span class="n">skill_ranks</span> <span class="o">=</span> <span class="k">FOREACH</span> <span class="n">skill_ranks</span> <span class="k">GENERATE</span>
   <span class="n">topic</span><span class="p">,</span> <span class="n">source</span><span class="p">,</span> <span class="n">pr</span><span class="p">;</span>
-</pre>
+</code></pre>
+
 <p>This implementation stores the nodes and edges (mostly) in memory. It is therefore best suited when one needs to compute PageRank on many reasonably sized graphs in parallel.</p>
 
     </div>
@@ -103,8 +110,9 @@
 
 
     
-      <div class="footer">
-Copyright &copy; 2011-2014 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
+      
+<div class="footer">
+Copyright &copy; 2011-2015 <a href="http://www.apache.org/licenses/">The Apache Software Foundation</a>. <br>
 Apache DataFu, DataFu, Apache Pig, Apache Hadoop, Hadoop, Apache, and the Apache feather logo are either registered trademarks or trademarks of the Apache Software Foundation in the United States and other countries.
 </div>