You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by bi...@apache.org on 2013/04/02 05:32:42 UTC

svn commit: r1463367 [9/11] - in /pig/site: author/src/documentation/content/xdocs/ publish/ publish/docs/r0.11.1/ publish/docs/r0.11.1/images/ publish/docs/r0.11.1/skin/ publish/docs/r0.11.1/skin/css/ publish/docs/r0.11.1/skin/images/ publish/docs/r0....

Added: pig/site/publish/docs/r0.11.1/start.html
URL: http://svn.apache.org/viewvc/pig/site/publish/docs/r0.11.1/start.html?rev=1463367&view=auto
==============================================================================
--- pig/site/publish/docs/r0.11.1/start.html (added)
+++ pig/site/publish/docs/r0.11.1/start.html Tue Apr  2 03:32:39 2013
@@ -0,0 +1,1482 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<meta content="Apache Forrest" name="Generator">
+<meta name="Forrest-version" content="0.9">
+<meta name="Forrest-skin-name" content="pelt">
+<title>Getting Started</title>
+<link type="text/css" href="skin/basic.css" rel="stylesheet">
+<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
+<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
+<link type="text/css" href="skin/profile.css" rel="stylesheet">
+<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
+<link rel="shortcut icon" href="">
+</head>
+<body onload="init()">
+<script type="text/javascript">ndeSetTextSize();</script>
+<div id="top">
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+<a href="http://www.apache.org/">Apache</a> &gt; <a href="http://hadoop.apache.org/">Hadoop</a> &gt; <a href="http://hadoop.apache.org/pig/">Pig</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+</div>
+<!--+
+    |header
+    +-->
+<div class="header">
+<!--+
+    |start group logo
+    +-->
+<div class="grouplogo">
+<a href="http://hadoop.apache.org/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg" title="Apache Hadoop"></a>
+</div>
+<!--+
+    |end group logo
+    +-->
+<!--+
+    |start Project Logo
+    +-->
+<div class="projectlogo">
+<a href="http://hadoop.apache.org/pig/"><img class="logoImage" alt="Pig" src="images/pig-logo.gif" title="A platform for analyzing large datasets."></a>
+</div>
+<!--+
+    |end Project Logo
+    +-->
+<!--+
+    |start Search
+    +-->
+<div class="searchbox">
+<form action="http://www.google.com/search" method="get" class="roundtopsmall">
+<input value="" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">&nbsp; 
+                    <input name="Search" value="Search" type="submit">
+</form>
+</div>
+<!--+
+    |end search
+    +-->
+<!--+
+    |start Tabs
+    +-->
+<ul id="tabs">
+<li>
+<a class="unselected" href="http://hadoop.apache.org/pig/">Project</a>
+</li>
+<li>
+<a class="unselected" href="http://wiki.apache.org/pig/">Wiki</a>
+</li>
+<li class="current">
+<a class="selected" href="index.html">Pig 0.11.1 Documentation</a>
+</li>
+</ul>
+<!--+
+    |end Tabs
+    +-->
+</div>
+</div>
+<div id="main">
+<div id="publishedStrip">
+<!--+
+    |start Subtabs
+    +-->
+<div id="level2tabs"></div>
+<!--+
+    |end Subtabs
+    +-->
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+
+             &nbsp;
+           </div>
+<!--+
+    |start Menu, mainarea
+    +-->
+<!--+
+    |start Menu
+    +-->
+<div id="menu">
+<div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Pig</div>
+<div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;">
+<div class="menuitem">
+<a href="index.html">Overview</a>
+</div>
+<div class="menupage">
+<div class="menupagetitle">Getting Started</div>
+</div>
+<div class="menuitem">
+<a href="basic.html">Pig Latin Basics</a>
+</div>
+<div class="menuitem">
+<a href="func.html">Built In Functions</a>
+</div>
+<div class="menuitem">
+<a href="udf.html">User Defined Functions</a>
+</div>
+<div class="menuitem">
+<a href="cont.html">Control Structures</a>
+</div>
+<div class="menuitem">
+<a href="cmds.html">Shell and Utililty Commands</a>
+</div>
+<div class="menuitem">
+<a href="perf.html">Performance and Efficiency</a>
+</div>
+<div class="menuitem">
+<a href="test.html">Testing and Diagnostics</a>
+</div>
+<div class="menuitem">
+<a href="pig-index.html">Index</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Miscellaneous</div>
+<div id="menu_1.2" class="menuitemgroup">
+<div class="menuitem">
+<a href="api/">API Docs</a>
+</div>
+<div class="menuitem">
+<a href="jdiff/changes.html">API Changes</a>
+</div>
+<div class="menuitem">
+<a href="https://cwiki.apache.org/confluence/display/PIG">Wiki</a>
+</div>
+<div class="menuitem">
+<a href="https://cwiki.apache.org/confluence/display/PIG/FAQ">FAQ</a>
+</div>
+<div class="menuitem">
+<a href="http://hadoop.apache.org/pig/releases.html">Release Notes</a>
+</div>
+</div>
+<div id="credit"></div>
+<div id="roundbottom">
+<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
+<!--+
+  |alternative credits
+  +-->
+<div id="credit2"></div>
+</div>
+<!--+
+    |end Menu
+    +-->
+<!--+
+    |start content
+    +-->
+<div id="content">
+<div title="Portable Document Format" class="pdflink">
+<a class="dida" href="start.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
+        PDF</a>
+</div>
+<h1>Getting Started</h1>
+<div id="front-matter">
+<div id="minitoc-area">
+<ul class="minitoc">
+<li>
+<a href="#Pig+Setup">Pig Setup</a>
+<ul class="minitoc">
+<li>
+<a href="#req">Requirements</a>
+</li>
+<li>
+<a href="#download">Download Pig</a>
+</li>
+<li>
+<a href="#build">Build Pig</a>
+</li>
+</ul>
+</li>
+<li>
+<a href="#run">Running Pig </a>
+<ul class="minitoc">
+<li>
+<a href="#execution-modes">Execution Modes</a>
+</li>
+<li>
+<a href="#interactive-mode">Interactive Mode</a>
+</li>
+<li>
+<a href="#batch-mode">Batch Mode</a>
+</li>
+</ul>
+</li>
+<li>
+<a href="#pl-statements">Pig Latin Statements</a>
+<ul class="minitoc">
+<li>
+<a href="#data-load">Loading Data</a>
+</li>
+<li>
+<a href="#data-work-with">Working with Data</a>
+</li>
+<li>
+<a href="#data-store">Storing Intermediate Results</a>
+</li>
+<li>
+<a href="#data-results">Storing Final Results</a>
+</li>
+<li>
+<a href="#debug">Debugging Pig Latin</a>
+</li>
+</ul>
+</li>
+<li>
+<a href="#properties">Pig Properties</a>
+</li>
+<li>
+<a href="#tutorial">Pig Tutorial </a>
+<ul class="minitoc">
+<li>
+<a href="#Running+the+Pig+Scripts+in+Local+Mode"> Running the Pig Scripts in Local Mode</a>
+</li>
+<li>
+<a href="#Running+the+Pig+Scripts+in+Mapreduce+Mode"> Running the Pig Scripts in Mapreduce Mode</a>
+</li>
+<li>
+<a href="#Pig+Tutorial+Files"> Pig Tutorial Files</a>
+</li>
+<li>
+<a href="#pig-script-1"> Pig Script 1: Query Phrase Popularity</a>
+</li>
+<li>
+<a href="#pig-script-2">Pig Script 2: Temporal Query Phrase Popularity</a>
+</li>
+</ul>
+</li>
+</ul>
+</div>
+</div>
+  
+<!-- ========================================================== -->  
+
+<!-- SET UP PIG -->
+ 
+<a name="Pig+Setup"></a>
+<h2 class="h3">Pig Setup</h2>
+<div class="section">
+<a name="req"></a>
+<h3 class="h4">Requirements</h3>
+<p>
+<strong>Mandatory</strong>
+</p>
+<p>Unix and Windows users need the following:</p>
+<ul>
+		  
+<li> 
+<strong>Hadoop 0.20.2, 0.20.203, 0.20.204, 0.20.205, 1.0.0, 1.0.1, or 0.23.0, 0.23.1</strong> - <a href="http://hadoop.apache.org/common/releases.html">http://hadoop.apache.org/common/releases.html</a> (You can run Pig with different versions of Hadoop by setting HADOOP_HOME to point to the directory where you have installed Hadoop. If you do not set HADOOP_HOME, by default Pig will run with the embedded version, currently Hadoop 1.0.0.)</li>
+		  
+<li> 
+<strong>Java 1.6</strong> - <a href="http://java.sun.com/javase/downloads/index.jsp">http://java.sun.com/javase/downloads/index.jsp</a> (set JAVA_HOME to the root of your Java installation)</li>	
+		
+</ul>
+<p></p>
+<p>Windows users also need to install Cygwin and the Perl package: <a href="http://www.cygwin.com/"> http://www.cygwin.com/</a>
+</p>
+<p></p>
+<p>
+<strong>Optional</strong>
+</p>
+<ul>
+          
+<li> 
+<strong>Python 2.5</strong> - <a href="http://jython.org/downloads.html">http://jython.org/downloads.html</a> (when using Python UDFs or embedding Pig in Python) </li>
+          
+<li> 
+<strong>JavaScript 1.7</strong> - <a href="https://developer.mozilla.org/en/Rhino_downloads_archive">https://developer.mozilla.org/en/Rhino_downloads_archive</a> and <a href="http://mirrors.ibiblio.org/pub/mirrors/maven2/rhino/js/">http://mirrors.ibiblio.org/pub/mirrors/maven2/rhino/js/</a>  (when using JavaScript UDFs or embedding Pig in JavaScript) </li>		  
+          
+<li> 
+<strong>JRuby 1.6.7</strong> - <a href="http://www.jruby.org/download">http://www.jruby.org/download</a> (when using JRuby UDFs) </li>
+          
+<li> 
+<strong>Groovy (<em>groovy-all</em>) 1.8.6</strong> - <a href="http://groovy.codehaus.org/Download">http://groovy.codehaus.org/Download</a> or directly on a maven repo <a href="http://mirrors.ibiblio.org/pub/mirrors/maven2/org/codehaus/groovy/groovy-all/1.8.6/">http://mirrors.ibiblio.org/pub/mirrors/maven2/org/codehaus/groovy/groovy-all/1.8.6/</a> (when using Groovy UDFs or embedding Pig in Groovy) </li>
+		  
+<li> 
+<strong>Ant 1.7</strong> - <a href="http://ant.apache.org/">http://ant.apache.org/</a> (for builds) </li>
+		  
+<li> 
+<strong>JUnit 4.5</strong> - <a href="http://junit.sourceforge.net/">http://junit.sourceforge.net/</a> (for unit tests) </li>
+		
+</ul>
+<a name="download"></a>
+<h3 class="h4">Download Pig</h3>
+<p>To get a Pig distribution, do the following:</p>
+<ol>
+	
+<li>Download a recent stable release from one of the Apache Download Mirrors 
+	(see <a href="http://hadoop.apache.org/pig/releases.html"> Pig Releases</a>).</li>
+	
+    
+<li>Unpack the downloaded Pig distribution, and then note the following:
+	    <ul>
+	    
+<li>The Pig script file, pig, is located in the bin directory (/pig-n.n.n/bin/pig). 
+	    The Pig environment variables are described in the Pig script file.</li>
+	    
+<li>The Pig properties file, pig.properties, is located in the conf directory (/pig-n.n.n/conf/pig.properties). 
+	    You can specify an alternate location using the PIG_CONF_DIR environment variable.</li>
+	
+</ul>	
+	
+</li>
+	
+<li>Add /pig-n.n.n/bin to your path. Use export (bash, sh, ksh) or setenv (tcsh, csh). For example: <br>
+	
+<span class="codefrag">$ export PATH=/&lt;my-path-to-pig&gt;/pig-n.n.n/bin:$PATH</span>
+
+</li>
+
+<li>
+Test the Pig installation with this simple command: <span class="codefrag">$ pig -help</span>
+
+</li>
+
+</ol>
+<a name="build"></a>
+<h3 class="h4">Build Pig</h3>
+<p>To build Pig, do the following:</p>
+<ol>
+	  
+<li> Check out the Pig code from SVN: <span class="codefrag">svn co http://svn.apache.org/repos/asf/pig/trunk</span> 
+</li>
+	  
+<li> Build the code from the top directory: <span class="codefrag">ant</span> 
+<br>
+	  If the build is successful, you should see the pig.jar file created in that directory. </li>	
+	  
+<li> Validate the pig.jar  by running a unit test: <span class="codefrag">ant test</span>
+</li>
+     
+</ol>
+</div>
+
+  <!-- ==================================================================== -->
+    
+   <!-- RUNNING PIG  -->
+   
+<a name="run"></a>
+<h2 class="h3">Running Pig </h2>
+<div class="section">
+<p>You can run Pig (execute Pig Latin statements and Pig commands) using various modes.</p>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+	
+<tr>
+	
+<td colspan="1" rowspan="1"></td>
+    <td colspan="1" rowspan="1"><strong>Local Mode</strong></td>
+    <td colspan="1" rowspan="1"><strong>Mapreduce Mode</strong></td>
+	
+</tr>
+	
+<tr>
+	
+<td colspan="1" rowspan="1"><strong>Interactive Mode </strong></td>
+    <td colspan="1" rowspan="1">yes</td>
+    <td colspan="1" rowspan="1">yes</td>
+	
+</tr>
+	
+<tr>
+	
+<td colspan="1" rowspan="1"><strong>Batch Mode</strong> </td>
+    <td colspan="1" rowspan="1">yes</td>
+    <td colspan="1" rowspan="1">yes</td>
+	
+</tr>
+	
+</table>
+<a name="execution-modes"></a>
+<h3 class="h4">Execution Modes</h3>
+<p>Pig has two execution modes or exectypes: </p>
+<ul>
+
+<li>
+<strong>Local Mode</strong> - To run Pig in local mode, you need access to a single machine; all files are installed and run using your local host and file system. Specify local mode using the -x flag (pig -x local). Note that local mode does not support parallel mapper execution with Hadoop 0.20.x and 1.0.0. This is because the LocalJobRunner of these Hadoop versions is not thread-safe.
+</li>
+
+<li>
+<strong>Mapreduce Mode</strong> - To run Pig in mapreduce mode, you need access to a Hadoop cluster and HDFS installation. Mapreduce mode is the default mode; you can, <em>but don't need to</em>, specify it using the -x flag (pig OR pig -x mapreduce).
+</li>
+
+</ul>
+<p></p>
+<p>You can run Pig in either mode using the "pig" command (the bin/pig script) or the "java" command (java -cp pig.jar ...).
+</p>
+<a name="Examples"></a>
+<h4>Examples</h4>
+<p>This example shows how to run Pig in local and mapreduce mode using the pig command.</p>
+<pre class="code">
+/* local mode */
+$ pig -x local ...
+ 
+ 
+/* mapreduce mode */
+$ pig ...
+or
+$ pig -x mapreduce ...
+</pre>
+<p>This example shows how to run Pig in local and mapreduce mode using the java command.</p>
+<pre class="code">
+/* local mode */
+$ java -cp pig.jar org.apache.pig.Main -x local ...
+
+
+/* mapreduce mode */
+$ java -cp pig.jar org.apache.pig.Main ...
+or
+$ java -cp pig.jar org.apache.pig.Main -x mapreduce ...
+</pre>
+<a name="interactive-mode"></a>
+<h3 class="h4">Interactive Mode</h3>
+<p>You can run Pig in interactive mode using the Grunt shell. Invoke the Grunt shell using the "pig" command (as shown below) and then enter your Pig Latin statements and Pig commands interactively at the command line.
+</p>
+<a name="Example"></a>
+<h4>Example</h4>
+<p>These Pig Latin statements extract all user IDs from the /etc/passwd file. First, copy the /etc/passwd file to your local working directory. Next, invoke the Grunt shell by typing the "pig" command (in local or mapreduce mode). Then, enter the Pig Latin statements interactively at the grunt prompt (be sure to include the semicolon after each statement). The DUMP operator will display the results to your terminal screen.</p>
+<pre class="code">
+grunt&gt; A = load 'passwd' using PigStorage(':'); 
+grunt&gt; B = foreach A generate $0 as id; 
+grunt&gt; dump B; 
+</pre>
+<p>
+<strong>Local Mode</strong>
+</p>
+<pre class="code">
+$ pig -x local
+... - Connecting to ...
+grunt&gt; 
+</pre>
+<p>
+<strong>Mapreduce Mode</strong> 
+</p>
+<pre class="code">
+$ pig -x mapreduce
+... - Connecting to ...
+grunt&gt; 
+
+or
+
+$ pig 
+... - Connecting to ...
+grunt&gt; 
+</pre>
+<a name="batch-mode"></a>
+<h3 class="h4">Batch Mode</h3>
+<p>You can run Pig in batch mode using <a href="#pig-scripts">Pig scripts</a> and the "pig" command (in local or mapreduce mode).</p>
+<a name="Example-N101A4"></a>
+<h4>Example</h4>
+<p>The Pig Latin statements in the Pig script (id.pig) extract all user IDs from the /etc/passwd file. First, copy the /etc/passwd file to your local working directory. Next, run the Pig script from the command line (using local or mapreduce mode). The STORE operator will write the results to a file (id.out).</p>
+<pre class="code">
+/* id.pig */
+
+A = load 'passwd' using PigStorage(':');  -- load the passwd file 
+B = foreach A generate $0 as id;  -- extract the user IDs 
+store B into 'id.out';  -- write the results to a file named id.out
+</pre>
+<p>
+<strong>Local Mode</strong>
+</p>
+<pre class="code">
+$ pig -x local id.pig
+</pre>
+<p>
+<strong>Mapreduce Mode</strong> 
+</p>
+<pre class="code">
+$ pig id.pig
+or
+$ pig -x mapreduce id.pig
+</pre>
+<a name="pig-scripts"></a>
+<h4>Pig Scripts</h4>
+<p>Use Pig scripts to place Pig Latin statements and Pig commands in a single file. While not required, it is good practice to identify the file using the *.pig extension.</p>
+<p>You can run Pig scripts from the command line and from the Grunt shell
+(see the <a href="cmds.html#run">run</a> and <a href="cmds.html#exec">exec</a> commands). </p>
+<p>Pig scripts allow you to pass values to parameters using <a href="cont.html#Parameter-Sub">parameter substitution</a>. </p>
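+<p>For example, a minimal sketch of a parameterized script (the script and parameter names below are illustrative):</p>
+<pre class="code">
+/* myid.pig -- reads the file supplied via the "input" parameter */
+A = load '$input' using PigStorage(':');
+B = foreach A generate $0 as id;
+dump B;
+</pre>
+<p>Pass the value on the command line:</p>
+<pre class="code">
+$ pig -x local -param input=passwd myid.pig
+</pre>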
+<a name="comments"></a>
+<p id="comments">
+<strong>Comments in Scripts</strong>
+</p>
+<p>You can include comments in Pig scripts:</p>
+<ul>
+      
+<li>
+         
+<p>For multi-line comments use /* &hellip; */</p>
+      
+</li>
+      
+<li>
+         
+<p>For single-line comments use --</p>
+      
+</li>
+   
+</ul>
+<pre class="code">
+/* myscript.pig
+My script is simple.
+It includes three Pig Latin statements.
+*/
+
+A = LOAD 'student' USING PigStorage() AS (name:chararray, age:int, gpa:float); -- loading data
+B = FOREACH A GENERATE name;  -- transforming data
+DUMP B;  -- retrieving results
+</pre>
+<a name="dfs"></a>
+<p id="dfs">
+<strong>Scripts and Distributed File Systems</strong>
+</p>
+<p>Pig supports running scripts (and Jar files) that are stored in HDFS, Amazon S3, and other distributed file systems. The script's full location URI is required (see <a href="basic.html#REGISTER">REGISTER</a> for information about Jar files). For example, to run a Pig script on HDFS, do the following:</p>
+<pre class="code">
+$ pig hdfs://nn.mydomain.com:9020/myscripts/script.pig
+</pre>
+</div>
+
+  <!-- ==================================================================== -->
+    
+   <!-- PIG LATIN STATEMENTS -->
+   
+<a name="pl-statements"></a>
+<h2 class="h3">Pig Latin Statements</h2>
+<div class="section">
+<p>Pig Latin statements are the basic constructs you use to process data using Pig. 
+   A Pig Latin statement is an operator that takes a <a href="basic.html#relations">relation</a> as input and produces another relation as output. 
+   (This definition applies to all Pig Latin operators except LOAD and STORE which read data from and write data to the file system.) 
+   Pig Latin statements may include <a href="basic.html#Expressions">expressions</a> and <a href="basic.html#Schemas">schemas</a>. 
+   Pig Latin statements can span multiple lines and must end with a semi-colon ( ; ).  
+   By default, Pig Latin statements are processed using <a href="perf.html#multi-query-execution">multi-query execution</a>.  
+ </p>
+<p>Pig Latin statements are generally organized as follows:</p>
+<ul>
+      
+<li>
+         
+<p>A LOAD statement to read data from the file system. </p>
+      
+</li>
+      
+<li>
+         
+<p>A series of "transformation" statements to process the data. </p>
+      
+</li>
+      
+<li>
+         
+<p>A DUMP statement to view results or a STORE statement to save the results.</p>
+      
+</li>
+   
+</ul>
+<p></p>
+<p>Note that a DUMP or STORE statement is required to generate output.</p>
+<ul>
+
+<li>
+
+<p>In this example Pig will validate, but not execute, the LOAD and FOREACH statements.</p>
+
+<pre class="code">
+A = LOAD 'student' USING PigStorage() AS (name:chararray, age:int, gpa:float);
+B = FOREACH A GENERATE name;
+</pre> 
+
+</li>
+
+<li>
+
+<p>In this example, Pig will validate and then execute the LOAD, FOREACH, and DUMP statements.</p>
+
+<pre class="code">
+A = LOAD 'student' USING PigStorage() AS (name:chararray, age:int, gpa:float);
+B = FOREACH A GENERATE name;
+DUMP B;
+(John)
+(Mary)
+(Bill)
+(Joe)
+</pre>
+
+</li>
+
+</ul>
+<a name="data-load"></a>
+<h3 class="h4">Loading Data</h3>
+<p>Use the  <a href="basic.html#LOAD">LOAD</a> operator and the <a href="udf.html#load-store-functions">load/store functions</a> to read data into Pig (PigStorage is the default load function).</p>
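+<p>For example, a minimal sketch (the file name and schema are illustrative) that reads tab-delimited text with PigStorage, the default load function:</p>
+<pre class="code">
+A = LOAD 'student' USING PigStorage('\t') AS (name:chararray, age:int, gpa:float);
+</pre>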
+<a name="data-work-with"></a>
+<h3 class="h4">Working with Data</h3>
+<p>Pig allows you to transform data in many ways. As a starting point, become familiar with these operators (a short sketch follows this list):</p>
+<ul>
+      
+<li>
+         
+<p>Use the <a href="basic.html#FILTER">FILTER</a> operator to work with tuples or rows of data. 
+         Use the <a href="basic.html#FOREACH">FOREACH</a> operator to work with columns of data.</p>
+      
+</li>
+      
+<li>
+         
+<p>Use the <a href="basic.html#GROUP ">GROUP</a> operator to group data in a single relation. 
+         Use the <a href="basic.html#COGROUP ">COGROUP</a>,
+         <a href="basic.html#join-inner">inner JOIN</a>, and
+         <a href="basic.html#join-outer">outer JOIN</a>
+         operators  to group or join data in two or more relations.</p>
+      
+</li>
+      
+<li>
+         
+<p>Use the <a href="basic.html#UNION">UNION</a> operator to merge the contents of two or more relations. 
+         Use the <a href="basic.html#SPLIT">SPLIT</a> operator to partition the contents of a relation into multiple relations.</p>
+      
+</li>
+   
+</ul>
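+<p>A minimal sketch that strings several of these operators together (the input file and schema are illustrative):</p>
+<pre class="code">
+A = LOAD 'student' USING PigStorage() AS (name:chararray, age:int, gpa:float);
+B = FILTER A BY gpa &gt;= 3.5;             -- keep only the tuples that pass the condition
+C = GROUP B BY name;                     -- group the filtered tuples in a single relation
+D = FOREACH C GENERATE group, COUNT(B);  -- compute a value per group
+DUMP D;
+</pre>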
+<a name="data-store"></a>
+<h3 class="h4">Storing Intermediate Results</h3>
+<p>Pig stores the intermediate data generated between MapReduce jobs in a temporary location on HDFS. 
+   This location must already exist on HDFS prior to use. 
+   This location can be configured using the pig.temp.dir property. The property's default value is "/tmp" which is the same 
+   as the hardcoded location in Pig 0.7.0 and earlier versions. </p>
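+<p>For example, to redirect intermediate data to a different location (the path below is illustrative), you can set the property on the command line:</p>
+<pre class="code">
+$ pig -Dpig.temp.dir=/user/me/pigtmp script.pig
+</pre>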
+<a name="data-results"></a>
+<h3 class="h4">Storing Final Results</h3>
+<p>Use the  <a href="basic.html#STORE">STORE</a> operator and the <a href="udf.html#load-store-functions">load/store functions</a> 
+   to write results to the file system (PigStorage is the default store function). </p>
+<p>
+<strong>Note:</strong> During the testing/debugging phase of your implementation, you can use DUMP to display results to your terminal screen. 
+However, in a production environment you always want to use the STORE operator to save your results (see <a href="perf.html#Store-Dump">Store vs. Dump</a>).</p>
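+<p>A minimal sketch of STORE (the output path is illustrative; the results are written as part files under that directory):</p>
+<pre class="code">
+STORE B INTO 'myoutput' USING PigStorage(',');
+</pre>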
+<a name="debug"></a>
+<h3 class="h4">Debugging Pig Latin</h3>
+<p>Pig Latin provides operators that can help you debug your Pig Latin statements (a short Grunt session follows this list):</p>
+<ul>
+      
+<li>
+         
+<p>Use the  <a href="test.html#DUMP">DUMP</a> operator to display results to your terminal screen. </p>
+      
+</li>
+      
+<li>
+         
+<p>Use the  <a href="test.html#DESCRIBE">DESCRIBE</a> operator to review the schema of a relation.</p>
+      
+</li>
+      
+<li>
+         
+<p>Use the  <a href="test.html#EXPLAIN">EXPLAIN</a> operator to view the logical, physical, or map reduce execution plans to compute a relation.</p>
+      
+</li>
+      
+<li>
+         
+<p>Use the  <a href="test.html#ILLUSTRATE">ILLUSTRATE</a> operator to view the step-by-step execution of a series of statements.</p>
+      
+</li>
+   
+</ul>
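+<p>A short Grunt session sketch using these operators (relation names follow the earlier examples):</p>
+<pre class="code">
+grunt&gt; A = LOAD 'student' USING PigStorage() AS (name:chararray, age:int, gpa:float);
+grunt&gt; DESCRIBE A;
+A: {name: chararray,age: int,gpa: float}
+grunt&gt; B = FOREACH A GENERATE name;
+grunt&gt; EXPLAIN B;     -- prints the logical, physical, and map reduce plans
+grunt&gt; ILLUSTRATE B;  -- shows sample data moving through each statement
+grunt&gt; DUMP B;
+</pre>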
+</div>  
+
+
+<!-- ================================================================== -->
+<!-- PIG PROPERTIES -->
+
+<a name="properties"></a>
+<h2 class="h3">Pig Properties</h2>
+<div class="section">
+<p>Pig supports a number of Java properties that you can use to customize Pig behavior. You can retrieve a list of the properties using the <a href="cmds.html#help">help properties</a> command. All of these properties are optional; none are required. </p>
+<p></p>
+<a name="pig-properties"></a>
+<p id="pig-properties">To specify Pig properties use one of these mechanisms:</p>
+<ul>
+	
+<li>The pig.properties file (add the directory that contains the pig.properties file to the classpath)</li>
+	
+<li>The -D command line option and a Pig property (pig -Dpig.tmpfilecompression=true)</li>
+	
+<li>The -P command line option and a properties file (pig -P mypig.properties)</li>
+	
+<li>The <a href="cmds.html#set">set</a> command (set pig.exec.nocombiner true)</li>
+
+</ul>
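+<p>For example, the same property specified three ways (mypig.properties is the illustrative file name from the list above):</p>
+<pre class="code">
+# mypig.properties (standard Java property file format)
+pig.tmpfilecompression=true
+
+$ pig -P mypig.properties script.pig
+$ pig -Dpig.tmpfilecompression=true script.pig
+
+grunt&gt; set pig.tmpfilecompression true
+</pre>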
+<p>
+<strong>Note:</strong> The properties file uses standard Java property file format.</p>
+<p>The following precedence order is supported: pig.properties &lt; -D Pig property &lt; -P properties file &lt; set command. This means that if the same property is provided using the -D command line option as well as the -P command line option (properties file), the value of the property in the properties file will take precedence.</p>
+<a name="hadoop-properties"></a>
+<p id="hadoop-properties">To specify Hadoop properties you can use the same mechanisms:</p>
+<ul>
+	
+<li>Hadoop configuration files (include pig-cluster-hadoop-site.xml)</li>
+	
+<li>The -D command line option and a Hadoop property (pig -Dmapreduce.task.profile=true) </li>
+	
+<li>The -P command line option and a property file (pig -P property_file)</li>
+	
+<li>The <a href="cmds.html#set">set</a> command (set mapred.map.tasks.speculative.execution false)</li>
+
+</ul>
+<p></p>
+<p>The same precedence holds: Hadoop configuration files &lt; -D Hadoop property &lt; -P properties_file &lt; set command.</p>
+<p>Hadoop properties are not interpreted by Pig but are passed directly to Hadoop. Any Hadoop property can be passed this way. </p>
+<p>All properties that Pig collects, including Hadoop properties, are available to any UDF via the UDFContext object. To get access to the properties, you can call the getJobConf method.</p>
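+<p>A minimal sketch of reading a collected property inside a Java UDF (the class name is illustrative):</p>
+<pre class="code">
+import java.io.IOException;
+import org.apache.pig.EvalFunc;
+import org.apache.pig.data.Tuple;
+import org.apache.pig.impl.util.UDFContext;
+
+public class TempDirEcho extends EvalFunc&lt;String&gt; {
+    public String exec(Tuple input) throws IOException {
+        // getJobConf exposes all properties Pig collected, including Hadoop properties
+        return UDFContext.getUDFContext().getJobConf().get("pig.temp.dir");
+    }
+}
+</pre>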
+</div>  
+
+
+  <!-- ==================================================================== -->
+  <!-- PIG TUTORIAL -->
+  
+<a name="tutorial"></a>
+<h2 class="h3">Pig Tutorial </h2>
+<div class="section">
+<p>The Pig tutorial shows you how to run Pig scripts using Pig's local mode and mapreduce mode (see <a href="#execution-modes">Execution Modes</a>).</p>
+<p>To get started, do the following preliminary tasks:</p>
+<ol>
+
+<li>Make sure the JAVA_HOME environment variable is set to the root of your Java installation.</li>
+
+<li>Make sure your PATH includes bin/pig (this enables you to run the tutorials using the "pig" command). 
+<pre class="code">
+$ export PATH=/&lt;my-path-to-pig&gt;/pig-0.11.1/bin:$PATH 
+</pre>
+
+</li>
+
+<li>Set the PIG_HOME environment variable:
+<pre class="code">
+$ export PIG_HOME=/&lt;my-path-to-pig&gt;/pig-0.11.1 
+</pre>
+</li>
+
+<li>Create the pigtutorial.tar.gz file:
+<ul>
+    
+<li>Move to the Pig tutorial directory (.../pig-0.11.1/tutorial).</li>
+	
+<li>Edit the build.xml file in the tutorial directory. 
+<pre class="code">
+Change this:   &lt;property name="pigjar" value="../pig.jar" /&gt;
+To this:       &lt;property name="pigjar" value="../pig-0.11.1-core.jar" /&gt;
+</pre>
+	
+</li>
+	
+<li>Run the "ant" command from the tutorial directory. This will create the pigtutorial.tar.gz file.
+	</li>
+
+</ul>
+
+
+</li>
+
+<li>Copy the pigtutorial.tar.gz file from the Pig tutorial directory to your local directory. </li>
+
+<li>Unzip the pigtutorial.tar.gz file.
+<pre class="code">
+$ tar -xzf pigtutorial.tar.gz
+</pre>
+
+</li>
+
+<li>A new directory named pigtmp is created. This directory contains the <a href="#Pig+Tutorial+Files">Pig Tutorial Files</a>. These files work with Hadoop 0.20.2 and include everything you need to run <a href="#pig-script-1">Pig Script 1</a> and <a href="#pig-script-2">Pig Script 2</a>.</li>
+
+</ol>
+<a name="Running+the+Pig+Scripts+in+Local+Mode"></a>
+<h3 class="h4"> Running the Pig Scripts in Local Mode</h3>
+<p>To run the Pig scripts in local mode, do the following: </p>
+<ol>
+
+<li>Move to the pigtmp directory.</li>
+
+<li>Execute the following command (using either script1-local.pig or script2-local.pig). 
+<pre class="code">
+$ pig -x local script1-local.pig
+</pre>
+
+</li>
+
+<li>Review the result files, located in the script's output directory (the data is in the part-r-00000 file).
+<p>The output may contain a few Hadoop warnings which can be ignored:</p>
+
+<pre class="code">
+2010-04-08 12:55:33,642 [main] INFO  org.apache.hadoop.metrics.jvm.JvmMetrics 
+- Cannot initialize JVM Metrics with processName=JobTracker, sessionId= - already initialized
+</pre>
+
+</li>
+
+</ol>
+<a name="Running+the+Pig+Scripts+in+Mapreduce+Mode"></a>
+<h3 class="h4"> Running the Pig Scripts in Mapreduce Mode</h3>
+<p>To run the Pig scripts in mapreduce mode, do the following: </p>
+<ol>
+
+<li>Move to the pigtmp directory.</li>
+
+<li>Copy the excite.log.bz2 file from the pigtmp directory to your HDFS home directory.
+<pre class="code">
+$ hadoop fs -copyFromLocal excite.log.bz2 .
+</pre>
+
+</li>
+
+
+<li>Set the PIG_CLASSPATH environment variable to the location of the cluster configuration directory (the directory that contains the core-site.xml, hdfs-site.xml and mapred-site.xml files):
+<pre class="code">
+export PIG_CLASSPATH=/mycluster/conf
+</pre>
+</li>
+
+<li>Set the HADOOP_CONF_DIR environment variable to the location of the cluster configuration directory:
+<pre class="code">
+export HADOOP_CONF_DIR=/mycluster/conf
+</pre>
+</li>
+
+
+<li>Execute the following command (using either script1-hadoop.pig or script2-hadoop.pig):
+<pre class="code">
+$ pig script1-hadoop.pig
+</pre>
+
+</li>
+
+
+<li>Review the result files, located in the script1-hadoop-results or script2-hadoop-results HDFS directory:
+<pre class="code">
+$ hadoop fs -ls script1-hadoop-results
+$ hadoop fs -cat 'script1-hadoop-results/*' | less
+</pre>
+
+</li>
+
+</ol>
+<a name="Pig+Tutorial+Files"></a>
+<h3 class="h4"> Pig Tutorial Files</h3>
+<p>The contents of the Pig tutorial file (pigtutorial.tar.gz) are described here. </p>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+
+<tr>
+
+<td colspan="1" rowspan="1">
+
+<p> 
+<strong>File</strong> 
+</p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p> 
+<strong>Description</strong>
+</p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+
+<p> pig.jar </p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p> Pig JAR file </p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+
+<p> tutorial.jar </p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p> User defined functions (UDFs) and Java classes </p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+
+<p> script1-local.pig </p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p> Pig Script 1, Query Phrase Popularity (local mode) </p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+
+<p> script1-hadoop.pig </p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p> Pig Script 1, Query Phrase Popularity (mapreduce mode) </p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+
+<p> script2-local.pig </p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p> Pig Script 2, Temporal Query Phrase Popularity (local mode)</p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+
+<p> script2-hadoop.pig </p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p> Pig Script 2, Temporal Query Phrase Popularity (mapreduce mode) </p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+
+<p> excite-small.log </p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p> Log file, Excite search engine (local mode) </p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+
+<p> excite.log.bz2 </p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p> Log file, Excite search engine (mapreduce) </p>
+
+</td>
+
+</tr>
+
+</table>
+<p>The user defined functions (UDFs) are described here. </p>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+
+<tr>
+
+<td colspan="1" rowspan="1">
+
+<p> 
+<strong>UDF</strong> 
+</p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p> 
+<strong>Description</strong>
+</p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+
+<p> ExtractHour </p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p> Extracts the hour from the record.</p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+
+<p> NGramGenerator </p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p> Composes n-grams from the set of words. </p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+
+<p> NonURLDetector </p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p> Removes the record if the query field is empty or a URL. </p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+
+<p> ScoreGenerator </p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p> Calculates a "popularity" score for the n-gram.</p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+
+<p> ToLower </p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p> Changes the query field to lowercase. </p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+
+<p> TutorialUtil </p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p> Divides the query string into a set of words.</p>
+
+</td>
+
+</tr>
+
+</table>
+<a name="pig-script-1"></a>
+<h3 class="h4"> Pig Script 1: Query Phrase Popularity</h3>
+<p>The Query Phrase Popularity script (script1-local.pig or script1-hadoop.pig) processes a search query log file from the Excite search engine and finds search phrases that occur with particularly high frequency during certain times of the day. </p>
+<p>The script is shown here: </p>
+<ul>
+
+<li>
+<p> Register the tutorial JAR file so that the included UDFs can be called in the script. </p>
+
+</li>
+
+</ul>
+<pre class="code">
+REGISTER ./tutorial.jar; 
+</pre>
+<ul>
+
+<li>
+<p> Use the PigStorage function to load the excite log file (excite.log or excite-small.log) into the &ldquo;raw&rdquo; bag as an array of records with the fields <strong>user</strong>, <strong>time</strong>, and <strong>query</strong>.  </p>
+
+</li>
+
+</ul>
+<pre class="code">
+raw = LOAD 'excite.log' USING PigStorage('\t') AS (user, time, query);
+</pre>
+<ul>
+
+<li>
+<p> Call the NonURLDetector UDF to remove records if the query field is empty or a URL.  </p>
+
+</li>
+
+</ul>
+<pre class="code">
+clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query);
+</pre>
+<ul>
+
+<li>
+<p> Call the ToLower UDF to change the query field to lowercase.  </p>
+
+</li>
+
+</ul>
+<pre class="code">
+clean2 = FOREACH clean1 GENERATE user, time, org.apache.pig.tutorial.ToLower(query) as query;
+</pre>
+<ul>
+
+<li>
+<p> Because the log file only contains queries for a single day, we are only interested in the hour. The excite query log timestamp format is YYMMDDHHMMSS. Call the ExtractHour UDF to extract the hour (HH) from the time field. </p>
+
+</li>
+
+</ul>
+<pre class="code">
+houred = FOREACH clean2 GENERATE user, org.apache.pig.tutorial.ExtractHour(time) as hour, query;
+</pre>
+<ul>
+
+<li>
+<p> Call the NGramGenerator UDF to compose the n-grams of the query. </p>
+
+</li>
+
+</ul>
+<pre class="code">
+ngramed1 = FOREACH houred GENERATE user, hour, flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram;
+</pre>
+<ul>
+
+<li>
+<p> Use the DISTINCT operator to get the unique n-grams for all records.  </p>
+
+</li>
+
+</ul>
+<pre class="code">
+ngramed2 = DISTINCT ngramed1;
+</pre>
+<ul>
+
+<li>
+<p> Use the GROUP operator to group records by n-gram and hour. </p>
+
+</li>
+
+</ul>
+<pre class="code">
+hour_frequency1 = GROUP ngramed2 BY (ngram, hour);
+</pre>
+<ul>
+
+<li>
+<p> Use the COUNT function to get the count (occurrences) of each n-gram.  </p>
+
+</li>
+
+</ul>
+<pre class="code">
+hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as count;
+</pre>
+<ul>
+
+<li>
+<p> Use the GROUP operator to group records by n-gram only. Each group now corresponds to a distinct n-gram and has the count for each hour. </p>
+
+</li>
+
+</ul>
+<pre class="code">
+uniq_frequency1 = GROUP hour_frequency2 BY group::ngram;
+</pre>
+<ul>
+
+<li>
+<p> For each group, identify the hour in which this n-gram is used with a particularly high frequency. Call the ScoreGenerator UDF to calculate a "popularity" score for the n-gram. </p>
+
+</li>
+
+</ul>
+<pre class="code">
+uniq_frequency2 = FOREACH uniq_frequency1 GENERATE flatten($0), flatten(org.apache.pig.tutorial.ScoreGenerator($1));
+</pre>
+<ul>
+
+<li>
+<p> Use the FOREACH-GENERATE operator to assign names to the fields.  </p>
+
+</li>
+
+</ul>
+<pre class="code">
+uniq_frequency3 = FOREACH uniq_frequency2 GENERATE $1 as hour, $0 as ngram, $2 as score, $3 as count, $4 as mean;
+</pre>
+<ul>
+
+<li>
+<p> Use the FILTER operator to remove all records with a score less than or equal to 2.0. </p>
+
+</li>
+
+</ul>
+<pre class="code">
+filtered_uniq_frequency = FILTER uniq_frequency3 BY score &gt; 2.0;
+</pre>
+<ul>
+
+<li>
+<p> Use the ORDER operator to sort the remaining records by hour and score. </p>
+
+</li>
+
+</ul>
+<pre class="code">
+ordered_uniq_frequency = ORDER filtered_uniq_frequency BY hour, score;
+</pre>
+<ul>
+
+<li>
+<p> Use the PigStorage function to store the results. The output file contains a list of n-grams with the following fields: <strong>hour</strong>, <strong>ngram</strong>, <strong>score</strong>, <strong>count</strong>, <strong>mean</strong>. </p>
+
+</li>
+
+</ul>
+<pre class="code">
+STORE ordered_uniq_frequency INTO '/tmp/tutorial-results' USING PigStorage(); 
+</pre>
+<a name="pig-script-2"></a>
+<h3 class="h4">Pig Script 2: Temporal Query Phrase Popularity</h3>
+<p>The Temporal Query Phrase Popularity script (script2-local.pig or script2-hadoop.pig) processes a search query log file from the Excite search engine and compares the frequency of search phrases across two time periods separated by twelve hours. </p>
+<p>The script is shown here: </p>
+<ul>
+
+<li>
+<p> Register the tutorial JAR file so that the user defined functions (UDFs) can be called in the script. </p>
+
+</li>
+
+</ul>
+<pre class="code">
+REGISTER ./tutorial.jar;
+</pre>
+<ul>
+
+<li>
+<p> Use the PigStorage function to load the excite log file (excite.log or excite-small.log) into the &ldquo;raw&rdquo; bag as an array of records with the fields <strong>user</strong>, <strong>time</strong>, and <strong>query</strong>. </p>
+
+</li>
+
+</ul>
+<pre class="code">
+raw = LOAD 'excite.log' USING PigStorage('\t') AS (user, time, query);
+</pre>
+<ul>
+
+<li>
+<p> Call the NonURLDetector UDF to remove records if the query field is empty or a URL. </p>
+
+</li>
+
+</ul>
+<pre class="code">
+clean1 = FILTER raw BY org.apache.pig.tutorial.NonURLDetector(query);
+</pre>
+<ul>
+
+<li>
+<p> Call the ToLower UDF to change the query field to lowercase. </p>
+
+</li>
+
+</ul>
+<pre class="code">
+clean2 = FOREACH clean1 GENERATE user, time, org.apache.pig.tutorial.ToLower(query) as query;
+</pre>
+<ul>
+
+<li>
+<p> Because the log file only contains queries for a single day, we are only interested in the hour. The excite query log timestamp format is YYMMDDHHMMSS. Call the ExtractHour UDF to extract the hour from the time field. </p>
+
+</li>
+
+</ul>
+<pre class="code">
+houred = FOREACH clean2 GENERATE user, org.apache.pig.tutorial.ExtractHour(time) as hour, query;
+</pre>
+<ul>
+
+<li>
+<p> Call the NGramGenerator UDF to compose the n-grams of the query. </p>
+
+</li>
+
+</ul>
+<pre class="code">
+ngramed1 = FOREACH houred GENERATE user, hour, flatten(org.apache.pig.tutorial.NGramGenerator(query)) as ngram;
+</pre>
+<ul>
+
+<li>
+<p> Use the DISTINCT operator to get the unique n-grams for all records.  </p>
+
+</li>
+
+</ul>
+<pre class="code">
+ngramed2 = DISTINCT ngramed1;
+</pre>
+<ul>
+
+<li>
+<p> Use the GROUP operator to group the records by n-gram and hour.  </p>
+
+</li>
+
+</ul>
+<pre class="code">
+hour_frequency1 = GROUP ngramed2 BY (ngram, hour);
+</pre>
+<ul>
+
+<li>
+<p> Use the COUNT function to get the count (occurrences) of each n-gram.  </p>
+
+</li>
+
+</ul>
+<pre class="code">
+hour_frequency2 = FOREACH hour_frequency1 GENERATE flatten($0), COUNT($1) as count;
+</pre>
+<ul>
+
+<li>
+<p> Use the FOREACH-GENERATE operator to assign names to the fields. </p>
+
+</li>
+
+</ul>
+<pre class="code">
+hour_frequency3 = FOREACH hour_frequency2 GENERATE $0 as ngram, $1 as hour, $2 as count;
+</pre>
+<ul>
+
+<li>
+<p> Use the FILTER operator to get the n-grams for hour &lsquo;00&rsquo;. </p>
+
+</li>
+
+</ul>
+<pre class="code">
+hour00 = FILTER hour_frequency2 BY hour eq '00';
+</pre>
+<ul>
+
+<li>
+<p> Use the FILTER operator to get the n-grams for hour &lsquo;12&rsquo;. </p>
+
+</li>
+
+</ul>
+<pre class="code">
+hour12 = FILTER hour_frequency3 BY hour eq '12';
+</pre>
+<ul>
+
+<li>
+<p> Use the JOIN operator to get the n-grams that appear in both hours. </p>
+
+</li>
+
+</ul>
+<pre class="code">
+same = JOIN hour00 BY $0, hour12 BY $0;
+</pre>
+<ul>
+
+<li>
+<p> Use the FOREACH-GENERATE operator to record their frequency. </p>
+
+</li>
+
+</ul>
+<pre class="code">
+same1 = FOREACH same GENERATE hour_frequency2::hour00::group::ngram as ngram, $2 as count00, $5 as count12;
+</pre>
+<ul>
+
+<li>
+<p> Use the PigStorage function to store the results. The output file contains a list of n-grams with the following fields: <strong>ngram</strong>, <strong>count00</strong>, <strong>count12</strong>. </p>
+
+</li>
+
+</ul>
+<pre class="code">
+STORE same1 INTO '/tmp/tutorial-join-results' USING PigStorage();
+</pre>
+</div>
+
+
+
+</div>
+<!--+
+    |end content
+    +-->
+<div class="clearboth">&nbsp;</div>
+</div>
+<div id="footer">
+<!--+
+    |start bottomstrip
+    +-->
+<div class="lastmodified">
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<div class="copyright">
+        Copyright &copy;
+         2007-2013 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
+</div>
+<!--+
+    |end bottomstrip
+    +-->
+</div>
+</body>
+</html>

Added: pig/site/publish/docs/r0.11.1/start.pdf
URL: http://svn.apache.org/viewvc/pig/site/publish/docs/r0.11.1/start.pdf?rev=1463367&view=auto
==============================================================================
Binary file - no diff available.

Propchange: pig/site/publish/docs/r0.11.1/start.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: pig/site/publish/docs/r0.11.1/test.html
URL: http://svn.apache.org/viewvc/pig/site/publish/docs/r0.11.1/test.html?rev=1463367&view=auto
==============================================================================
--- pig/site/publish/docs/r0.11.1/test.html (added)
+++ pig/site/publish/docs/r0.11.1/test.html Tue Apr  2 03:32:39 2013
@@ -0,0 +1,1406 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<meta content="Apache Forrest" name="Generator">
+<meta name="Forrest-version" content="0.9">
+<meta name="Forrest-skin-name" content="pelt">
+<title>Testing and Diagnostics</title>
+<link type="text/css" href="skin/basic.css" rel="stylesheet">
+<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
+<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
+<link type="text/css" href="skin/profile.css" rel="stylesheet">
+<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
+<link rel="shortcut icon" href="">
+</head>
+<body onload="init()">
+<script type="text/javascript">ndeSetTextSize();</script>
+<div id="top">
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+<a href="http://www.apache.org/">Apache</a> &gt; <a href="http://hadoop.apache.org/">Hadoop</a> &gt; <a href="http://hadoop.apache.org/pig/">Pig</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+</div>
+<!--+
+    |header
+    +-->
+<div class="header">
+<!--+
+    |start group logo
+    +-->
+<div class="grouplogo">
+<a href="http://hadoop.apache.org/"><img class="logoImage" alt="Hadoop" src="images/hadoop-logo.jpg" title="Apache Hadoop"></a>
+</div>
+<!--+
+    |end group logo
+    +-->
+<!--+
+    |start Project Logo
+    +-->
+<div class="projectlogo">
+<a href="http://hadoop.apache.org/pig/"><img class="logoImage" alt="Pig" src="images/pig-logo.gif" title="A platform for analyzing large datasets."></a>
+</div>
+<!--+
+    |end Project Logo
+    +-->
+<!--+
+    |start Search
+    +-->
+<div class="searchbox">
+<form action="http://www.google.com/search" method="get" class="roundtopsmall">
+<input value="" name="sitesearch" type="hidden"><input onFocus="getBlank (this, 'Search the site with google');" size="25" name="q" id="query" type="text" value="Search the site with google">&nbsp; 
+                    <input name="Search" value="Search" type="submit">
+</form>
+</div>
+<!--+
+    |end search
+    +-->
+<!--+
+    |start Tabs
+    +-->
+<ul id="tabs">
+<li>
+<a class="unselected" href="http://hadoop.apache.org/pig/">Project</a>
+</li>
+<li>
+<a class="unselected" href="http://wiki.apache.org/pig/">Wiki</a>
+</li>
+<li class="current">
+<a class="selected" href="index.html">Pig 0.11.1 Documentation</a>
+</li>
+</ul>
+<!--+
+    |end Tabs
+    +-->
+</div>
+</div>
+<div id="main">
+<div id="publishedStrip">
+<!--+
+    |start Subtabs
+    +-->
+<div id="level2tabs"></div>
+<!--+
+    |end Subtabs
+    +-->
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+
+             &nbsp;
+           </div>
+<!--+
+    |start Menu, mainarea
+    +-->
+<!--+
+    |start Menu
+    +-->
+<div id="menu">
+<div onclick="SwitchMenu('menu_selected_1.1', 'skin/')" id="menu_selected_1.1Title" class="menutitle" style="background-image: url('skin/images/chapter_open.gif');">Pig</div>
+<div id="menu_selected_1.1" class="selectedmenuitemgroup" style="display: block;">
+<div class="menuitem">
+<a href="index.html">Overview</a>
+</div>
+<div class="menuitem">
+<a href="start.html">Getting Started</a>
+</div>
+<div class="menuitem">
+<a href="basic.html">Pig Latin Basics</a>
+</div>
+<div class="menuitem">
+<a href="func.html">Built In Functions</a>
+</div>
+<div class="menuitem">
+<a href="udf.html">User Defined Functions</a>
+</div>
+<div class="menuitem">
+<a href="cont.html">Control Structures</a>
+</div>
+<div class="menuitem">
+<a href="cmds.html">Shell and Utililty Commands</a>
+</div>
+<div class="menuitem">
+<a href="perf.html">Performance and Efficiency</a>
+</div>
+<div class="menupage">
+<div class="menupagetitle">Testing and Diagnostics</div>
+</div>
+<div class="menuitem">
+<a href="pig-index.html">Index</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Miscellaneous</div>
+<div id="menu_1.2" class="menuitemgroup">
+<div class="menuitem">
+<a href="api/">API Docs</a>
+</div>
+<div class="menuitem">
+<a href="jdiff/changes.html">API Changes</a>
+</div>
+<div class="menuitem">
+<a href="https://cwiki.apache.org/confluence/display/PIG">Wiki</a>
+</div>
+<div class="menuitem">
+<a href="https://cwiki.apache.org/confluence/display/PIG/FAQ">FAQ</a>
+</div>
+<div class="menuitem">
+<a href="http://hadoop.apache.org/pig/releases.html">Release Notes</a>
+</div>
+</div>
+<div id="credit"></div>
+<div id="roundbottom">
+<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
+<!--+
+  |alternative credits
+  +-->
+<div id="credit2"></div>
+</div>
+<!--+
+    |end Menu
+    +-->
+<!--+
+    |start content
+    +-->
+<div id="content">
+<div title="Portable Document Format" class="pdflink">
+<a class="dida" href="test.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
+        PDF</a>
+</div>
+<h1>Testing and Diagnostics</h1>
+<div id="front-matter">
+<div id="minitoc-area">
+<ul class="minitoc">
+<li>
+<a href="#diagnostic-ops">Diagnostic Operators</a>
+<ul class="minitoc">
+<li>
+<a href="#describe">DESCRIBE</a>
+</li>
+<li>
+<a href="#dump">DUMP</a>
+</li>
+<li>
+<a href="#explain">EXPLAIN</a>
+</li>
+<li>
+<a href="#illustrate">ILLUSTRATE</a>
+</li>
+</ul>
+</li>
+<li>
+<a href="#mapreduce-job-ids">Pig Scripts and MapReduce Job IDs</a>
+</li>
+<li>
+<a href="#pig-statistics">Pig Statistics</a>
+<ul class="minitoc">
+<li>
+<a href="#Java+API">Java API</a>
+</li>
+<li>
+<a href="#Job+XML">Job XML</a>
+</li>
+<li>
+<a href="#hadoop-job-history-loader">Hadoop Job History Loader</a>
+</li>
+<li>
+<a href="#Examples">Examples</a>
+</li>
+</ul>
+</li>
+<li>
+<a href="#ppnl">Pig Progress Notification Listener</a>
+</li>
+<li>
+<a href="#pigunit">PigUnit</a>
+<ul class="minitoc">
+<li>
+<a href="#Build+PigUnit">Build PigUnit</a>
+</li>
+<li>
+<a href="#Run+PigUnit">Run PigUnit</a>
+</li>
+<li>
+<a href="#PigUnit+Example">PigUnit Example</a>
+</li>
+<li>
+<a href="#Troubleshooting+Tips">Troubleshooting Tips</a>
+</li>
+<li>
+<a href="#Future+Enhancements">Future Enhancements</a>
+</li>
+</ul>
+</li>
+</ul>
+</div>
+</div>
+
+<!-- =========================================================================== -->
+<!-- DIAGNOSTIC OPERATORS -->    
+
+<a name="diagnostic-ops"></a>
+<h2 class="h3">Diagnostic Operators</h2>
+<div class="section">
+<a name="describe"></a>
+<h3 class="h4">DESCRIBE</h3>
+<p>Returns the schema of a relation.</p>
+<a name="Syntax"></a>
+<h4>Syntax</h4>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+      
+<tr> 
+            
+<td colspan="1" rowspan="1">
+               
+<p>DESCRIBE alias;&nbsp; &nbsp; &nbsp; &nbsp; </p>
+            
+</td>
+         
+</tr> 
+   
+</table>
+<a name="Terms"></a>
+<h4>Terms</h4>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+      
+<tr>
+            
+<td colspan="1" rowspan="1">
+               
+<p>alias</p>
+            
+</td>
+            <td colspan="1" rowspan="1">
+               
+<p>The name of a relation.</p>
+            
+</td>
+         
+</tr> 
+   
+</table>
+<a name="Usage"></a>
+<h4>Usage</h4>
+<p>Use the DESCRIBE operator to view the schema of a relation. 
+   You can view outer relations as well as relations defined in a nested FOREACH statement.</p>
+<a name="Example"></a>
+<h4>Example</h4>
+<p>In this example a schema is specified using the AS clause. If all data conforms to the schema, Pig will use the assigned types.</p>
+<pre class="code">
+A = LOAD 'student' AS (name:chararray, age:int, gpa:float);
+
+B = FILTER A BY name matches 'J.+';
+
+C = GROUP B BY name;
+
+D = FOREACH C GENERATE COUNT(B.age);
+
+DESCRIBE A;
+A: {name: chararray,age: int,gpa: float}
+
+DESCRIBE B;
+B: {name: chararray,age: int,gpa: float}
+
+DESCRIBE C;
+C: {group: chararray,B: {(name: chararray,age: int,gpa: float)}}
+
+DESCRIBE D;
+D: {long}
+</pre>
+<p>In this example no schema is specified. All fields default to type bytearray, and the COUNT output in d is of type long (see Data Types).</p>
+<pre class="code">
+a = LOAD 'student';
+
+b = FILTER a BY $0 matches 'J.+';
+
+c = GROUP b BY $0;
+
+d = FOREACH c GENERATE COUNT(b.$1);
+
+DESCRIBE a;
+Schema for a unknown.
+
+DESCRIBE b;
+2008-12-05 01:17:15,316 [main] WARN  org.apache.pig.PigServer - bytearray is implicitly cast to chararray under LORegexp Operator
+Schema for b unknown.
+
+DESCRIBE c;
+2008-12-05 01:17:23,343 [main] WARN  org.apache.pig.PigServer - bytearray is implicitly cast to chararray under LORegexp Operator
+c: {group: bytearray,b: {null}}
+
+DESCRIBE d;
+2008-12-05 03:04:30,076 [main] WARN  org.apache.pig.PigServer - bytearray is implicitly cast to chararray under LORegexp Operator
+d: {long}
+</pre>
+<p>This example shows how to view the schema of a nested relation using the :: operator.</p>
+<pre class="code">
+A = LOAD 'studentab10k' AS (name, age, gpa); 
+B = GROUP A BY name; 
+C = FOREACH B { 
+     D = DISTINCT A.age; 
+     GENERATE COUNT(D), group;} 
+
+DESCRIBE C::D; 
+D: {age: bytearray} 
+</pre>
+<a name="dump"></a>
+<h3 class="h4">DUMP</h3>
+<p>Dumps or displays results to screen.</p>
+<a name="Syntax-N10084"></a>
+<h4>Syntax</h4>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+      
+<tr> 
+            
+<td colspan="1" rowspan="1">
+               
+<p>DUMP alias;&nbsp; &nbsp; &nbsp; &nbsp; </p>
+            
+</td>
+         
+</tr> 
+   
+</table>
+<a name="Terms-N10098"></a>
+<h4>Terms</h4>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+      
+<tr>
+            
+<td colspan="1" rowspan="1">
+               
+<p>alias</p>
+            
+</td>
+            <td colspan="1" rowspan="1">
+               
+<p>The name of a relation.</p>
+            
+</td>
+         
+</tr> 
+   
+</table>
+<a name="Usage-N100B4"></a>
+<h4>Usage</h4>
+<p>Use the DUMP operator to run (execute) Pig Latin statements and display the results to your screen. DUMP is meant for interactive mode; statements are executed immediately and the results are not saved (persisted). You can use DUMP as a debugging device to make sure that the results you are expecting are actually generated. </p>
+<p>
+   Note that production scripts SHOULD NOT use DUMP as it will disable multi-query optimizations and is likely to slow down execution 
+   (see <a href="perf.html#Store-Dump">Store vs. Dump</a>).
+   </p>
+<a name="Example-N100C5"></a>
+<h4>Example</h4>
+<p>In this example a dump is performed after each statement.</p>
+<pre class="code">
+A = LOAD 'student' AS (name:chararray, age:int, gpa:float);
+
+DUMP A;
+(John,18,4.0F)
+(Mary,19,3.7F)
+(Bill,20,3.9F)
+(Joe,22,3.8F)
+(Jill,20,4.0F)
+
+B = FILTER A BY name matches 'J.+';
+
+DUMP B;
+(John,18,4.0F)
+(Joe,22,3.8F)
+(Jill,20,4.0F)
+</pre>
+<a name="explain"></a>
+<h3 class="h4">EXPLAIN</h3>
+<p>Displays execution plans.</p>
+<a name="Syntax-N100DE"></a>
+<h4>Syntax</h4>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+      
+<tr> 
+            
+<td colspan="1" rowspan="1">
+               
+<p>EXPLAIN [-script pigscript] [-out path] [-brief] [-dot] [-param param_name = param_value] [-param_file file_name] alias;&nbsp;</p>
+            
+</td>
+         
+</tr> 
+   
+</table>
+<a name="Terms-N100F2"></a>
+<h4>Terms</h4>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+    
+         
+<tr>
+            
+<td colspan="1" rowspan="1">
+               
+<p>-script</p>
+            
+</td>
+            <td colspan="1" rowspan="1">
+               
+<p>Use to specify a Pig script.</p>
+            
+</td>
+         
+</tr>      
+
+         
+<tr>
+            
+<td colspan="1" rowspan="1">
+               
+<p>-out</p>
+            
+</td>
+            <td colspan="1" rowspan="1">
+               
+<p>Use to specify the output path (directory).</p>
+               
+<p>Will generate a logical_plan[.txt|.dot], physical_plan[.text|.dot], exec_plan[.text|.dot] file in the specified path.</p>
+               
+<p>Default (no path specified): Stdout </p>
+            
+</td>
+         
+</tr>
+
+         
+<tr>
+            
+<td colspan="1" rowspan="1">
+               
+<p>-brief</p>
+            
+</td>
+            <td colspan="1" rowspan="1">
+               
+<p>Does not expand nested plans (presenting a smaller graph for overview). </p>
+            
+</td>
+         
+</tr>
+         
+         
+<tr>
+            
+<td colspan="1" rowspan="1">
+               
+<p>-dot</p>
+            
+</td>
+            <td colspan="1" rowspan="1">
+
+               
+<p>Text mode (default): multiple outputs (splits) are broken out into sections.  </p>
+               
+<p>Dot mode: outputs a format that can be passed to the dot utility for graphical display &ndash; 
+               will generate a directed-acyclic-graph (DAG) of the plans in any supported format (.gif, .jpg ...).</p>
+            
+</td>
+         
+</tr>
+
+         
+<tr>
+            
+<td colspan="1" rowspan="1">
+               
+<p>-param param_name = param_value</p>
+            
+</td>
+            <td colspan="1" rowspan="1">
+               
+<p>See <a href="cont.html#Parameter-Sub">Parameter Substitution</a>.</p>
+            
+</td>
+         
+</tr>
+
+         
+<tr>
+            
+<td colspan="1" rowspan="1">
+               
+<p>-param_file file_name</p>
+            
+</td>
+            <td colspan="1" rowspan="1">
+               
+<p>See <a href="cont.html#Parameter-Sub">Parameter Substitution</a>. </p>
+            
+</td>
+         
+</tr>
+      
+      
+<tr>
+            
+<td colspan="1" rowspan="1">
+               
+<p>alias</p>
+            
+</td>
+            <td colspan="1" rowspan="1">
+               
+<p>The name of a relation.</p>
+            
+</td>
+         
+</tr>
+   
+</table>
+<a name="execution-plans"></a>
+<h4>Usage</h4>
+<p>Use the EXPLAIN operator to review the logical, physical, and map reduce execution plans that are used to compute the specified relation. </p>
+<p>If no script is given:</p>
+<ul>	
+      
+<a name="logical-plan"></a>
+<li id="logical-plan">
+         
+<p>The logical plan shows a pipeline of operators to be executed to build the relation. Type checking and backend-independent optimizations (such as applying filters early on) also apply.</p>
+      
+</li>
+      
+<a name="physical-plan"></a>
+<li id="physical-plan">
+         
+<p>The physical plan shows how the logical operators are translated to backend-specific physical operators. Some backend optimizations also apply.</p>
+      
+</li>
+      
+<a name="mapreduce-plan"></a>
+<li id="mapreduce-plan">
+         
+<p>The mapreduce plan shows how the physical operators are grouped into map reduce jobs.</p>
+      
+</li>
+  
+</ul>
+<p></p>
+<p>If a script without an alias is specified, it will output the entire execution graph (logical, physical, and map reduce). </p>
+<p>If a script with an alias is specified, it will output the plan for the given alias. </p>
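+<p>For instance, the flags can be combined to write a graphical plan to disk. The following is a sketch only; myscript.pig and the alias D are hypothetical placeholders:</p>
+<pre class="code">
+grunt&gt; EXPLAIN -script myscript.pig -out /tmp/plans -dot D;
+</pre>
+<p>The generated .dot files can then be rendered with the dot utility (for example, dot -Tgif /tmp/plans/exec_plan.dot).</p>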
+<a name="Example-N101BE"></a>
+<h4>Example</h4>
+<p>In this example the EXPLAIN operator produces all three plans. (Note that only a portion of the output is shown in this example.)</p>
+<pre class="code">
+A = LOAD 'student' AS (name:chararray, age:int, gpa:float);
+
+B = GROUP A BY name;
+
+C = FOREACH B GENERATE COUNT(A.age);
+
+EXPLAIN C;
+-----------------------------------------------
+Logical Plan:
+-----------------------------------------------
+Store xxx-Fri Dec 05 19:42:29 UTC 2008-23 Schema: {long} Type: Unknown
+|
+|---ForEach xxx-Fri Dec 05 19:42:29 UTC 2008-15 Schema: {long} Type: bag
+ <em>etc ... </em> 
+
+-----------------------------------------------
+Physical Plan:
+-----------------------------------------------
+Store(fakefile:org.apache.pig.builtin.PigStorage) - xxx-Fri Dec 05 19:42:29 UTC 2008-40
+|
+|---New For Each(false)[bag] - xxx-Fri Dec 05 19:42:29 UTC 2008-39
+    |   |
+    |   POUserFunc(org.apache.pig.builtin.COUNT)[long] - xxx-Fri Dec 05 
+ <em>etc ... </em> 
+
+--------------------------------------------------
+| Map Reduce Plan                               
+-------------------------------------------------
+MapReduce node xxx-Fri Dec 05 19:42:29 UTC 2008-41
+Map Plan
+Local Rearrange[tuple]{chararray}(false) - xxx-Fri Dec 05 19:42:29 UTC 2008-34
+|   |
+|   Project[chararray][0] - xxx-Fri Dec 05 19:42:29 UTC 2008-35
+ <em>etc ... </em> 
+
+
+</pre>
+<a name="illustrate"></a>
+<h3 class="h4">ILLUSTRATE</h3>
+<p>Displays a step-by-step execution of a sequence of statements.</p>
+<a name="Syntax-N101E0"></a>
+<h4>Syntax</h4>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+      
+<tr> 
+            
+<td colspan="1" rowspan="1">
+               
+<p>ILLUSTRATE {alias | -script scriptfile};&nbsp;</p>
+            
+</td>
+         
+</tr> 
+   
+</table>
+<a name="Terms-N101F4"></a>
+<h4>Terms</h4>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+      
+<tr>
+            
+<td colspan="1" rowspan="1">
+               
+<p>alias</p>
+            
+</td>
+            <td colspan="1" rowspan="1">
+               
+<p>The name of a relation.</p>
+            
+</td>
+         
+</tr> 
+            
+      
+<tr>
+            
+<td colspan="1" rowspan="1">
+               
+<p>-script scriptfile</p>
+            
+</td>
+            <td colspan="1" rowspan="1">
+               
+<p>The script keyword followed by the name of a Pig script (for example, myscript.pig). </p>
+               
+<p>The script file should not contain an ILLUSTRATE statement.</p>
+            
+</td>
+         
+</tr> 
+   
+</table>
+<a name="Usage-N10226"></a>
+<h4>Usage</h4>
+<p>Use the ILLUSTRATE operator to review how data is transformed through a sequence of Pig Latin statements.
+   ILLUSTRATE allows you to test your programs on small datasets and get faster turnaround times. </p>
+<a name="example-generator"></a>
+<p id="example-generator">ILLUSTRATE is based on an example generator 
+(see <a href="http://research.yahoo.com/files/paper_5.pdf">Generating Example Data for Dataflow Programs</a>).
+
+The algorithm works by retrieving a small sample of the input data and then propagating this data through the pipeline. However, some operators, such as JOIN and FILTER, can eliminate tuples from the data, and this could result in no data flowing through the pipeline. To address this issue, the algorithm automatically generates example data, in near real-time. Thus, you might see data propagating through the pipeline that was not found in the original input data; this generated data preserves the semantics of your program and ensures that you can examine what each Pig Latin statement does.</p>
+<p>As shown in the examples below, you can use ILLUSTRATE to review a relation or an entire Pig script.</p>
+<a name="Example+-+Relation"></a>
+<h4>Example - Relation</h4>
+<p>This example demonstrates how to use ILLUSTRATE with a relation. Note that the LOAD statement must include a schema (the AS clause).</p>
+<pre class="code">
+grunt&gt; visits = LOAD 'visits.txt' AS (user:chararray, url:chararray, timestamp:chararray);
+grunt&gt; DUMP visits;
+
+(Amy,yahoo.com,19990421)
+(Fred,harvard.edu,19991104)
+(Amy,cnn.com,20070218)
+(Frank,nba.com,20070305)
+(Fred,berkeley.edu,20071204)
+(Fred,stanford.edu,20071206)
+
+grunt&gt; recent_visits = FILTER visits BY timestamp &gt;= '20071201';
+grunt&gt; user_visits = GROUP recent_visits BY user;
+grunt&gt; num_user_visits = FOREACH user_visits GENERATE group, COUNT(recent_visits);
+grunt&gt; DUMP num_user_visits;
+
+(Fred,2)
+
+grunt&gt; ILLUSTRATE num_user_visits;
+------------------------------------------------------------------------
+| visits     | user: chararray | url: chararray | timestamp: chararray |
+------------------------------------------------------------------------
+|            | Fred            | berkeley.edu   | 20071204             |
+|            | Fred            | stanford.edu   | 20071206             |
+|            | Frank           | nba.com        | 20070305             |
+------------------------------------------------------------------------
+-------------------------------------------------------------------------------
+| recent_visits     | user: chararray | url: chararray | timestamp: chararray |
+-------------------------------------------------------------------------------
+|                   | Fred            | berkeley.edu   | 20071204             |
+|                   | Fred            | stanford.edu   | 20071206             |
+-------------------------------------------------------------------------------
+------------------------------------------------------------------------------------------------------------------
+| user_visits     | group: chararray | recent_visits: bag({user: chararray,url: chararray,timestamp: chararray}) |
+------------------------------------------------------------------------------------------------------------------
+|                 | Fred             | {(Fred, berkeley.edu, 20071204), (Fred, stanford.edu, 20071206)}          |
+------------------------------------------------------------------------------------------------------------------
+--------------------------------------------------
+| num_user_visits     | group: chararray | long  |
+--------------------------------------------------
+|                     | Fred             | 2     |
+--------------------------------------------------
+</pre>
+<a name="Example+-+Script"></a>
+<h4>Example - Script</h4>
+<p>This example demonstrates how to use ILLUSTRATE with a Pig script. Note that the script itself should not contain an ILLUSTRATE statement.</p>
+<pre class="code">
+grunt&gt; cat visits.txt
+Amy     yahoo.com       19990421
+Fred    harvard.edu     19991104
+Amy     cnn.com 20070218
+Frank   nba.com 20070305
+Fred    berkeley.edu    20071204
+Fred    stanford.edu    20071206
+
+grunt&gt; cat visits.pig
+visits = LOAD 'visits.txt' AS (user, url, timestamp);
+recent_visits = FILTER visits BY timestamp &gt;= '20071201';
+historical_visits = FILTER visits BY timestamp &lt;= '20000101';
+DUMP recent_visits;
+DUMP historical_visits;
+STORE recent_visits INTO 'recent';
+STORE historical_visits INTO 'historical';
+
+grunt&gt; exec visits.pig
+
+(Fred,berkeley.edu,20071204)
+(Fred,stanford.edu,20071206)
+
+(Amy,yahoo.com,19990421)
+(Fred,harvard.edu,19991104)
+
+
+grunt&gt; illustrate -script visits.pig
+
+------------------------------------------------------------------------
+| visits     | user: bytearray | url: bytearray | timestamp: bytearray |
+------------------------------------------------------------------------
+|            | Amy             | yahoo.com      | 19990421             |
+|            | Fred            | stanford.edu   | 20071206             |
+------------------------------------------------------------------------
+-------------------------------------------------------------------------------
+| recent_visits     | user: bytearray | url: bytearray | timestamp: bytearray |
+-------------------------------------------------------------------------------
+|                   | Fred            | stanford.edu   | 20071206             |
+-------------------------------------------------------------------------------
+---------------------------------------------------------------------------------------
+| Store : recent_visits     | user: bytearray | url: bytearray | timestamp: bytearray |
+---------------------------------------------------------------------------------------
+|                           | Fred            | stanford.edu   | 20071206             |
+---------------------------------------------------------------------------------------
+-----------------------------------------------------------------------------------
+| historical_visits     | user: bytearray | url: bytearray | timestamp: bytearray |
+-----------------------------------------------------------------------------------
+|                       | Amy             | yahoo.com      | 19990421             |
+-----------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------
+| Store : historical_visits     | user: bytearray | url: bytearray | timestamp: bytearray |
+-------------------------------------------------------------------------------------------
+|                               | Amy             | yahoo.com      | 19990421             |
+-------------------------------------------------------------------------------------------
+</pre>
+</div>
+
+<!-- =========================================================================== -->
+<!-- DIAGNOSTIC OPERATORS -->    
+
+<a name="mapreduce-job-ids"></a>
+<h2 class="h3">Pig Scripts and MapReduce Job IDs</h2>
+<div class="section">
+<p>Complex Pig scripts often generate many MapReduce jobs. To help you debug a script, Pig prints a summary of the execution that shows which relations (aliases) are mapped to each MapReduce job. </p>
+<pre class="code">
+JobId Maps Reduces MaxMapTime MinMapTIme AvgMapTime MaxReduceTime 
+    MinReduceTime AvgReduceTime Alias Feature Outputs
+job_201004271216_12712 1 1 3 3 3 12 12 12 B,C GROUP_BY,COMBINER
+job_201004271216_12713 1 1 3 3 3 12 12 12 D SAMPLER
+job_201004271216_12714 1 1 3 3 3 12 12 12 D ORDER_BY,COMBINER 
+    hdfs://mymachine.com:9020/tmp/temp743703298/tmp-2019944040,
+</pre>
+</div>
+
+<!-- ==================================================================== -->
+<!-- PIG STATISTICS-->
+
+<a name="pig-statistics"></a>
+<h2 class="h3">Pig Statistics</h2>
+<div class="section">
+<p>Pig Statistics is a framework for collecting and storing script-level statistics for Pig Latin. Characteristics of Pig Latin scripts and the resulting MapReduce jobs are collected while the script is executed. These statistics are then available for Pig users and tools using Pig (such as Oozie) to retrieve after the job is done.</p>
+<p>The new Pig statistics and the existing Hadoop statistics can also be accessed via the Hadoop job history file (and job xml file). Piggybank has a HadoopJobHistoryLoader which acts as an example of using Pig itself to query these statistics (the loader can be used as a reference implementation but is NOT supported for production use).</p>
+<a name="Java+API"></a>
+<h3 class="h4">Java API</h3>
+<p>Several new public classes make it easier for external tools such as Oozie to integrate with Pig statistics. </p>
+<p>The Pig statistics are available here: <a href="http://pig.apache.org/docs/r0.9.0/api/">http://pig.apache.org/docs/r0.9.0/api/</a>
+</p>
+<a name="stats-classes"></a>
+<p id="stats-classes">The stats classes are in the package: org.apache.pig.tools.pigstats</p>
+<ul>
+
+<li>PigStats</li>
+
+<li>JobStats</li>
+
+<li>OutputStats</li>
+
+<li>InputStats</li>
+
+</ul>
+<p></p>
+<p>The PigRunner class mimics the behavior of the Main class but gives users a statistics object back. Optionally, you can call the API with an implementation of a progress listener, which will be invoked by the Pig runtime during execution. </p>
+<pre class="code">
+package org.apache.pig;
+
+public abstract class PigRunner {
+    public static PigStats run(String[] args, PigProgressNotificationListener listener)
+}
+
+public interface PigProgressNotificationListener extends java.util.EventListener {
+    // just before the launch of MR jobs for the script
+    public void launchStartedNotification(int numJobsToLaunch);
+    // number of jobs submitted in a batch
+    public void jobsSubmittedNotification(int numJobsSubmitted);
+    // a job is started
+    public void jobStartedNotification(String assignedJobId);
+    // a job is completed successfully
+    public void jobFinishedNotification(JobStats jobStats);
+    // a job has failed
+    public void jobFailedNotification(JobStats jobStats);
+    // a user output is completed successfully
+    public void outputCompletedNotification(OutputStats outputStats);
+    // updates the progress as percentage
+    public void progressUpdatedNotification(int progress);
+    // the script execution is done
+    public void launchCompletedNotification(int numJobsSucceeded);
+}
+</pre>
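+<p>As a sketch of how an external tool might drive Pig through this API (the class StatsDemo and the script myscript.pig are hypothetical; a null listener simply skips progress notifications):</p>
+<pre class="code">
+import org.apache.pig.PigRunner;
+import org.apache.pig.tools.pigstats.PigStats;
+
+public class StatsDemo {
+    public static void main(String[] ignored) {
+        // run a hypothetical script in local mode; null means no progress listener
+        String[] args = { "-x", "local", "myscript.pig" };
+        PigStats stats = PigRunner.run(args, null);
+        // the returned statistics object reports overall success and duration
+        System.out.println("succeeded: " + stats.isSuccessful());
+        System.out.println("duration (ms): " + stats.getDuration());
+    }
+}
+</pre>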
+<a name="Job+XML"></a>
+<h3 class="h4">Job XML</h3>
+<p>The following entries are included in job conf: </p>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+
+<tr>
+
+<td colspan="1" rowspan="1">
+
+<p> 
+<strong>Pig Statistic</strong> 
+</p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p> 
+<strong>Description</strong>
+</p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+<a name="pig-script-id"></a>
+<p id="pig-script-id">pig.script.id</p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p>The UUID for the script. All jobs spawned by the script have the same script ID.</p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+<a name="pig-script"></a>
+<p id="pig-script">pig.script</p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p>The base64 encoded script text.</p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+<a name="pig-command-line"></a>
+<p id="pig-command-line">pig.command.line</p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p>The command line used to invoke the script.</p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+<a name="pig-hadoop-version"></a>
+<p id="pig-hadoop-version">pig.hadoop.version</p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p>The Hadoop version installed.</p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+<a name="pig-version"></a>
+<p id="pig-version">pig.version</p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p>The Pig version used.</p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+<a name="pig-input-dirs"></a>
+<p id="pig-input-dirs">pig.input.dirs</p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p>A comma-separated list of input directories for the job.</p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+<a name="pig-map-output-dirs"></a>
+<p id="pig-map-output-dirs">pig.map.output.dirs</p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p>A comma-separated list of output directories in the map phase of the job.</p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+<a name="pig-reduce-output-dirs"></a>
+<p id="pig-reduce-output-dirs">pig.reduce.output.dirs</p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p>A comma-separated list of output directories in the reduce phase of the job.</p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+<a name="pig-parent-jobid"></a>
+<p id="pig-parent-jobid">pig.parent.jobid</p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p>A comma-separated list of parent job ids.</p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+<a name="pig-script-features"></a>
+<p id="pig-script-features">pig.script.features</p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p>A list of Pig features used in the script.</p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+<a name="pig-job-feature"></a>
+<p id="pig-job-feature">pig.job.feature</p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p>A list of Pig features used in the job.</p>
+
+</td>
+
+</tr>
+
+<tr>
+
+<td colspan="1" rowspan="1">
+<a name="pig-alias"></a>
+<p id="pig-alias">pig.alias</p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p>The alias associated with the job.</p>
+
+</td>
+
+</tr>
+
+</table>
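+<p>As an illustration, a tool with access to a job configuration file can read these entries directly. This is a minimal sketch, assuming a job conf XML file is available locally (the file name below is a placeholder):</p>
+<pre class="code">
+import org.apache.commons.codec.binary.Base64;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+
+public class JobConfReader {
+    public static void main(String[] args) {
+        // load only the given job conf file, not the cluster defaults
+        Configuration conf = new Configuration(false);
+        conf.addResource(new Path("job_201004271216_12712_conf.xml")); // placeholder name
+        System.out.println("script id: " + conf.get("pig.script.id"));
+        System.out.println("aliases:   " + conf.get("pig.alias"));
+        // pig.script is base64 encoded; decode it to recover the script text
+        String encoded = conf.get("pig.script");
+        if (encoded != null) {
+            System.out.println(new String(Base64.decodeBase64(encoded.getBytes())));
+        }
+    }
+}
+</pre>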
+<a name="hadoop-job-history-loader"></a>
+<h3 class="h4">Hadoop Job History Loader</h3>
+<p>The HadoopJobHistoryLoader in Piggybank loads Hadoop job history files and job xml files from the file system. For each MapReduce job, the loader produces a tuple with schema (j:map[], m:map[], r:map[]). The first map in the schema contains job-related entries. Here are some of the important key names in that map: </p>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+
+<tr>
+
+<td colspan="1" rowspan="1">
+
+<p>PIG_SCRIPT_ID</p>
+
+<p>CLUSTER </p>
+
+<p>QUEUE_NAME</p>
+
+<p>JOBID</p>
+
+<p>JOBNAME</p>
+
+<p>STATUS</p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p>USER </p>
+
+<p>HADOOP_VERSION  </p>
+
+<p>PIG_VERSION</p>
+
+<p>PIG_JOB_FEATURE</p>
+
+<p>PIG_JOB_ALIAS </p>
+
+<p>PIG_JOB_PARENTS</p>
+
+</td>
+<td colspan="1" rowspan="1">
+
+<p>SUBMIT_TIME</p>
+
+<p>LAUNCH_TIME</p>
+
+<p>FINISH_TIME</p>
+
+<p>TOTAL_MAPS</p>
+
+<p>TOTAL_REDUCES</p>
+
+</td>
+
+</tr>
+
+</table>
+<p></p>
+<p>Examples that use the loader to query Pig statistics are shown below.</p>
+<a name="Examples"></a>
+<h3 class="h4">Examples</h3>
+<p>Find scripts that generate more than three MapReduce jobs:</p>
+<pre class="code">
+a = load '/mapred/history/done' using HadoopJobHistoryLoader() as (j:map[], m:map[], r:map[]);
+b = group a by (j#'PIG_SCRIPT_ID', j#'USER', j#'JOBNAME');
+c = foreach b generate group.$1, group.$2, COUNT(a);
+d = filter c by $2 &gt; 3;
+dump d;
+</pre>
+<p>Find the running time of each script (in seconds): </p>
+<pre class="code">
+a = load '/mapred/history/done' using HadoopJobHistoryLoader() as (j:map[], m:map[], r:map[]);
+b = foreach a generate j#'PIG_SCRIPT_ID' as id, j#'USER' as user, j#'JOBNAME' as script_name, 
+         (Long) j#'SUBMIT_TIME' as start, (Long) j#'FINISH_TIME' as end;
+c = group b by (id, user, script_name);
+d = foreach c generate group.user, group.script_name, (MAX(b.end) - MIN(b.start)) / 1000;
+dump d;
+</pre>
+<p>Find the number of scripts run by user and queue on a cluster: </p>
+<pre class="code">
+a = load '/mapred/history/done' using HadoopJobHistoryLoader() as (j:map[], m:map[], r:map[]);
+b = foreach a generate j#'PIG_SCRIPT_ID' as id, j#'USER' as user, j#'QUEUE_NAME' as queue;
+c = group b by (id, user, queue) parallel 10;
+d = foreach c generate group.user, group.queue, COUNT(b);
+dump d;
+</pre>
+<p>Find scripts that have failed jobs: </p>
+<pre class="code">
+a = load '/mapred/history/done' using HadoopJobHistoryLoader() as (j:map[], m:map[], r:map[]);
+b = foreach a generate (Chararray) j#'STATUS' as status, j#'PIG_SCRIPT_ID' as id, j#'USER' as user, j#'JOBNAME' as script_name, j#'JOBID' as job;
+c = filter b by status != 'SUCCESS';
+dump c;
+</pre>
+<p>Find scripts that use only the default parallelism: </p>
+<pre class="code">
+a = load '/mapred/history/done' using HadoopJobHistoryLoader() as (j:map[], m:map[], r:map[]);
+b = foreach a generate j#'PIG_SCRIPT_ID' as id, j#'USER' as user, j#'JOBNAME' as script_name, (Long) r#'NUMBER_REDUCES' as reduces;
+c = group b by (id, user, script_name) parallel 10;
+d = foreach c generate group.user, group.script_name, MAX(b.reduces) as max_reduces;
+e = filter d by max_reduces == 1;
+dump e;
+</pre>
+</div>   
+
+<!-- =========================================================================== -->
+<!-- PIG PROGRESS NOTIFICATION LISTENER -->
+
+<a name="ppnl"></a>
+<h2 class="h3">Pig Progress Notification Listener</h2>
+<div class="section">
+<p>
+Pig provides the ability to register a listener to receive event notifications during the
+execution of a script. Events include MapReduce plan creation, script launch, script progress,
+script completion, job submit, job start, job completion and job failure.
+</p>
+<p>To register a listener, set the pig.notification.listener parameter
+to the fully qualified class name of an implementation of
+<a href="http://svn.apache.org/repos/asf/pig/trunk/src/org/apache/pig/tools/pigstats/PigProgressNotificationListener.java">org.apache.pig.tools.pigstats.PigProgressNotificationListener</a>.
+The class must exist on the classpath of the process submitting the Pig job. If the
+pig.notification.listener.arg parameter is set, the value will be passed to a constructor
+of the implementing class that takes a single String.
+ </p>
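+<p>
+For example, a listener could be registered when launching a script from the command line
+by passing the properties through PIG_OPTS. This is a sketch; com.example.MyListener, its
+argument, and myscript.pig are placeholders:
+</p>
+<pre class="code">
+export PIG_OPTS="-Dpig.notification.listener=com.example.MyListener -Dpig.notification.listener.arg=myArg"
+pig myscript.pig
+</pre>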
+</div>
+
+<!-- =========================================================================== -->
+<!-- PIGUNIT -->    
+
+  
+<a name="pigunit"></a>
+<h2 class="h3">PigUnit</h2>
+<div class="section">
+<p>PigUnit is a simple xUnit framework that enables you to easily test your Pig scripts.
+        With PigUnit you can perform unit testing, regression testing, and rapid prototyping. 
+        No cluster setup is required if you run Pig in local mode.
+      </p>
+<a name="Build+PigUnit"></a>
+<h3 class="h4">Build PigUnit</h3>
+<p>To compile PigUnit, run the command shown below from the Pig trunk. The build will create the pigunit.jar file.</p>
+<pre class="code">
+$pig_trunk ant pigunit-jar   
+</pre>
+<a name="Run+PigUnit"></a>
+<h3 class="h4">Run PigUnit</h3>
+<p>You can run PigUnit using Pig's local mode or mapreduce mode.</p>
+<a name="Local+Mode"></a>
+<h4>Local Mode</h4>
+<p>
+        PigUnit runs in Pig's local mode by default.
+        Local mode is fast and lets you use your local file system in place of an HDFS cluster.
+        It does not require a real cluster; a new local execution environment is created for each run. 
+      </p>
+<a name="Mapreduce+Mode"></a>
+<h4>Mapreduce Mode</h4>
+<p>PigUnit also runs in Pig's mapreduce mode. Mapreduce mode requires you to use a Hadoop cluster and HDFS installation.
+        It is enabled when the Java system property pigunit.exectype.cluster is set to any value, for example -Dpigunit.exectype.cluster=true or System.getProperties().setProperty("pigunit.exectype.cluster", "true"). The configuration of the cluster you select must be on the CLASSPATH (similar to the HADOOP_CONF_DIR variable), as shown in the sketch below. 
+      </p>
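+<p>For instance, a cluster-mode run might look like the following sketch; the test class TestTopQueries is hypothetical, and HADOOP_CONF_DIR must point at your cluster configuration:</p>
+<pre class="code">
+java -Dpigunit.exectype.cluster=true \
+     -cp $HADOOP_CONF_DIR:pig.jar:pigunit.jar:junit.jar:. \
+     org.junit.runner.JUnitCore TestTopQueries
+</pre>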
+<a name="PigUnit+Example"></a>
+<h3 class="h4">PigUnit Example</h3>
+<p>
+        Many PigUnit examples are available in the
+        <a href="http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/pigunit/TestPigTest.java">PigUnit tests</a>. 
+      </p>
+<p>The example included here computes the top N of the most common queries. 
+        The Pig script, top_queries.pig, is similar to the 
+        <a href="start.html#Pig-Script-1">Query Phrase Popularity</a> 
+        in the Pig tutorial. It expects as input a file of queries and a parameter n (n is 2 in our case, in order to compute a top 2). 
+      </p>
+<p>Setting up a test for this script is easy because the arguments and the input data are
+        specified as text arrays. The same holds for the expected output of the
+        script, which is compared to the actual result of executing the Pig script. 
+      </p>
+<a name="Java+Test"></a>
+<h4>Java Test</h4>
+<pre class="code">
+  @Test
+  public void testTop2Queries() {
+    String[] args = {
+        "n=2",
+        };
+ 
+    PigTest test = new PigTest("top_queries.pig", args);
+ 
+    String[] input = {
+        "yahoo",
+        "yahoo",
+        "yahoo",
+        "twitter",
+        "facebook",
+        "facebook",
+        "linkedin",
+    };
+ 
+    String[] output = {
+        "(yahoo,3)",
+        "(facebook,2)",
+    };
+ 
+    test.assertOutput("data", input, "queries_limit", output);
+  }
+</pre>
+<a name="top_queries.pig"></a>
+<h4>top_queries.pig</h4>
+<pre class="code">
+data =
+    LOAD 'input'
+    AS (query:CHARARRAY);
+     
+queries_group =
+    GROUP data
+    BY query; 
+    
+queries_count = 
+    FOREACH queries_group 
+    GENERATE 
+        group AS query, 
+        COUNT(data) AS total;
+        
+queries_ordered =
+    ORDER queries_count
+    BY total DESC, query;
+            
+queries_limit =
+    LIMIT queries_ordered $n;
+
+STORE queries_limit INTO 'output';
+</pre>
+<a name="Run"></a>
+<h4>Run</h4>
+<p>The test can be executed by JUnit (or any other Java testing framework). It requires the following jars on the classpath (a sample invocation is sketched after the list):
+        </p>
+<ol>
+          
+<li>pig.jar</li>
+          
+<li>pigunit.jar</li>
+        
+</ol>
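+<p>A sample local-mode invocation (the test class TestTopQueries is a placeholder for your own test class):</p>
+<pre class="code">
+java -cp .:pig.jar:pigunit.jar:junit.jar org.junit.runner.JUnitCore TestTopQueries
+</pre>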
+<p>The test takes about 25 seconds to run and should pass. In case of error (for example, if you
+          change the parameter n to n=3), a diff of the output is displayed:
+        </p>
+<pre class="code">
+junit.framework.ComparisonFailure: null expected:&lt;...ahoo,3)
+(facebook,2)[]&gt; but was:&lt;...ahoo,3)
+(facebook,2)[
+(linkedin,1)]&gt;
+        at junit.framework.Assert.assertEquals(Assert.java:81)
+        at junit.framework.Assert.assertEquals(Assert.java:87)
+        at org.apache.pig.pigunit.PigTest.assertEquals(PigTest.java:272)
+</pre>
+<a name="Troubleshooting+Tips"></a>
+<h3 class="h4">Troubleshooting Tips</h3>
+<p>Common problems you may encounter are discussed below.</p>
+<a name="Classpath+in+Mapreduce+Mode"></a>
+<h4>Classpath in Mapreduce Mode</h4>
+<p>When using PigUnit in mapreduce mode, be sure to include the $HADOOP_CONF_DIR of the
+          cluster in your CLASSPATH.</p>
+<p>
+          The test MiniCluster generates such a configuration in build/classes.
+        </p>
+<pre class="code">
+org.apache.pig.backend.executionengine.ExecException: 
+ERROR 4010: Cannot find hadoop configurations in classpath 
+(neither hadoop-site.xml nor core-site.xml was found in the classpath).
+If you plan to use local mode, please put -x local option in command line
+</pre>
+<a name="UDF+jars+Not+Found"></a>
+<h4>UDF jars Not Found</h4>
+<p>This warning means that some jars are missing from your test environment.</p>
+<pre class="code">
+WARN util.JarManager: Couldn't find the jar for 
+org.apache.pig.piggybank.evaluation.string.LOWER, skip it
+</pre>
+<a name="Storing+Data"></a>
+<h4>Storing Data</h4>
+<p>Pig currently drops all STORE and DUMP commands. You can tell PigUnit to keep the
+          commands and execute the script:</p>
+<pre class="code">
+test = new PigTest(PIG_SCRIPT, args);   
+test.unoverride("STORE");
+test.runScript();
+</pre>
+<a name="Cache+Archive"></a>
+<h4>Cache Archive</h4>
+<p>For cache archive to work, your test environment needs to have the cache archive options
+          specified by Java properties or in an additional XML configuration in its CLASSPATH.</p>
+<p>If you use a local cluster, you need to set the required environment variables before
+          starting it:</p>
+<pre class="code">export LD_LIBRARY_PATH=/home/path/to/lib</pre>
+<a name="Future+Enhancements"></a>
+<h3 class="h4">Future Enhancements</h3>
+<p>Improvements and other components based on PigUnit could be built later.</p>
+<p>For example, we could build a PigTestCase and PigTestSuite on top of PigTest to:</p>
+<ol>
+        
+<li>Add the notion of workspaces for each test.</li>
+        
+<li>Remove the boilerplate code that appears when there is more than one test method.</li>
+        
+<li>Add a standalone utility that reads test configurations and generates a test report.
+        </li>
+      
+</ol>
+</div>
+    
+
+</div>
+<!--+
+    |end content
+    +-->
+<div class="clearboth">&nbsp;</div>
+</div>
+<div id="footer">
+<!--+
+    |start bottomstrip
+    +-->
+<div class="lastmodified">
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<div class="copyright">
+        Copyright &copy;
+         2007-2013 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
+</div>
+<!--+
+    |end bottomstrip
+    +-->
+</div>
+</body>
+</html>

Added: pig/site/publish/docs/r0.11.1/test.pdf
URL: http://svn.apache.org/viewvc/pig/site/publish/docs/r0.11.1/test.pdf?rev=1463367&view=auto
==============================================================================
Binary file - no diff available.

Propchange: pig/site/publish/docs/r0.11.1/test.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: pig/site/publish/docs/r0.11.1/test1.html
URL: http://svn.apache.org/viewvc/pig/site/publish/docs/r0.11.1/test1.html?rev=1463367&view=auto
==============================================================================
--- pig/site/publish/docs/r0.11.1/test1.html (added)
+++ pig/site/publish/docs/r0.11.1/test1.html Tue Apr  2 03:32:39 2013
@@ -0,0 +1,37 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<html>
+  <head>
+		<title>Raw un-processed HTML page (test1)</title>
+  </head>
+	<body>
+		<h1>raw un-processed HTML page (test1)</h1>
+		<p>
+		This raw HTML page is linked to from xdocs/samples/static.xml
+    and from xdocs/samples/linking.xml
+		</p>
+		<p>All linked-to pages (for example: 
+     <a href="test2.html">&lt;a href="test2.html"&gt;</a>) are
+		also available.
+    </p>
+		<hr />
+    <p>
+		[return to <a href="index.html">Index</a>]<br>
+		[return to <a href="samples/linking.html">Linking demonstration</a>]
+    </p>
+	</body>
+</html>