You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by ol...@apache.org on 2010/01/05 21:31:13 UTC

svn commit: r896204 - in /hadoop/pig/trunk: CHANGES.txt src/docs/src/documentation/content/xdocs/zebra_mapreduce.xml src/docs/src/documentation/content/xdocs/zebra_pig.xml src/docs/src/documentation/content/xdocs/zebra_reference.xml

Author: olga
Date: Tue Jan  5 20:31:12 2010
New Revision: 896204

URL: http://svn.apache.org/viewvc?rev=896204&view=rev
Log:
PIG-1177: Pig 0.6 Docs - Zebra docs (chandec via olgan)

Modified:
    hadoop/pig/trunk/CHANGES.txt
    hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_mapreduce.xml
    hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_pig.xml
    hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_reference.xml

Modified: hadoop/pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=896204&r1=896203&r2=896204&view=diff
==============================================================================
--- hadoop/pig/trunk/CHANGES.txt (original)
+++ hadoop/pig/trunk/CHANGES.txt Tue Jan  5 20:31:12 2010
@@ -24,6 +24,8 @@
 
 IMPROVEMENTS
 
+PIG-1177: Pig 0.6 Docs - Zebra docs (chandec via olgan)
+
 PIG-1175: Pig 0.6 Docs - Store v. Dump (chandec via olgan)
 
 PIG-1102: Collect number of spills per job (sriranjan via olgan)

Modified: hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_mapreduce.xml
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_mapreduce.xml?rev=896204&r1=896203&r2=896204&view=diff
==============================================================================
--- hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_mapreduce.xml (original)
+++ hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_mapreduce.xml Tue Jan  5 20:31:12 2010
@@ -45,14 +45,215 @@
     </section>
 <!-- END HADOOP M/R API--> 
 
+ <!-- ZEBRA API-->
+   <section>
+   <title>Zebra MapReduce APIs</title>
+    <p>Zebra includes several classes for use in MapReduce programs. The main entry points into Zebra are the two classes for reading and writing tables, namely TableInputFormat and BasicTableOutputFormat. </p>
+
+    	<section>
+         <title>BasicTableOutputFormat  </title>    	
+    	<table>
+		<tr><th>Static</th><th>Method</th><th>Description</th></tr>
+		<tr>
+			<td>yes</td>
+			<td>void setOutputPath(JobConf, Path)  </td>
+			<td>Set the output path of the BasicTable in JobConf  </td>
+		</tr>
+		<tr>
+			<td>yes</td>
+			<td>Path[] getOutputPaths(JobConf) </td>
+			<td>Get the output paths of the BasicTable from JobConf </td>
+		</tr>
+		<tr>
+			<td>yes</td>
+			<td>void setStorageInfo(JobConf, ZebraSchema, ZebraStorageHint, ZebraSortInfo) </td>
+			<td>Set the table storage information (schema, storagehint, sortinfo) in JobConf</td>
+		</tr>
+			<tr>
+			<td>yes</td>
+			<td>Schema getSchema(JobConf)  </td>
+			<td>Get the table schema in JobConf  </td>
+		</tr>
+	    <tr>
+			<td>yes</td>
+			<td>BytesWritable generateSortKey(JobConf, Tuple)  </td>
+			<td>Generates a BytesWritable key for the input key </td>
+		</tr>
+		<tr>
+			<td>yes</td>
+			<td>String getStorageHint(JobConf)  </td>
+			<td>Get the table storage hint in JobConf  </td>
+		</tr>
+		<tr>
+			<td>yes</td>
+			<td>SortInfo getSortInfo(JobConf)  </td>
+			<td>Get the SortInfo object  </td>
+		</tr>
+		<tr>
+			<td>yes</td>
+			<td>void close(JobConf)  </td>
+			<td>Close the output BasicTable; no more rows can be added to the table  </td>
+		</tr>
+	   <tr>
+			<td>yes</td>
+			<td>void setMultipleOutputs(JobConf, String commaSeparatedLocs, Class&lt;? extends ZebraOutputPartition&gt; theClass)  </td>
+			<td>Enables data to be written to multiple zebra tables based on the ZebraOutputPartition class. 
+			See <a href="zebra_mapreduce.html#Multiple+Table+Outputs">Multiple Table Outputs.</a></td>
+		</tr>
+    	</table> 
+        </section>
+        
+          	<section>
+         <title>TableInputFormat   </title>    
+    	<table>
+		<tr><th>Static</th><th>Method</th><th>Description</th></tr>
+		<tr>
+			<td>yes</td>
+			<td>void setInputPaths(JobConf, Path... paths)  </td>
+			<td>Set the paths to the input table </td>
+
+		</tr>
+		<tr>
+			<td>yes</td>
+			<td>Path[] getInputPaths(JobConf)  </td>
+			<td>Get the comma-separated paths to the input table or table union  </td>
+		</tr>
+		<tr>
+			<td>yes</td>
+			<td>Schema getSchema(JobConf)  </td>
+			<td>Get the schema of a table expr  </td>
+		</tr>
+		<tr>
+			<td>yes</td>
+			<td>void setProjection(JobConf, ZebraProjection)  </td>
+			<td>Set the input projection in the JobConf object  </td>
+		</tr>
+		<tr>
+			<td>yes</td>
+			<td>String getProjection(JobConf) </td>
+			<td>Get the projection from the JobConf </td>
+		</tr>
+		<tr>
+			<td>yes</td>
+			<td>SortInfo getSortInfo(JobConf)  </td>
+			<td>Get the SortInfo object regarding a Zebra table  </td>
+		</tr>
+		<tr>
+			<td>yes</td>
+			<td>void requireSortedTable(JobConf, String sortcolumns, BytesComparator comparator) </td>
+			<td>Requires sorted table or table union </td>
+		</tr>
+	    <tr>
+			<td> yes </td>
+			<td>TableRecordReader getTableRecordReader(JobConf, ZebraProjection) </td>
+			<td>Get a TableRecordReader on a single split </td>
+		</tr>
+		<tr>
+			<td>yes</td>
+			<td>void setMinSplitSize(JobConf, long minSize) </td>
+			<td>Set the minimum split size, default of 1M bytes </td>
+		</tr>
+    	</table>
+    	</section>
+
+    <section>
+    <title>TableRecordReader </title>
+    	<table>
+		<tr><th>Static</th><th>Method</th><th>Description</th></tr>
+		<tr>
+			<td>no</td>
+			<td>boolean seekTo(BytesWritable key) </td>
+			<td>Seek to the position at the first row which has the key (returning true) or just after the key (returning false); only applicable for sorted Zebra table.  </td>
+		</tr>
+    	</table>
+     </section>
+     
+     
+     
+     <section>
+         <title>ZebraOutputPartition </title>
+    	<table>
+		<tr><th>Static</th><th>Method</th><th>Description</th></tr>
+		<tr>
+			<td> no </td>
+			<td>public abstract int getOutputPartition(BytesWritable key, Tuple value) </td>
+			<td>Abstract method from ZebraOutputPartition abstract class. App implements this to stream data to different table  </td>
+		</tr>
+		<tr>
+			<td> no </td>
+			<td>void setConf(Configuration jobConf)  </td>
+			<td>Initialization routine giving JobConf to application. Zebra implements it  </td>
+		</tr>
+		<tr>
+			<td> no </td>
+			<td>Configuration getConf()  </td>
+			<td> returns JobConf. Zebra implements it</td>
+		</tr>
+		<tr>
+			<td>yes </td>
+			<td>Class&lt;? extends ZebraOutputPartition&gt; getZebraOutputPartitionClass(JobConf conf) </td>
+			<td>return user implemented ZebraOutputPartition class  </td>
+		</tr>
+    	</table>
+   	   </section>
+   	   
+   	   
+    <section>
+    <title>ZebraProjection </title>
+    	<table>
+		<tr><th>Static</th><th>Method</th><th>Description</th></tr>
+		<tr>
+			<td>yes</td>
+			<td>ZebraProjection createZebraProjection(String) </td>
+			<td>Create a ZebraProjection object from a string representing projection information. </td>
+		</tr>
+    	</table>
+     </section>
+     
+    <section>
+    <title>ZebraSchema</title>
+    	<table>
+		<tr><th>Static</th><th>Method</th><th>Description</th></tr>
+		<tr>
+			<td>yes</td>
+			<td>ZebraSchema createZebraSchema(String)  </td>
+			<td>Create a ZebraSchema object from a string representing schema information.</td>
+		</tr>
+    	</table>
+     </section>     
+     
+    <section>
+    <title>ZebraStorageHint </title>
+    	<table>
+		<tr><th>Static</th><th>Method</th><th>Description</th></tr>
+		<tr>
+			<td>yes</td>
+			<td>ZebraStorageHint createZebraStorageHint(String) </td>
+			<td>Create a ZebraStorageHint object from a string representing storage hint information. </td>
+		</tr>
+    	</table>
+     </section>   
+     
+    <section>
+    <title>ZebraSortInfo </title>
+    	<table>
+		<tr><th>Static</th><th>Method</th><th>Description</th></tr>
+		<tr>
+			<td>yes</td>
+			<td>ZebraSortInfo createZebraSortInfo(String sortColumns, Class&lt;? extends RawComparator&lt;Object&gt;&gt; comparatorClass) </td>
+			<td>Create a ZebraSortInfo object from a sort columns string and a comparator class. </td>
+		</tr>
+    	</table>
+     </section>   
+         </section>
+ <!-- END ZEBRA API--> 
+
+
  
  <!-- ZEBRA M/R EXAMPLES-->
 <section>
 <title>Zebra MapReduce Examples</title> 
  
- 
-
- 
  <!-- ZEBRA OUTPUT EXAMPLE-->
 <section>
 <title>Table Output Format</title>

Modified: hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_pig.xml
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_pig.xml?rev=896204&r1=896203&r2=896204&view=diff
==============================================================================
--- hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_pig.xml (original)
+++ hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_pig.xml Tue Jan  5 20:31:12 2010
@@ -142,7 +142,7 @@
 -- Load an existing table (one column is projected)
 
 C = LOAD '$PATH/tbl4' USING org.apache.hadoop.zebra.pig.TableLoader('c2');   
-    </source>
+</source>
    </section>
    <!--end example: simple types-->
     
@@ -177,11 +177,32 @@
 -- Load an existing table (two columns are projected)
 
 B = LOAD '$PATH/tbl3' USING org.apache.hadoop.zebra.pig.TableLoader('s1, r1');    
-    </source>
+</source>
    </section>    
    <!--end example: complex types--> 
+
+
+       <!--example: HDFS Globs-->
+   <section>
+    <title>HDFS File Globs</title>
+        <p>Pig supports HDFS file globs 
+    (for more information about globs, see <a href="http://hadoop.apache.org/common/docs/current/api/org/apache/hadoop/fs/FileSystem.html">FileSystem</a> and GlobStatus).</p>
+    <p>In this example, all Zebra tables in the directory of /path/to/PIG/tables will be loaded as a union (table union). </p>
+ <source>
+ A = LOAD '/path/to/PIG/tables/*' USING org.apache.hadoop.zebra.pig.TableLoader('');
+</source>
+    
+<p>In this example, three Zebra tables of t1, t2 and t3 in /path/to/PIG/tables will be loaded as a union (table union). Note that the ordering of the three tables in the union may not necessarily be t1 followed by t2 followed by t3 as you would expect if you specified ‘/path/to/PIG/tables/t1, /path/to/PIG/tables/t2, /path/to/PIG/tables/t3’. Instead, the ordering is determined by the ordering the HDFS glob expansion generates, namely, the <em>string ordering</em> of the expanded paths.
+</p>
+
+ <source>
+ A = LOAD '/path/to/PIG/tables/{t1, t2, t3}' USING org.apache.hadoop.zebra.pig.TableLoader('');
+</source>
+   <p></p>
+    </section>    
+   <!--end example: HDFS Globs-->
     </section>
-  <!-- END ZEBRA EXAMPLES-->    
+<!-- END ZEBRA EXAMPLES-->    
   
  </body>
  </document>

Modified: hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_reference.xml
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_reference.xml?rev=896204&r1=896203&r2=896204&view=diff
==============================================================================
--- hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_reference.xml (original)
+++ hadoop/pig/trunk/src/docs/src/documentation/content/xdocs/zebra_reference.xml Tue Jan  5 20:31:12 2010
@@ -144,8 +144,8 @@
  
    <!-- STORE SCHEMA SPECIFICATION -->
    <section>
-   <title>Store Schema Specification</title>
-<p>The Zebra store schema is used to write or store Zebra columns and to specify column types. 
+   <title>Store Schema</title>
+<p>Use the Zebra store schema to write or store Zebra columns and to specify column types. 
 The schema  supports data type compatibility and conversion between Zebra/Pig, Zebra/MapReduce, and Zebra/Streaming.
 (<em>In a future release, the schema will also support type compatibility between Zebra/Pig-SQL and will guide the underlying serialization formats provided by Avro for projection, filtering, and so on. </em>)
 </p>   
@@ -361,24 +361,23 @@
    <section>
    <title>Storage Specification</title>
    
- <p> The Zebra storage specification is used to specify column groups and the columns in those groups. 
-</p>
- 
- <p>
- The storage specification describes the physical structure of a Zebra table where $PATH/tbl indicates the table_directory and column_group_name
- indicates the subdirectories within the table_directory. The STORE statement below indicates the following: 
+ <p> Use the Zebra storage specification to define Zebra column groups. The storage specification, when combined with a STORE statement, describes the physical structure of a Zebra table. Suppose we have the following statement:
  </p>
- <ul>
-		<li>$PATH/mytable - directory</li>
-		<li>$PATH/mytable/A - subdirectory </li>
-		<li>$PATH/mytable/A/part00001 </li>
-		<li><em>etc ...</em></li>
- </ul>
-
+ 
 <source>
-STORE A INTO '$PATH/mytable' USING org.apache.hadoop.zebra.pig.TableStorer('[a1, a2] AS A; [a3, a4, a5] AS B');
+STORE A INTO '$PATH/mytable' USING org.apache.hadoop.zebra.pig.TableStorer('[a1, a2] AS cg1; [a3, a4, a5] AS cg2');
 </source>
 
+<p>The statement describes a table that has two column groups; the first column group has two columns, the second column group has three columns. The statement can be interpreted as follows:</p>
+
+ <ul>
+		<li>$PATH/mytable - the table, a file path to a directory named mytable</li>
+		<li>$PATH/mytable/cg1 - the first column group, a subdirectory named cg1 under directory mytable </li>
+		<li>$PATH/mytable/cg1/part00001 - a file consisting, conceptually, of columns a1 and a2</li>
+	    <li>$PATH/mytable/cg2 - the second column group, a subdirectory named cg2 under directory mytable </li>
+		<li>$PATH/mytable/cg2/part00001 - a file consisting, conceptually, of columns a3, a4, and a5</li>
+ </ul>
+
   <section>
    <title>Specification</title>
     <p>  The basic format for the Zebra storage specification is shown here. 
@@ -448,9 +447,9 @@
    
    <!-- LOAD SCHEMA SPECIFICATION -->
    <section>
-   <title>Load Schema Specification</title>
+   <title>Load Schema</title>
    
-   <p>The Zebra load schema is load or read table columns. </p>
+   <p>Use the Zebra load schema to load or read table columns.</p>
      <section>
    <title>Schema</title>
    <p>The basic format for the Zebra load (read) schema is shown here. The column name can be any valid Zebra type.  
@@ -490,209 +489,6 @@
    </section>
    </section>
    <!-- END LOAD SCHEMA SPECIFICATION -->   
-
-   <!-- ZEBRA API-->
-   <section>
-   <title>Zebra MapReduce Interfaces</title>
-    <p>Zebra includes several classes for use in MapReduce programs. The main entry point into Zebra are the two classes for reading and writing tables, namely TableInputFormat and BasicTableOutputFormat. </p>
-
-    	<section>
-         <title>BasicTableOutputFormat  </title>    	
-    	<table>
-		<tr><th>Static</th><th>Method</th><th>Description</th></tr>
-		<tr>
-			<td>yes</td>
-			<td>void setOutputPath(JobConf, Path)  </td>
-			<td>Set the output path of the BasicTable in JobConf  </td>
-		</tr>
-		<tr>
-			<td>yes</td>
-			<td>Path[] getOutputPaths(JobConf) </td>
-			<td>Get the output paths of the BasicTable from JobConf </td>
-		</tr>
-		<tr>
-			<td>yes</td>
-			<td>void setStorageInfo(JobConf, ZebraSchema, ZebraStorageHint, ZebraSortInfo) </td>
-			<td>Set the table storage information (schema, storagehint, sortinfo) in JobConf</td>
-		</tr>
-			<tr>
-			<td>yes</td>
-			<td>Schema getSchema(JobConf)  </td>
-			<td>Get the table schema in JobConf  </td>
-		</tr>
-	    <tr>
-			<td>yes</td>
-			<td>BytesWritable generateSortKey(JobConf, Tuple)  </td>
-			<td>Generates a BytesWritable key for the input key </td>
-		</tr>
-		<tr>
-			<td>yes</td>
-			<td>String getStorageHint(JobConf)  </td>
-			<td>Get the table storage hint in JobConf  </td>
-		</tr>
-		<tr>
-			<td>yes</td>
-			<td>SortInfo getSortInfo(JobConf)  </td>
-			<td>Get the SortInfo object  </td>
-		</tr>
-		<tr>
-			<td>yes</td>
-			<td>void close(JobConf)  </td>
-			<td>Close the output BasicTable, No more rows can be added into the table  </td>
-		</tr>
-	   <tr>
-			<td>yes</td>
-			<td>void setMultipleOutputs(JobConf, String commaSeparatedLocs, Class &lt; extends ZebraOutputPartition&gt; theClass)  </td>
-			<td>Enables data to be written to multiple zebra tables based on the ZebraOutputPartition class. 
-			See <a href="zebra_mapreduce.html#Multiple+Table+Outputs">Multiple Table Outputs.</a></td>
-		</tr>
-    	</table> 
-        </section>
-        
-          	<section>
-         <title>TableInputFormat   </title>    
-    	<table>
-		<tr><th>Static</th><th>Method</th><th>Description</th></tr>
-		<tr>
-			<td>yes</td>
-			<td>void setInputPaths(JobConf, Path... paths)  </td>
-			<td>Set the paths to the input table </td>
-
-		</tr>
-		<tr>
-			<td>yes</td>
-			<td>Path[] getInputPaths(JobConf)  </td>
-			<td>Get the comma-separated paths to the input table or table union  </td>
-		</tr>
-		<tr>
-			<td>yes</td>
-			<td>Schema getSchema(JobConf)  </td>
-			<td>Get the schema of a table expr  </td>
-		</tr>
-		<tr>
-			<td>yes</td>
-			<td>void setProjection(JobConf, ZebraProjection)  </td>
-			<td>Set the input projection in the JobConf object  </td>
-		</tr>
-		<tr>
-			<td>yes</td>
-			<td>String getProjection(JobConf) </td>
-			<td>Get the projection from the JobConf </td>
-		</tr>
-		<tr>
-			<td>yes</td>
-			<td>SortInfo getSortInfo(JobConf)  </td>
-			<td>Get the SortInfo object regarding a Zebra table  </td>
-		</tr>
-		<tr>
-			<td>yes</td>
-			<td>void requireSortedTable(JobConf, String sortcolumns, BytesComparator comparator) </td>
-			<td>Requires sorted table or table union </td>
-		</tr>
-	    <tr>
-			<td> yes </td>
-			<td>TableRecordReader getTableRecordReader(JobConf, ZebraProjection) </td>
-			<td>Get a TableRecordReader on a single split </td>
-		</tr>
-		<tr>
-			<td>yes</td>
-			<td>void setMinSplitSize(JobConf, long minSize) </td>
-			<td>Set the minimum split size, default of 1M bytes </td>
-		</tr>
-    	</table>
-    	</section>
-
-    <section>
-    <title>TableRecordReader </title>
-    	<table>
-		<tr><th>Static</th><th>Method</th><th>Description</th></tr>
-		<tr>
-			<td>no</td>
-			<td>boolean seekTo(BytesWritable key) </td>
-			<td>Seek to the position at the first row which has the key (returning true) or just after the key(returning false); only applicable for sorted Zebra table.  </td>
-		</tr>
-    	</table>
-     </section>
-     
-     
-     
-     <section>
-         <title>ZebraOutputPartition </title>
-    	<table>
-		<tr><th>Static</th><th>Method</th><th>Description</th></tr>
-		<tr>
-			<td> no </td>
-			<td>public abstract int getOutputPartition(BytesWritable key, Tuple value) </td>
-			<td>Abstract method from ZebraOutputPartition abstract class. App implements this to stream data to different table  </td>
-		</tr>
-		<tr>
-			<td> no </td>
-			<td>void setConf(Configuration jobConf)  </td>
-			<td>Initialization routine giving JobConf to application. Zebra implements it  </td>
-		</tr>
-		<tr>
-			<td> no </td>
-			<td>Configuration getConf()  </td>
-			<td> returns JobConf. Zebra implements it</td>
-		</tr>
-		<tr>
-			<td>yes </td>
-			<td>Class&lt; extends ZebraOutputPartition&gt; getZebraOutputPartitionClass(JobConf conf) </td>
-			<td>return user implemented ZebraOutputPartition class  </td>
-		</tr>
-    	</table>
-   	   </section>
-   	   
-   	   
-    <section>
-    <title>ZebraProjection </title>
-    	<table>
-		<tr><th>Static</th><th>Method</th><th>Description</th></tr>
-		<tr>
-			<td>yes</td>
-			<td>ZebraProjection createZebraProjection(String) </td>
-			<td>Create a ZebraProjection object from a string representing projection information. </td>
-		</tr>
-    	</table>
-     </section>
-     
-    <section>
-    <title>ZebraSchema</title>
-    	<table>
-		<tr><th>Static</th><th>Method</th><th>Description</th></tr>
-		<tr>
-			<td>yes</td>
-			<td>ZebraSchema createZebraSchema(String)  </td>
-			<td>Create a ZebraStorageHint object from a string representing storage hint information.</td>
-		</tr>
-    	</table>
-     </section>     
-     
-    <section>
-    <title>ZebraStorageHint </title>
-    	<table>
-		<tr><th>Static</th><th>Method</th><th>Description</th></tr>
-		<tr>
-			<td>yes</td>
-			<td>ZebraStorageHint createZebraStorageHint(String) </td>
-			<td>Create a ZebraStorageHint object from a string representing storage hint information. </td>
-		</tr>
-    	</table>
-     </section>   
-     
-    <section>
-    <title>ZebraSortInfo </title>
-    	<table>
-		<tr><th>Static</th><th>Method</th><th>Description</th></tr>
-		<tr>
-			<td>yes</td>
-			<td>ZebraSortInfo createZebraSortInfo(String sortColumns, Class&lt; extends RawComparator &lt; Object &gt;&gt; comparatorClass) </td>
-			<td>Create a ZebraSortInfo object from a sort columns string and a comparator class. </td>
-		</tr>
-    	</table>
-     </section>   
-         </section>
- <!-- END ZEBRA API--> 
     
  </body>
  </document>