Posted to commits@pig.apache.org by ga...@apache.org on 2010/04/15 01:44:17 UTC

svn commit: r934242 [1/2] - in /hadoop/pig/trunk: ./ src/ src/org/apache/pig/ src/org/apache/pig/classification/

Author: gates
Date: Wed Apr 14 23:44:16 2010
New Revision: 934242

URL: http://svn.apache.org/viewvc?rev=934242&view=rev
Log:
PIG-1370: Marking Pig interface for org.apache.pig package.

Added:
    hadoop/pig/trunk/src/org/apache/pig/classification/
    hadoop/pig/trunk/src/org/apache/pig/classification/InterfaceAudience.java
    hadoop/pig/trunk/src/org/apache/pig/classification/InterfaceStability.java
Modified:
    hadoop/pig/trunk/CHANGES.txt
    hadoop/pig/trunk/src/org/apache/pig/Accumulator.java
    hadoop/pig/trunk/src/org/apache/pig/Algebraic.java
    hadoop/pig/trunk/src/org/apache/pig/CollectableLoadFunc.java
    hadoop/pig/trunk/src/org/apache/pig/ComparisonFunc.java
    hadoop/pig/trunk/src/org/apache/pig/EvalFunc.java
    hadoop/pig/trunk/src/org/apache/pig/Expression.java
    hadoop/pig/trunk/src/org/apache/pig/FileInputLoadFunc.java
    hadoop/pig/trunk/src/org/apache/pig/FileSplitComparable.java
    hadoop/pig/trunk/src/org/apache/pig/FilterFunc.java
    hadoop/pig/trunk/src/org/apache/pig/FuncSpec.java
    hadoop/pig/trunk/src/org/apache/pig/IndexableLoadFunc.java
    hadoop/pig/trunk/src/org/apache/pig/LoadCaster.java
    hadoop/pig/trunk/src/org/apache/pig/LoadFunc.java
    hadoop/pig/trunk/src/org/apache/pig/LoadMetadata.java
    hadoop/pig/trunk/src/org/apache/pig/LoadPushDown.java
    hadoop/pig/trunk/src/org/apache/pig/Main.java
    hadoop/pig/trunk/src/org/apache/pig/OrderedLoadFunc.java
    hadoop/pig/trunk/src/org/apache/pig/PigServer.java
    hadoop/pig/trunk/src/org/apache/pig/PigToStream.java
    hadoop/pig/trunk/src/org/apache/pig/ResourceSchema.java
    hadoop/pig/trunk/src/org/apache/pig/ResourceStatistics.java
    hadoop/pig/trunk/src/org/apache/pig/SortColInfo.java
    hadoop/pig/trunk/src/org/apache/pig/SortInfo.java
    hadoop/pig/trunk/src/org/apache/pig/StandAloneParser.java
    hadoop/pig/trunk/src/org/apache/pig/StoreFunc.java
    hadoop/pig/trunk/src/org/apache/pig/StoreFuncInterface.java
    hadoop/pig/trunk/src/org/apache/pig/StoreMetadata.java
    hadoop/pig/trunk/src/org/apache/pig/StreamToPig.java
    hadoop/pig/trunk/src/overview.html

Modified: hadoop/pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/CHANGES.txt (original)
+++ hadoop/pig/trunk/CHANGES.txt Wed Apr 14 23:44:16 2010
@@ -24,6 +24,8 @@ INCOMPATIBLE CHANGES
 
 IMPROVEMENTS
 
+PIG-1370: Marking Pig interface for org.apache.pig package (gates)
+
 PIG-1354: UDFs for dynamic invocation of simple Java methods (dvryaboy)
 
 PIG-1316: TextLoader should use Bzip2TextInputFormat for bzip files so that

Modified: hadoop/pig/trunk/src/org/apache/pig/Accumulator.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/Accumulator.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/Accumulator.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/Accumulator.java Wed Apr 14 23:44:16 2010
@@ -19,12 +19,27 @@ package org.apache.pig;
 
 import java.io.IOException;
 
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
 import org.apache.pig.data.Tuple;
 
+/**
+ * An interface that allows UDFs that take a bag to accumulate tuples in chunks rather than take
+ * the whole set at once.  This is intended for UDFs that do not need to see all of the tuples
+ * together but cannot be used with the combiner.  This lowers the memory needs, avoiding the need
+ * to spill large bags, and thus speeds up the query.  An example is something like session analysis.
+ * It cannot be used with the combiner because all of its inputs must first be ordered.  But it does
+ * not need to see all the tuples at once.  UDF implementors might also choose to implement this
+ * interface so that it can be used when other UDFs in the same FOREACH implement it.
+ * @since Pig 0.6
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Stable
 public interface Accumulator <T> {
     /**
-     * Pass tuples to the UDF.  You can retrive DataBag by calling b.get(index). 
-     * Each DataBag may contain 0 to many tuples for current key
+     * Pass tuples to the UDF.
+     * @param b A tuple containing a single field, which is a bag.  The bag will contain the set
+     * of tuples being passed to the UDF in this iteration.
      */
     public void accumulate(Tuple b) throws IOException;
 

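To make the chunked-accumulation contract above concrete, here is a minimal sketch of a hypothetical COUNT-style UDF.  The class name is illustrative, and the getValue()/cleanup() methods shown are the parts of the Accumulator interface that fall outside the quoted hunk:

    import java.io.IOException;

    import org.apache.pig.Accumulator;
    import org.apache.pig.data.DataBag;
    import org.apache.pig.data.Tuple;

    // Counts tuples one chunk at a time, never holding the whole bag in memory.
    public class ChunkedCount implements Accumulator<Long> {
        private long count = 0;

        // Called repeatedly; b has a single field, a bag with one chunk of tuples.
        public void accumulate(Tuple b) throws IOException {
            DataBag bag = (DataBag) b.get(0);
            count += bag.size();
        }

        // Called once all chunks for the current key have been accumulated.
        public Long getValue() {
            return count;
        }

        // Called between keys so the instance can be reused.
        public void cleanup() {
            count = 0;
        }
    }

In practice such a class would also extend EvalFunc<Long> so Pig can fall back to regular evaluation when accumulation does not apply.
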
Modified: hadoop/pig/trunk/src/org/apache/pig/Algebraic.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/Algebraic.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/Algebraic.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/Algebraic.java Wed Apr 14 23:44:16 2010
@@ -15,42 +15,51 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.pig;
-
-/**
+package org.apache.pig;
+
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
+
+/**
  * An interface to declare that an EvalFunc's 
  * calculation can be decomposed into intitial, intermediate, and final steps.
- * More formally, suppose we have to compute an function f over a bag X. In general, we need to know the entire X
- * before we can make any progress on f. However, some functions are <i>algebraic</i> e.g. SUM. In
- * these cases, you can apply some initital function f_init on subsets of X to get partial results. 
- * You can then combine partial results from different subsets of X using an intermediate function
- * f_intermed. To get the final answers, several partial results can be combined by invoking a final
- * f_final function. For the function SUM, f_init, f_intermed, and f_final are all SUM. 
- * 
- * See the code for builtin AVG to get a better idea of how algebraic works.
- * 
- * When eval functions implement this interface, it is a hint to the system to try and compute
- * partial results early which causes queries to run faster. 
- *
- */
-public interface Algebraic{
-    
-    /**
-     * 
-     * @return A string to instatiate f_init. f_init should be an eval func 
-     */
-    public String getInitial();
-
-    /**
-     * 
-     * @return A string to instantiate f_intermed. f_intermed should be an eval func
-     */
-    public String getIntermed();
-
-    /**
-     * 
-     * @return A string to instantiate f_final. f_final should be an eval func parametrized by
-     * the same datum as the eval func implementing this interface
-     */
-    public String getFinal();
-}
+ * More formally, suppose we have to compute a function f over a bag X. In general, we need to know the entire X
+ * before we can make any progress on f. However, some functions are <i>algebraic</i>, e.g. SUM. In
+ * these cases, you can apply some initial function f_init on subsets of X to get partial results. 
+ * You can then combine partial results from different subsets of X using an intermediate function
+ * f_intermed. To get the final answers, several partial results can be combined by invoking a final
+ * f_final function. For the function SUM, f_init, f_intermed, and f_final are all SUM. 
+ * 
+ * See the code for builtin AVG to get a better idea of how algebraic works.
+ * 
+ * When eval functions implement this interface, Pig will attempt to use MapReduce's combiner.
+ * The initial function will be called in the map phase and be passed a single tuple.  The
+ * intermediate function will be called 0 or more times in the combiner phase.  And the final
+ * function will be called once in the reduce phase.  It is important that the results be the same
+ * whether the intermediate function is called 0, 1, or more times.  Hadoop makes no guarantees
+ * about how many times the combiner will be called in a job.
+ *
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Stable
+public interface Algebraic{
+    
+    /**
+     * Get the initial function. 
+     * @return The name of the function to use as f_init. f_init should be an eval func.
+     */
+    public String getInitial();
+
+    /**
+     * Get the intermediate function. 
+     * @return The name of the function to use as f_intermed. f_intermed should be an eval func.
+     */
+    public String getIntermed();
+
+    /**
+     * Get the final function. 
+     * @return The name of the function to use as f_final. f_final should be an eval func parametrized by
+     * the same datum as the eval func implementing this interface.
+     */
+    public String getFinal();
+}

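As a companion to the combiner discussion above, a hedged sketch of a COUNT-like Algebraic UDF, patterned after the builtin COUNT that the javadoc points to (all names are illustrative, not part of this commit):

    import java.io.IOException;

    import org.apache.pig.Algebraic;
    import org.apache.pig.EvalFunc;
    import org.apache.pig.data.DataBag;
    import org.apache.pig.data.Tuple;
    import org.apache.pig.data.TupleFactory;

    public class MyCount extends EvalFunc<Long> implements Algebraic {
        private static final TupleFactory tf = TupleFactory.getInstance();

        // Non-combined path: count every tuple in the input bag.
        public Long exec(Tuple input) throws IOException {
            return ((DataBag) input.get(0)).size();
        }

        public String getInitial()  { return Initial.class.getName(); }
        public String getIntermed() { return Intermed.class.getName(); }
        public String getFinal()    { return Final.class.getName(); }

        // f_init, map phase: each call sees one tuple; its partial count is 1.
        public static class Initial extends EvalFunc<Tuple> {
            public Tuple exec(Tuple input) throws IOException {
                return tf.newTuple(Long.valueOf(1));
            }
        }

        // f_intermed, combiner phase: may run 0 or more times; sums partials.
        public static class Intermed extends EvalFunc<Tuple> {
            public Tuple exec(Tuple input) throws IOException {
                return tf.newTuple(sumPartials(input));
            }
        }

        // f_final, reduce phase: runs once; produces the final count.
        public static class Final extends EvalFunc<Long> {
            public Long exec(Tuple input) throws IOException {
                return sumPartials(input);
            }
        }

        private static Long sumPartials(Tuple input) throws IOException {
            long total = 0;
            for (Tuple t : (DataBag) input.get(0)) {
                total += (Long) t.get(0);
            }
            return total;
        }
    }

Note how Intermed consumes and produces the same shape (a tuple holding a partial count), which is what makes it safe to run the combiner 0, 1, or many times.
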
Modified: hadoop/pig/trunk/src/org/apache/pig/CollectableLoadFunc.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/CollectableLoadFunc.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/CollectableLoadFunc.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/CollectableLoadFunc.java Wed Apr 14 23:44:16 2010
@@ -19,18 +19,23 @@ package org.apache.pig;
 
 import java.io.IOException;
 
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
+
 /**
- * This interface implemented by {@link LoadFunc} implementations indicates to 
+ * This interface, implemented by a {@link LoadFunc} implementation, indicates to 
  * Pig that it has the capability to load data such that all instances of a key 
  * will occur in same split.
  * @since Pig 0.7
  */
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
 public interface CollectableLoadFunc {
 
     /**
-     * When this method is called, Pig is communicating to Loader that it must
+     * When this method is called, Pig is communicating to the Loader that it must
      * load data such that all instances of a key are in same split. Pig will
-     * make no further checks at runtime to ensure whether contract is honored
+     * make no further checks at runtime to ensure whether the contract is honored
      * or not.
      * @throws IOException
      */

Modified: hadoop/pig/trunk/src/org/apache/pig/ComparisonFunc.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/ComparisonFunc.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/ComparisonFunc.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/ComparisonFunc.java Wed Apr 14 23:44:16 2010
@@ -21,12 +21,21 @@ import java.io.IOException;
 
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.io.WritableComparator;
+
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
 import org.apache.pig.data.Tuple;
 import org.apache.pig.data.TupleFactory;
 import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PigProgressable;
 import org.apache.pig.impl.io.NullableTuple;
 
 
+/**
+ * A base class for custom order by comparison functions.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Stable
+@Deprecated
 public abstract class ComparisonFunc extends WritableComparator {
     // If the comparison is a time consuming process
     // this reporter must be used to report progress
@@ -36,6 +45,14 @@ public abstract class ComparisonFunc ext
         super(NullableTuple.class, true);
     }
 
+    /**
+     * Compare two tuples.  Note that even though both args are given the type
+     * WritableComparable to match the WritableComparator API, they
+     * must both be tuples.
+     * @param a first tuple
+     * @param b tuple to compare a to
+     * @return -1 if a &lt; b, 1 if a &gt; b, 0 if a = b
+     */
     public int compare(WritableComparable a, WritableComparable b) {
         // The incoming key will be in a NullableTuple.  But the comparison
         // function needs a tuple, so pull the tuple out.
@@ -44,19 +61,25 @@ public abstract class ComparisonFunc ext
 
     /**
      * This callback method must be implemented by all subclasses. Compares 
-     * its two arguments for order. Returns a negative integer, zero, or a 
-     * positive integer as the first argument is less than, equal to, or 
-     * greater than the second. The order of elements of the tuples correspond 
+     * its two arguments for order.  The order of elements of the tuples corresponds 
      * to the fields specified in the order by clause. 
      * Same semantics as {@link java.util.Comparator}.
      * 
      * @param t1 the first Tuple to be compared.
      * @param t2 the second Tuple to be compared.
+     * @return A negative integer, zero, or a positive integer as the first
+     * argument is less than, equal to, or greater than the second.
      * @throws IOException
      * @see java.util.Comparator
      */
     abstract public int compare(Tuple t1, Tuple t2);
 
+    /**
+     * Set the reporter.  If the comparison takes a long time the
+     * reporter should be called occasionally to avoid Hadoop timing out
+     * underneath.  The default Hadoop timeout is 600 seconds.
+     * @param reporter Progress reporter
+     */
     public void setReporter(PigProgressable reporter) {
         this.reporter = reporter;
     }

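Although this commit deprecates ComparisonFunc, a short illustrative example may help: a hypothetical comparator that sorts tuples descending by their first field, assumed to be an int:

    import org.apache.pig.ComparisonFunc;
    import org.apache.pig.backend.executionengine.ExecException;
    import org.apache.pig.data.Tuple;

    public class DescendingIntComparator extends ComparisonFunc {
        public int compare(Tuple t1, Tuple t2) {
            try {
                int a = (Integer) t1.get(0);
                int b = (Integer) t2.get(0);
                // Reverse the natural order for a descending sort.
                return (a < b) ? 1 : ((a > b) ? -1 : 0);
            } catch (ExecException e) {
                // compare() cannot throw IOException, so wrap it.
                throw new RuntimeException(e);
            }
        }
    }

It would be invoked from Pig Latin along the lines of: B = order A by * using DescendingIntComparator;
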
Modified: hadoop/pig/trunk/src/org/apache/pig/EvalFunc.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/EvalFunc.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/EvalFunc.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/EvalFunc.java Wed Apr 14 23:44:16 2010
@@ -26,6 +26,8 @@ import java.util.List;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
 import org.apache.pig.data.Tuple;
 import org.apache.pig.impl.PigContext;
 import org.apache.pig.impl.logicalLayer.FrontendException;
@@ -36,19 +38,35 @@ import org.apache.pig.backend.hadoop.exe
 
 /**
  * The class is used to implement functions to be applied to
- * a dataset. The function is applied to each Tuple in the set.
+ * fields in a dataset. The function is applied to each Tuple in the set.
  * The programmer should not make assumptions about state maintained
- * between invocations of the invoke() method since the Pig runtime
+ * between invocations of the exec() method since the Pig runtime
  * will schedule and localize invocations based on information provided
  * at runtime.  The programmer also should not make assumptions about when or
  * how many times the class will be instantiated, since it may be instantiated
  * multiple times in both the front and back end.
  */
+@InterfaceAudience.Public
+@InterfaceStability.Stable
 public abstract class EvalFunc<T>  {
-    // UDFs must use this to report progress
-    // if the exec is taking more that 300 ms
+    /**
+     * Reporter to send heartbeats to Hadoop.  If exec will take more than
+     * a few seconds {@link PigProgressable#progress} should be called
+     * occasionally to avoid timeouts.  Default Hadoop timeout is 600 seconds.
+     */
     protected PigProgressable reporter;
+
+    /**
+     * Logging object.  Log calls made on the front end will be sent to
+     * Pig's log on the client.  Log calls made on the backend will be
+     * sent to stdout and can be seen in the Hadoop logs.
+     */
     protected Log log = LogFactory.getLog(getClass());
+
+    /**
+     * Logger for aggregating warnings.  Any warnings to be sent to the user
+     * should be logged to this via {@link PigLogger#warn}.
+     */
     protected PigLogger pigLogger;
 
     private static int nextSchemaId; // for assigning unique ids to UDF columns
@@ -62,6 +80,9 @@ public abstract class EvalFunc<T>  {
         return alias;
     }
     
+    /**
+     * Return type of this instance of EvalFunc.
+     */
     protected Type returnType;
     
     public EvalFunc(){
@@ -113,16 +134,31 @@ public abstract class EvalFunc<T>  {
         }
     }
     
+    /**
+     * Get the Type that this EvalFunc returns.
+     * @return Type
+     */
     public Type getReturnType(){
         return returnType;
     }
         
     // report that progress is being made (otherwise hadoop times out after 600 seconds working on one outer tuple)
+    /**
+     * Utility method to allow UDF to report progress.  If exec will take more than
+     * a few seconds {@link PigProgressable#progress} should be called
+     * occasionally to avoid timeouts.  Default Hadoop timeout is 600 seconds.
+     */
     public final void progress() {
         if (reporter != null) reporter.progress();
         else warn("No reporter object provided to UDF.", PigWarning.PROGRESS_REPORTER_NOT_PROVIDED);
     }
     
+    /**
+     * Issue a warning.  Warning messages are aggregated and reported to
+     * the user.
+     * @param msg String message of the warning
+     * @param warningEnum type of warning
+     */
     public final void warn(String msg, Enum warningEnum) {
     	if(pigLogger != null) pigLogger.warn(this, msg, warningEnum);
     	else log.warn("No logger object provided to UDF: " + this.getClass().getName() + ". " + msg);
@@ -130,7 +166,7 @@ public abstract class EvalFunc<T>  {
 
     /**
      * Placeholder for cleanup to be performed at the end. User defined functions can override.
-     *
+     * Default implementation is a no-op.
      */
     public void finish(){}
     
@@ -150,6 +186,9 @@ public abstract class EvalFunc<T>  {
     abstract public T exec(Tuple input) throws IOException;
     
     /**
+     * Report the schema of the output of this UDF.  Pig will make use of
+     * this in error checking, optimization, and planning.  The schema
+     * of input data to this UDF is provided.
      * @param input Schema of the input
      * @return Schema of the output
      */
@@ -163,6 +202,7 @@ public abstract class EvalFunc<T>  {
      * asynchronously.
      * @return true if the function can be executed asynchronously.
      */
+    @Deprecated
     public boolean isAsynchronous(){
         return false;
     }
@@ -173,13 +213,32 @@ public abstract class EvalFunc<T>  {
     }
 
 
+    /**
+     * Set the reporter.  Called by Pig to provide a reference of
+     * the reporter to the UDF.
+     * @param reporter Hadoop reporter
+     */
     public final void setReporter(PigProgressable reporter) {
         this.reporter = reporter;
     }
     
     /**
-     * @return A List containing FuncSpec objects representing the Function class
-     * which can handle the inputs corresponding to the schema in the objects
+     * Allow a UDF to specify type specific implementations of itself.  For example,
+     * an implementation of arithmetic sum might have int and float implementations,
+     * since integer arithmetic performs much better than floating point arithmetic.  Pig's
+     * typechecker will call this method and, using the returned list plus the schema
+     * of the function's input data, decide which implementation of the UDF to use.
+     * @return A List containing FuncSpec objects representing the EvalFunc class
+     * which can handle the inputs corresponding to the schema in the objects.  Each
+     * FuncSpec should be constructed with a schema that describes the input for that
+     * implementation.  For example, the sum function above would return two elements in its 
+     * list:
+     * <ol>
+     * <li>FuncSpec(this.getClass().getName(), new Schema(new Schema.FieldSchema(null, DataType.DOUBLE)))
+     * <li>FuncSpec(IntSum.class.getName(), new Schema(new Schema.FieldSchema(null, DataType.INTEGER)))
+     * </ol>
+     * This would indicate that the main implementation is used for doubles, and the special
+     * implementation IntSum is used for ints.
      */
     public List<FuncSpec> getArgToFuncMapping() throws FrontendException{
         return null;
@@ -189,6 +248,11 @@ public abstract class EvalFunc<T>  {
         return pigLogger;
     }
 
+    /**
+     * Set the PigLogger object.  Called by Pig to provide a reference 
+     * to the UDF.
+     * @param pigLogger PigLogger object.
+     */
     public final void setPigLogger(PigLogger pigLogger) {
         this.pigLogger = pigLogger;
     }

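To illustrate the getArgToFuncMapping() contract spelled out above, here is a hedged sketch of the SUM example from the javadoc.  Sum and IntSum are hypothetical names; IntSum is assumed to be a sibling EvalFunc<Integer> and is not defined here:

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.pig.EvalFunc;
    import org.apache.pig.FuncSpec;
    import org.apache.pig.data.DataBag;
    import org.apache.pig.data.DataType;
    import org.apache.pig.data.Tuple;
    import org.apache.pig.impl.logicalLayer.FrontendException;
    import org.apache.pig.impl.logicalLayer.schema.Schema;

    public class Sum extends EvalFunc<Double> {
        public Double exec(Tuple input) throws IOException {
            double total = 0;
            for (Tuple t : (DataBag) input.get(0)) {
                total += ((Number) t.get(0)).doubleValue();
            }
            return total;
        }

        // Tell the typechecker which implementation handles which input type.
        public List<FuncSpec> getArgToFuncMapping() throws FrontendException {
            List<FuncSpec> funcList = new ArrayList<FuncSpec>();
            // This class is the default implementation, used for doubles.
            funcList.add(new FuncSpec(this.getClass().getName(),
                    new Schema(new Schema.FieldSchema(null, DataType.DOUBLE))));
            // IntSum (hypothetical) is substituted when the input is an int.
            funcList.add(new FuncSpec(IntSum.class.getName(),
                    new Schema(new Schema.FieldSchema(null, DataType.INTEGER))));
            return funcList;
        }
    }
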
Modified: hadoop/pig/trunk/src/org/apache/pig/Expression.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/Expression.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/Expression.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/Expression.java Wed Apr 14 23:44:16 2010
@@ -17,10 +17,15 @@
  */
 package org.apache.pig;
 
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
 
 /**
- * A class to communicate Filter expressions to LoadFuncs
+ * A class to communicate Filter expressions to LoadFuncs.
+ * @since Pig 0.7
  */
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
 public abstract class Expression {
 
  // Operator type                                                                                                                                                                                                                                                                                       

Modified: hadoop/pig/trunk/src/org/apache/pig/FileInputLoadFunc.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/FileInputLoadFunc.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/FileInputLoadFunc.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/FileInputLoadFunc.java Wed Apr 14 23:44:16 2010
@@ -22,11 +22,17 @@ import org.apache.hadoop.io.WritableComp
 import org.apache.hadoop.mapreduce.InputSplit;
 import org.apache.hadoop.mapreduce.lib.input.FileSplit;
 
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
+
 /**
  * This class provides an implementation of OrderedLoadFunc interface
  * which can be optionally re-used by LoadFuncs that use FileInputFormat, by
  * having this as a super class
+ * @since Pig 0.7
  */
+@InterfaceAudience.Public
+@InterfaceStability.Evolving // Since we haven't done outer join for merge join yet
 public abstract class FileInputLoadFunc extends LoadFunc implements OrderedLoadFunc  {
     
     @Override

Modified: hadoop/pig/trunk/src/org/apache/pig/FileSplitComparable.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/FileSplitComparable.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/FileSplitComparable.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/FileSplitComparable.java Wed Apr 14 23:44:16 2010
@@ -23,14 +23,18 @@ import java.io.IOException;
 import java.io.Serializable;
 
 import org.apache.hadoop.io.WritableComparable;
-import org.apache.pig.data.DataReaderWriter;
 
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
+import org.apache.pig.data.DataReaderWriter;
 
 /**
- * This class can be used to represent a relative position in a file. 
- * compareTo(other) function of WritaleComparable interface is used to compare
- * position of different objects of this class.
+ * This class represents a relative position in a file.  It records a filename
+ * and an offset.  This allows Pig to order FileSplits.
+ * @since Pig 0.7
  */
+@InterfaceAudience.Public
+@InterfaceStability.Evolving // Since we haven't done outer join for merge join yet
 public class FileSplitComparable implements WritableComparable<FileSplitComparable>, Serializable{
 
     private static final long serialVersionUID = 1L;
@@ -111,4 +115,4 @@ public class FileSplitComparable impleme
             return false;
         return true;
     }
-}
\ No newline at end of file
+}

Modified: hadoop/pig/trunk/src/org/apache/pig/FilterFunc.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/FilterFunc.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/FilterFunc.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/FilterFunc.java Wed Apr 14 23:44:16 2010
@@ -19,14 +19,18 @@ package org.apache.pig;
 
 import java.io.IOException;
 
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
 import org.apache.pig.data.Tuple;
 
 
+@InterfaceAudience.Public
+@InterfaceStability.Stable
 public abstract class FilterFunc extends EvalFunc<Boolean> {
     
     /**
      * Placeholder for cleanup to be performed at the end. User defined functions can override.
-     *
+     * Default implementation is a no-op.
      */
     public void finish(){}
 }

Modified: hadoop/pig/trunk/src/org/apache/pig/FuncSpec.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/FuncSpec.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/FuncSpec.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/FuncSpec.java Wed Apr 14 23:44:16 2010
@@ -23,12 +23,16 @@ import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
 import org.apache.pig.impl.logicalLayer.schema.Schema;
 
 /**
- * Class to represent a UDF specification - essentially 
- * encapsulates the class name and the arguments to the constructor
+ * Class to represent a UDF specification.
+ * Encapsulates the class name and the arguments to the constructor.
  */
+@InterfaceAudience.Public
+@InterfaceStability.Stable
 public class FuncSpec implements Serializable, Cloneable {
 
     private static final long serialVersionUID = 2L;
@@ -38,7 +42,8 @@ public class FuncSpec implements Seriali
    
     /**
      * @param className the name of the class for the udf
-     * @param ctorArg the argument for the constructor for the above class
+     * @param ctorArg the argument to pass to the constructor for the above class.
+     * Constructors can only take strings.
      */
     public FuncSpec(String className, String ctorArg) {
         this.className = className;
@@ -48,7 +53,8 @@ public class FuncSpec implements Seriali
 
     /**
      * @param className the name of the class for the udf
-     * @param ctorArgs the arguments for the constructor for the above class
+     * @param ctorArgs the arguments to pass to the constructor for the above class.
+     * Constructors can only take strings.
      */
     public FuncSpec(String className, String[] ctorArgs) {
         this.className = className;
@@ -57,7 +63,8 @@ public class FuncSpec implements Seriali
     
     /**
      * @param className the name of the class for the udf
-     * @param ctorArgs the arguments for the constructor for the above class
+     * @param ctorArgs the arguments to pass to the constructor for the above class.
+     * Constructors can only take strings.
      * @param inputArgsSchema schema for input args taken by this Function
      */
     public FuncSpec(String className, String[] ctorArgs, Schema inputArgsSchema) {
@@ -94,6 +101,10 @@ public class FuncSpec implements Seriali
         this.inputArgsSchema = inputArgsSchema;
     }
     
+    /**
+     * Parse the class name out of a function specification string.
+     * @return name of the class.
+     */
     public static String getClassNameFromSpec(String funcSpec){
         int paren = funcSpec.indexOf('(');
         if (paren!=-1)
@@ -102,6 +113,10 @@ public class FuncSpec implements Seriali
             return funcSpec;
     }
  
+    /**
+     * Get the argument values passed to the func spec.
+     * @return argument values, in the form: arg1, arg2, ... ) -- note the trailing parenthesis is included.
+     */
     public static String getArgStringFromSpec(String funcSpec){
         int paren = funcSpec.indexOf('(');
         if (paren!=-1)
@@ -111,7 +126,7 @@ public class FuncSpec implements Seriali
     }
     
     /**
-     * Function to parse the arguments from a function specification argument list
+     * Parse the argument values out of a function specification string.
      * @param argString should be of the form "'arg1', 'arg2', ..."
      * @return List of the different argument strings
      */

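A small usage sketch of the FuncSpec helpers documented above; org.example.MyUdf is a made-up class name:

    import org.apache.pig.FuncSpec;

    public class FuncSpecDemo {
        public static void main(String[] args) {
            String spec = "org.example.MyUdf('tab','utf8')";
            // Prints: org.example.MyUdf
            System.out.println(FuncSpec.getClassNameFromSpec(spec));
            // Prints: 'tab','utf8')  -- the trailing parenthesis is kept
            System.out.println(FuncSpec.getArgStringFromSpec(spec));
            // Equivalent object form: class name plus string constructor args.
            FuncSpec fs = new FuncSpec("org.example.MyUdf",
                    new String[] {"tab", "utf8"});
            System.out.println(fs);
        }
    }
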
Modified: hadoop/pig/trunk/src/org/apache/pig/IndexableLoadFunc.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/IndexableLoadFunc.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/IndexableLoadFunc.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/IndexableLoadFunc.java Wed Apr 14 23:44:16 2010
@@ -20,36 +20,46 @@ package org.apache.pig;
 import java.io.IOException;
 
 import org.apache.hadoop.conf.Configuration;
+
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
 import org.apache.pig.data.Tuple;
 
 /**
  * This class is intended for use by LoadFunc implementations
  * which have an internal index for sorted data and can use the index
- * to support merge join in pig. Interaction with the index 
- * is abstracted away by the methods in this interface which the pig
+ * to support merge join in Pig. Interaction with the index 
+ * is abstracted away by the methods in this interface which the Pig
  * runtime will call in a particular sequence to get the records it
  * needs to perform the merge based join.
  * 
- * The sequence of calls made from the pig runtime are:
- * {@link LoadFunc#setUDFContextSignature(String)}
- * {@link IndexableLoadFunc#initialize(Configuration)}
- * {@link LoadFunc#setLocation(String, org.apache.hadoop.mapreduce.Job)}
- * {@link IndexableLoadFunc#seekNear(Tuple)}
- * A series of IndexableLoadFunc.getNext(); calls to perform the join
- * IndexableLoadFunc.close(); 
+ * The sequence of calls made from the Pig runtime are:
+ * <ol>
+ * <li>{@link LoadFunc#setUDFContextSignature(String)}
+ * <li>{@link IndexableLoadFunc#initialize(Configuration)}
+ * <li>{@link LoadFunc#setLocation(String, org.apache.hadoop.mapreduce.Job)}
+ * <li>{@link IndexableLoadFunc#seekNear(Tuple)}
+ * <li>{@link LoadFunc#getNext} called multiple times to retrieve data and perform the join
+ * <li>{@link IndexableLoadFunc#close}
+ * </ol>
+ * @since Pig 0.6
  * 
  */
+@InterfaceAudience.Public
+@InterfaceStability.Evolving // Set to evolving because we don't have this working with outer join
+                             // yet, and we may need to change it some for that.
 public interface IndexableLoadFunc {
     
     /**
-     * This method is called by pig run time to allow the
+     * This method is called by the Pig runtime to allow the
      * IndexableLoadFunc to perform any initialization actions
      * @param conf The job configuration object
+     * @throws IOException
      */
     public abstract void initialize(Configuration conf) throws IOException;
 
     /**
-     * This method is called by the pig runtime to indicate
+     * This method is called by the Pig runtime to indicate
      * to the LoadFunc to position its underlying input stream
      * near the keys supplied as the argument. Specifically:
      * 1) if the keys are present in the input stream, the loadfunc
@@ -86,10 +96,10 @@ public interface IndexableLoadFunc {
     
     
     /**
-     * A method called by the pig runtime to give an opportunity
+     * A method called by the Pig runtime to give an opportunity
      * for implementations to perform cleanup actions like closing
      * the underlying input stream. This is necessary since while
-     * performing a join the pig run time may determine than no further
+     * performing a join the Pig runtime may determine that no further
      * join is possible with remaining records and may indicate to the
      * IndexableLoader to cleanup by calling this method.
      * 

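A hedged skeleton showing the call sequence above from the implementer's side; the class name is illustrative and all index handling is elided.  A real implementation would also be a LoadFunc so that getNext() can be called between seekNear() and close():

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.pig.IndexableLoadFunc;
    import org.apache.pig.data.Tuple;

    public abstract class SortedDataLoaderSketch implements IndexableLoadFunc {
        // Step 2 of the sequence: open the index and set up any state.
        public void initialize(Configuration conf) throws IOException {
            // e.g. locate the index using values from conf (elided)
        }

        // Step 4: consult the index and position the underlying input
        // stream at or just before the requested keys.
        public void seekNear(Tuple keys) throws IOException {
            // index lookup elided
        }

        // Step 6: release the underlying stream; Pig may call this before
        // the data is exhausted, once no further join matches are possible.
        public void close() throws IOException {
        }
    }
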
Modified: hadoop/pig/trunk/src/org/apache/pig/LoadCaster.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/LoadCaster.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/LoadCaster.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/LoadCaster.java Wed Apr 14 23:44:16 2010
@@ -1,5 +1,4 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
+/* Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
@@ -21,65 +20,74 @@ import java.io.IOException;
 import java.util.Map;
 
 import org.apache.pig.ResourceSchema.ResourceFieldSchema;
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
 import org.apache.pig.data.DataBag;
 import org.apache.pig.data.Tuple;
 
 /**
- *
+ * An interface that provides cast implementations for load functions.  For casts between
+ * bytearray objects and internal types, Pig relies on the load function that loaded the data to
+ * provide the cast.  This is because Pig does not understand the binary representation of the
+ * data and thus cannot cast it.  This interface provides functions to cast from bytearray to each
+ * of Pig's internal types.
+ * @since Pig 0.7
  */
+@InterfaceAudience.Public
+@InterfaceStability.Evolving // Because we still don't have the map casts quite right
 public interface LoadCaster {
 
     /**
-     * Cast data from bytes to long value.  
-     * @param b byte array to be cast.
+     * Cast data from bytearray to long value.  
+     * @param b bytearray to be cast.
      * @return Long value.
      * @throws IOException if the value cannot be cast.
      */
     public Long bytesToLong(byte[] b) throws IOException;
 
     /**
-     * Cast data from bytes to float value.  
-     * @param b byte array to be cast.
+     * Cast data from bytearray to float value.  
+     * @param b bytearray to be cast.
      * @return Float value.
      * @throws IOException if the value cannot be cast.
      */
     public Float bytesToFloat(byte[] b) throws IOException;
 
     /**
-     * Cast data from bytes to double value.  
-     * @param b byte array to be cast.
+     * Cast data from bytearray to double value.  
+     * @param b bytearray to be cast.
      * @return Double value.
      * @throws IOException if the value cannot be cast.
      */
     public Double bytesToDouble(byte[] b) throws IOException;
 
     /**
-     * Cast data from bytes to integer value.  
-     * @param b byte array to be cast.
+     * Cast data from bytearray to integer value.  
+     * @param b bytearray to be cast.
      * @return Double value.
      * @throws IOException if the value cannot be cast.
      */
     public Integer bytesToInteger(byte[] b) throws IOException;
     
     /**
-     * Cast data from bytes to chararray value.  
-     * @param b byte array to be cast.
+     * Cast data from bytearray to chararray value.  
+     * @param b bytearray to be cast.
      * @return String value.
      * @throws IOException if the value cannot be cast.
      */
     public String bytesToCharArray(byte[] b) throws IOException;
 
     /**
-     * Cast data from bytes to map value.  
-     * @param b byte array to be cast.
+     * Cast data from bytearray to map value.  
+     * @param b bytearray to be cast.
      * @return Map value.
      * @throws IOException if the value cannot be cast.
      */
     public Map<String, Object> bytesToMap(byte[] b) throws IOException;
 
     /**
-     * Cast data from bytes to tuple value.  
-     * @param b byte array to be cast.
+     * Cast data from bytearray to tuple value.  
+     * @param b bytearray to be cast.
      * @param fieldSchema field schema for the output tuple
      * @return Tuple value.
      * @throws IOException if the value cannot be cast.
@@ -87,8 +95,8 @@ public interface LoadCaster {
     public Tuple bytesToTuple(byte[] b, ResourceFieldSchema fieldSchema) throws IOException;
 
     /**
-     * Cast data from bytes to bag value.  
-     * @param b byte array to be cast.
+     * Cast data from bytearray to bag value.  
+     * @param b bytearray to be cast.
      * @param fieldSchema field schema for the output bag
      * @return Bag value.
      * @throws IOException if the value cannot be cast.

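For one of the casts above, a minimal sketch assuming the loader wrote its data as UTF-8 text, the same approach Pig's own Utf8StorageConverter takes.  A real LoadCaster must supply all of the bytesTo* methods; TextIntCaster is a made-up helper:

    import java.io.IOException;

    public class TextIntCaster {
        // Same contract as LoadCaster.bytesToInteger: return the value, or
        // throw IOException if the bytes cannot be cast.
        public static Integer bytesToInteger(byte[] b) throws IOException {
            if (b == null) {
                return null;
            }
            try {
                // Only the loader knows its on-disk format; here it is
                // assumed to be plain UTF-8 text.
                return Integer.valueOf(new String(b, "UTF-8").trim());
            } catch (NumberFormatException e) {
                throw new IOException("Cannot cast bytes to integer", e);
            }
        }
    }
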
Modified: hadoop/pig/trunk/src/org/apache/pig/LoadFunc.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/LoadFunc.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/LoadFunc.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/LoadFunc.java Wed Apr 14 23:44:16 2010
@@ -28,6 +28,9 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.mapreduce.InputFormat;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.RecordReader;
+
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
 import org.apache.pig.LoadPushDown.RequiredFieldList;
 import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
 import org.apache.pig.builtin.Utf8StorageConverter;
@@ -37,9 +40,18 @@ import org.apache.pig.impl.util.UDFConte
 
 
 /**
- * <code>LoadFunc</code> provides functions directly associated with reading 
- * records from data set.
+ * A LoadFunc loads data into Pig.  It can read from an HDFS file or other source.
+ * LoadFunc is tightly coupled to Hadoop's {@link org.apache.hadoop.mapreduce.InputFormat}.
+ * LoadFuncs sit atop an InputFormat and translate from the keys and values of Hadoop
+ * to Pig's tuples.  
+ * <p>
+ * LoadFunc contains the basic features needed by the majority of load functions.  For
+ * more advanced functionality there are separate interfaces that a load function
+ * can implement.  See {@link LoadCaster}, {@link LoadMetadata}, {@link LoadPushDown}, 
+ * {@link OrderedLoadFunc}, {@link CollectableLoadFunc}, and {@link IndexableLoadFunc}.
  */
+@InterfaceAudience.Public
+@InterfaceStability.Stable
 public abstract class LoadFunc {
     
     /**

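The paragraph above describes a LoadFunc as a translation layer over an InputFormat.  A hedged skeleton of about the smallest useful loader, one that returns each line of a text file as a single-field tuple (WholeLineLoader is a made-up name, loosely modeled on PigStorage):

    import java.io.IOException;

    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.InputFormat;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.RecordReader;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
    import org.apache.pig.LoadFunc;
    import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
    import org.apache.pig.data.Tuple;
    import org.apache.pig.data.TupleFactory;

    public class WholeLineLoader extends LoadFunc {
        private RecordReader reader;
        private final TupleFactory tf = TupleFactory.getInstance();

        public void setLocation(String location, Job job) throws IOException {
            FileInputFormat.setInputPaths(job, location);
        }

        public InputFormat getInputFormat() throws IOException {
            return new TextInputFormat();
        }

        public void prepareToRead(RecordReader reader, PigSplit split)
                throws IOException {
            this.reader = reader;
        }

        // Translate each (offset, line) record into a single-field tuple.
        public Tuple getNext() throws IOException {
            try {
                if (!reader.nextKeyValue()) {
                    return null;  // end of this split
                }
                Text line = (Text) reader.getCurrentValue();
                return tf.newTuple(line.toString());
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
        }
    }
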
Modified: hadoop/pig/trunk/src/org/apache/pig/LoadMetadata.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/LoadMetadata.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/LoadMetadata.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/LoadMetadata.java Wed Apr 14 23:44:16 2010
@@ -22,11 +22,17 @@ import java.io.IOException;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.mapreduce.Job;
 
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
+
 /**
  * This interface defines how to retrieve metadata related to data to be loaded.
  * If a given loader does not implement this interface, it will be assumed that it
  * is unable to provide metadata about the associated data.
+ * @since Pig 0.7
  */
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
 public interface LoadMetadata {
 
     /**
@@ -80,7 +86,7 @@ public interface LoadMetadata {
      * will only contain references to fields given as partition keys in
      * getPartitionKeys. So if the implementation returns null in 
      * {@link #getPartitionKeys(String, Job)}, then this method is not
-     * called by pig runtime. This method is also not called by the pig runtime
+     * called by the Pig runtime. This method is also not called by the Pig runtime
      * if there are no partition filter conditions. 
      * @param partitionFilter that describes filter for partitioning
      * @throws IOException if the filter is not compatible with the storage

Modified: hadoop/pig/trunk/src/org/apache/pig/LoadPushDown.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/LoadPushDown.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/LoadPushDown.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/LoadPushDown.java Wed Apr 14 23:44:16 2010
@@ -21,13 +21,18 @@ import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.List;
 
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
 import org.apache.pig.impl.logicalLayer.FrontendException;
 
 /**
  * This interface defines how to communicate to Pig what functionality can
  * be pushed into the loader.  If a given loader does not implement this interface
  * it will be assumed that it is unable to accept any functionality for push down.
+ * @since Pig 0.7
  */
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
 public interface LoadPushDown {
 
     /**
@@ -54,10 +59,17 @@ public interface LoadPushDown {
      * the input are required. If the loader function cannot make use of this 
      * information, it is free to ignore it by returning an appropriate Response
      * @param requiredFieldList RequiredFieldList indicating which columns will be needed.
+     * @return Indicates which fields will be returned
+     * @throws FrontendException
      */
     public RequiredFieldResponse pushProjection(RequiredFieldList 
             requiredFieldList) throws FrontendException;
     
+    /**
+     * Describes a field that is required to execute a script.
+     */
+    @InterfaceAudience.Public
+    @InterfaceStability.Evolving
     public static class RequiredField implements Serializable {
         
         private static final long serialVersionUID = 1L;
@@ -152,6 +164,11 @@ public interface LoadPushDown {
         }
     }
 
+    /**
+     * List of fields that Pig knows to be required to execute a script.
+     */
+    @InterfaceAudience.Public
+    @InterfaceStability.Evolving
     public static class RequiredFieldList implements Serializable {
         
         private static final long serialVersionUID = 1L;
@@ -160,6 +177,7 @@ public interface LoadPushDown {
         private List<RequiredField> fields = new ArrayList<RequiredField>(); 
         
         /**
+         * Set the list of required fields.
          * @param fields
          */
         public RequiredFieldList(List<RequiredField> fields) {
@@ -167,6 +185,7 @@ public interface LoadPushDown {
         }
 
         /**
+         * Get all required fields as a list.
          * @return the required fields - this will be null if all fields are
          *         required
          */
@@ -194,12 +213,21 @@ public interface LoadPushDown {
             return result.toString();
         }
         
+        /**
+         * Add a field to the list of required fields.
+         * @param rf required field to add to the list.
+         */
         public void add(RequiredField rf)
         {
             fields.add(rf);
         }
     }
 
+    /**
+     * Indicates whether the loader will return the requested fields or all fields.
+     */
+    @InterfaceAudience.Public
+    @InterfaceStability.Evolving
     public static class RequiredFieldResponse {
         // the loader should pass true if it will return data containing
         // only the List of RequiredFields in that order. false if it
@@ -210,20 +238,24 @@ public interface LoadPushDown {
             this.requiredFieldRequestHonored = requiredFieldRequestHonored;
         }
 
-        // true if the loader will return data containing only the List of
-        // RequiredFields in that order. false if the loader will return all
-        // fields in the data
+        /**
+         * Indicates whether the loader will return only the requested fields or all fields.
+         * @return true if only requested fields will be returned, false if all fields will be
+         * returned.
+         */
         public boolean getRequiredFieldResponse() {
             return requiredFieldRequestHonored;
         }
 
-        // the loader should pass true if the it will return data containing
-        // only the List of RequiredFields in that order. false if the it
-        // will return all fields in the data
+        /**
+         * Set whether the loader will return only the requested fields or all fields.
+         * @param honored if true only requested fields will be returned, else all fields will be
+         * returned.
+         */
         public void setRequiredFieldResponse(boolean honored) {
             requiredFieldRequestHonored = honored;
         }
     }
 
     
-}
\ No newline at end of file
+}

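To show how the pieces above fit together, a hedged sketch of the loader side of projection push down.  The class name is hypothetical, and a real implementation would also extend LoadFunc and apply the saved column list when producing tuples in getNext():

    import java.util.Collections;
    import java.util.List;

    import org.apache.pig.LoadPushDown;
    import org.apache.pig.impl.logicalLayer.FrontendException;

    public class ProjectingLoaderSketch implements LoadPushDown {
        // Column indexes the script needs; null means all columns.
        private int[] requiredColumns;

        public List<OperatorSet> getFeatures() {
            return Collections.singletonList(OperatorSet.PROJECTION);
        }

        public RequiredFieldResponse pushProjection(
                RequiredFieldList requiredFieldList) throws FrontendException {
            List<RequiredField> fields = requiredFieldList.getFields();
            if (fields == null) {
                // All fields are required; there is nothing to push down.
                return new RequiredFieldResponse(false);
            }
            requiredColumns = new int[fields.size()];
            for (int i = 0; i < fields.size(); i++) {
                requiredColumns[i] = fields.get(i).getIndex();
            }
            // true: only the requested fields, in order, will be returned.
            return new RequiredFieldResponse(true);
        }
    }
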
Modified: hadoop/pig/trunk/src/org/apache/pig/Main.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/Main.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/Main.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/Main.java Wed Apr 14 23:44:16 2010
@@ -17,9 +17,24 @@
  */
 package org.apache.pig;
 
-import java.io.*;
-import java.util.*;
-import java.util.jar.*;
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.InputStreamReader;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.StringReader;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Properties;
+import java.util.jar.Attributes;
+import java.util.jar.JarFile;
+import java.util.jar.Manifest;
 import java.text.ParseException;
 
 import jline.ConsoleReader;
@@ -32,6 +47,9 @@ import org.apache.commons.logging.LogFac
 import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
 import org.apache.log4j.PropertyConfigurator;
+
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
 import org.apache.pig.ExecType;
 import org.apache.pig.impl.PigContext;
 import org.apache.pig.impl.io.FileLocalizer;
@@ -46,6 +64,11 @@ import org.apache.pig.impl.util.LogUtils
 import org.apache.pig.tools.timer.PerformanceTimerFactory;
 import org.apache.pig.tools.parameters.ParameterSubstitutionPreprocessor;
 
+/**
+ * Main class for the Pig engine.
+ */
+@InterfaceAudience.LimitedPrivate({"Oozie"})
+@InterfaceStability.Stable
 public class Main
 {
 
@@ -61,7 +84,7 @@ public class Main
                 
 /**
  * The Main-Class for the Pig Jar that will provide a shell and setup a classpath appropriate
- * for executing Jar files.
+ * for executing Jar files.  Warning: this method calls System.exit().
  * 
  * @param args
  *            -jar can be used to add additional jar files (colon separated). - will start a
@@ -542,6 +565,9 @@ private static String getVersionString()
       } 
 }
 
+/**
+ * Print usage string.
+ */
 public static void usage()
 {
 	System.out.println("\n"+getVersionString()+"\n");

Modified: hadoop/pig/trunk/src/org/apache/pig/OrderedLoadFunc.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/OrderedLoadFunc.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/OrderedLoadFunc.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/OrderedLoadFunc.java Wed Apr 14 23:44:16 2010
@@ -23,21 +23,31 @@ import java.io.IOException;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapreduce.InputSplit;
 
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
 
 /**
  * Implementing this interface indicates to Pig that a given loader
- * can be used for MergeJoin. The position as represented by the
+ * can be used for MergeJoin. It does not mean the data itself is ordered,
+ * but rather that the splits returned by the underlying InputFormat
+ * can be ordered to match the order of the data they are loading.  For 
+ * example, file splits have a natural order (that of the file they are
+ * from) while splits from an RDBMS do not (since tables have no inherent order).
+ * The position as represented by the
  * WritableComparable object is stored in the index created by
- *  MergeJoin sampling MR job to get an ordered sequence of splits.
- * This is necessary when the sort key spans multiple splits.
+ * a MergeJoin sampling MapReduce job to get an ordered sequence of splits.
+ * It is necessary to read splits in order during a merge join to ensure 
+ * data is being read according to the sort order.
  * @since Pig 0.7
  */
+@InterfaceAudience.Public
+@InterfaceStability.Evolving // Since we haven't done outer join for merge join yet
 public interface OrderedLoadFunc {
 
     /**
      * The WritableComparable object returned will be used to compare
      * the position of different splits in an ordered stream
-     * @param split
+     * @param split An InputSplit from the InputFormat underlying this loader.
      * @return WritableComparable representing the position of the split in input
      * @throws IOException
      */

Modified: hadoop/pig/trunk/src/org/apache/pig/PigServer.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/PigServer.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/PigServer.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/PigServer.java Wed Apr 14 23:44:16 2010
@@ -37,19 +37,26 @@ import java.util.Set;
 import java.util.Stack;
 
 import org.apache.hadoop.mapreduce.Job;
+
 import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
-import org.apache.pig.impl.plan.PlanException;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+
 import org.apache.pig.backend.datastorage.ContainerDescriptor;
 import org.apache.pig.backend.datastorage.DataStorage;
 import org.apache.pig.backend.datastorage.ElementDescriptor;
 import org.apache.pig.backend.executionengine.ExecException;
 import org.apache.pig.backend.executionengine.ExecJob;
 import org.apache.pig.backend.executionengine.ExecJob.JOB_STATUS;
+import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
+import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
+import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStore;
 import org.apache.pig.builtin.BinStorage;
 import org.apache.pig.builtin.PigStorage;
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
 import org.apache.pig.data.DataBag;
 import org.apache.pig.data.Tuple;
 import org.apache.pig.experimental.logical.LogicalPlanMigrationVistor;
@@ -75,9 +82,7 @@ import org.apache.pig.impl.logicalLayer.
 import org.apache.pig.impl.logicalLayer.parser.QueryParser;
 import org.apache.pig.impl.logicalLayer.schema.Schema;
 import org.apache.pig.impl.logicalLayer.validators.LogicalPlanValidationExecutor;
-import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
-import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
-import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStore;
+import org.apache.pig.impl.plan.PlanException;
 import org.apache.pig.impl.plan.CompilationMessageCollector;
 import org.apache.pig.impl.plan.DependencyOrderWalker;
 import org.apache.pig.impl.plan.DepthFirstWalker;
@@ -97,17 +102,24 @@ import org.apache.pig.tools.grunt.GruntP
 
 /**
  * 
- * This class is the program's connection to Pig. Typically a program will create a PigServer
+ * A class for Java programs to connect to Pig. Typically a program will create a PigServer
  * instance. The programmer then registers queries using registerQuery() and
  * retrieves results using openIterator() or store(). After doing so, the
  * shutdown() method should be called to free any resources used by the current
  * PigServer instance. Not doing so could result in a memory leak.
  * 
  */
+@InterfaceAudience.Public
+@InterfaceStability.Stable
 public class PigServer {
     
     private final Log log = LogFactory.getLog(getClass());
     
+    /**
+     * Given a string, determine the exec type.
+     * @param str accepted values are 'local', 'mapreduce', and 'mapred'
+     * @return exectype as ExecType
+     */
     public static ExecType parseExecType(String str) throws IOException {
         String normStr = str.toLowerCase();
         
@@ -159,10 +171,23 @@ public class PigServer {
         return ""+(++scopeCounter);
     }
     
+    /**
+     * @param execTypeString can be 'mapreduce' or 'local'.  Local mode will 
+     * use Hadoop's local job runner to execute the job on the local machine.
+     * Mapreduce mode will connect to a cluster to execute the job.
+     * @throws ExecException
+     * @throws IOException
+     */
     public PigServer(String execTypeString) throws ExecException, IOException {
         this(parseExecType(execTypeString));
     }
     
+    /**
+     * @param execType execution type to start the engine.  Local mode will 
+     * use Hadoop's local job runner to execute the job on the local machine.
+     * Mapreduce mode will connect to a cluster to execute the job.
+     * @throws ExecException
+     */
     public PigServer(ExecType execType) throws ExecException {
         this(execType, PropertiesUtil.loadPropertiesFromFile());
     }
@@ -215,16 +240,26 @@ public class PigServer {
         return pigContext;
     }
     
+    /**
+     * Set the logging level to DEBUG.
+     */
     public void debugOn() {
         Logger.getLogger("org.apache.pig").setLevel(Level.DEBUG);
         pigContext.getLog4jProperties().setProperty("log4j.logger.org.apache.pig", Level.DEBUG.toString());
     }
     
+    /**
+     * Set the logging level to the default.
+     */
     public void debugOff() {
         Logger.getLogger("org.apache.pig").setLevel(pigContext.getDefaultLogLevel());
         pigContext.getLog4jProperties().setProperty("log4j.logger.org.apache.pig", pigContext.getDefaultLogLevel().toString());
     }
     
+    /**
+     * Set the default parallelism for this job.
+     * @param p default number of reducers to use for this job.
+     */
     public void setDefaultParallel(int p) {
         pigContext.defaultParallel = p;
     }
@@ -272,6 +307,7 @@ public class PigServer {
     /**
      * Submits a batch of Pig commands for execution. 
      * 
+     * @return list of jobs being executed
      * @throws FrontendException
      * @throws ExecException
      */
@@ -294,7 +330,6 @@ public class PigServer {
      * Discards a batch of Pig commands.
      * 
      * @throws FrontendException
-     * @throws ExecException
      */
     public void discardBatch() throws FrontendException {
         if (currDAG == null || !isBatchOn()) {
@@ -324,8 +359,8 @@ public class PigServer {
      * @param function - the new function alias to define.
      * @param functionSpec - the name of the function and any arguments.
      * It should have the form: classname('arg1', 'arg2', ...)
+     * @deprecated Use {@link #registerFunction(String, FuncSpec)}
      */
-    @Deprecated
     public void registerFunction(String function, String functionSpec) {
         registerFunction(function, new FuncSpec(functionSpec));
     }
@@ -420,7 +455,7 @@ public class PigServer {
      * @param query
      *            a Pig Latin expression to be evaluated.
      * @param startLine
-     *            line number of the query within the whold script
+     *            line number of the query within the whole script
      * @throws IOException
      */    
     public void registerQuery(String query, int startLine) throws IOException {            
@@ -439,10 +474,24 @@ public class PigServer {
         return graph.getPlan(alias);
     }
     
+    /**
+     * Register a query with the Pig runtime. The query is parsed and registered, but it is not
+     * executed until it is needed.  Equivalent to calling {@link #registerQuery(String, int)}
+     * with startLine set to 1.
+     * 
+     * @param query
+     *            a Pig Latin expression to be evaluated.
+     * @throws IOException
+     */    
     public void registerQuery(String query) throws IOException {
         registerQuery(query, 1);
     }
     
+    /**
+     * Register a query with the Pig runtime.  The query will be read from the indicated file.
+     * @param fileName file to read query from.
+     * @throws IOException
+     */
     public void registerScript(String fileName) throws IOException {
         try {
             GruntParser grunt = new GruntParser(new FileReader(new File(fileName)));
@@ -460,10 +509,22 @@ public class PigServer {
         }
     }
 
+    /**
+     * Intended to be used by unit tests only.
+     * Print a list of all aliases in the current Pig Latin script.  Output is written to
+     * System.out.
+     * @throws FrontendException
+     */
     public void printAliases () throws FrontendException {
         System.out.println("aliases: " + currDAG.getAliasOp().keySet());
     }
 
+    /**
+     * Write the schema for an alias to System.out.
+     * @param alias Alias whose schema will be written out
+     * @return Schema of alias dumped
+     * @throws IOException
+     */
     public Schema dumpSchema(String alias) throws IOException{
         try {
             LogicalPlan lp = getPlanFromAlias(alias, "describe");
@@ -479,17 +540,42 @@ public class PigServer {
         }
     }
 
+    /**
+     * Set the name of the job.  This name will get translated to mapred.job.name.
+     * @param name of job
+     */
     public void setJobName(String name){
         currDAG.setJobName(name);
     }
     
+    /**
+     * Set Hadoop job priority.  This value will get translated to mapred.job.priority.
+     * @param priority valid values are found in {@link org.apache.hadoop.mapred.JobPriority}
+     */
     public void setJobPriority(String priority){
         currDAG.setJobPriority(priority);
     }
 
     /**
-     * Forces execution of query (and all queries from which it reads), in order to materialize
-     * result
+     * Executes a Pig Latin script up to and including the indicated alias.  That is, if a user does:
+     * <pre>
+     * PigServer server = new PigServer();
+     * server.registerQuery("A = load 'foo';");
+     * server.registerQuery("B = filter A by $0 &gt; 0;");
+     * server.registerQuery("C = order B by $1;");
+     * </pre>
+     * Then
+     * <pre>
+     * server.openIterator("B");
+     * </pre>
+     * filtered but unsorted data will be returned.  If instead a user does
+     * <pre>
+     * server.openIterator("C");
+     * </pre>
+     * filtered and sorted data will be returned.
+     * @param id Alias to open iterator for
+     * @return iterator of tuples returned from the script
+     * @throws IOException
      */
     public Iterator<Tuple> openIterator(String id) throws IOException {
         try {
@@ -525,19 +611,61 @@ public class PigServer {
     }
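
A fragment showing how a caller might drain the iterator (assumes the queries registered in the javadoc example above and the usual org.apache.pig.data.Tuple import):

    Iterator<Tuple> it = server.openIterator("B");
    while (it.hasNext()) {
        Tuple t = it.next();
        System.out.println(t);  // Tuple.toString() prints the tuple's fields
    }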
     
     /**
-     * Store an alias into a file
+     * Executes a Pig Latin script up to and including the indicated alias and stores the resulting
+     * records into a file.  That is, if a user does:
+     * <pre>
+     * PigServer server = new PigServer();
+     * server.registerQuery("A = load 'foo';");
+     * server.registerQuery("B = filter A by $0 &gt; 0;");
+     * server.registerQuery("C = order B by $1;");
+     * </pre>
+     * Then
+     * <pre>
+     * server.store("B", "bar");
+     * </pre>
+     * filtered but unsorted data will be stored to the file <tt>bar</tt>.  If instead a user does
+     * <pre>
+     * server.store("C", "bar");
+     * </pre>
+     * filtered and sorted data will be stored to the file <tt>bar</tt>.
+     * Equivalent to calling {@link #store(String, String, String)} with
+     * <tt>org.apache.pig.builtin.PigStorage</tt> as the store function. 
      * @param id The alias to store
     * @param filename The file to store to
+     * @return {@link ExecJob} containing information about this job
      * @throws IOException
      */
-
     public ExecJob store(String id, String filename) throws IOException {
         return store(id, filename, PigStorage.class.getName() + "()");   // PigStorage is the default store function
     }
         
     /**
-     *  forces execution of query (and all queries from which it reads), in order to store result in file
-     */        
+     * Executes a Pig Latin script up to and including the indicated alias and stores the resulting
+     * records into a file.  That is, if a user does:
+     * <pre>
+     * PigServer server = new PigServer();
+     * server.registerQuery("A = load 'foo';");
+     * server.registerQuery("B = filter A by $0 &gt; 0;");
+     * server.registerQuery("C = order B by $1;");
+     * </pre>
+     * Then
+     * <pre>
+     * server.store("B", "bar", "mystorefunc");
+     * </pre>
+     * filtered but unsorted data will be stored to the file <tt>bar</tt> using
+     * <tt>mystorefunc</tt>.  If instead a user does
+     * <pre>
+     * server.store("C", "bar", "mystorefunc");
+     * </pre>
+     * filtered and sorted data will be stored to the file <tt>bar</tt> using
+     * <tt>mystorefunc</tt>.
+     * <p>
+     * @param id The alias to store
+     * @param filename The file to store to
+     * @param func store function to use
+     * @return {@link ExecJob} containing information about this job
+     * @throws IOException
+     */
     public ExecJob store(
             String id,
             String filename,
@@ -597,7 +725,9 @@ public class PigServer {
     /**
      * Provide information on how a pig query will be executed.
      * @param alias Name of alias to explain.
-     * @param format Format in which the explain should be printed
+     * @param format Format in which the explain should be printed.  If text, then the plan will
+     * be printed in plain text.  Otherwise, the execution plan will be printed in
+     * <a href="http://en.wikipedia.org/wiki/DOT_language">DOT</a> format. 
      * @param verbose Controls the amount of information printed
      * @param markAsExecute When set will treat the explain like a
     * call to execute in the respect that all the pending stores are
@@ -702,28 +832,59 @@ public class PigServer {
         return length * replication;
     }
     
+    /**
+     * Test whether a file exists.
+     * @param filename to test
+     * @return true if file exists, false otherwise
+     * @throws IOException
+     */
     public boolean existsFile(String filename) throws IOException {
         ElementDescriptor elem = pigContext.getDfs().asElement(filename);
         return elem.exists();
     }
     
+    /**
+     * Delete a file.
+     * @param filename to delete
+     * @return true
+     * @throws IOException
+     */
     public boolean deleteFile(String filename) throws IOException {
         ElementDescriptor elem = pigContext.getDfs().asElement(filename);
         elem.delete();
         return true;
     }
     
+    /**
+     * Rename a file.
+     * @param source file to rename
+     * @param target new file name
+     * @return true
+     * @throws IOException
+     */
     public boolean renameFile(String source, String target) throws IOException {
         pigContext.rename(source, target);
         return true;
     }
     
+    /**
+     * Make a directory.
+     * @param dirs directory to make
+     * @return true
+     * @throws IOException
+     */
     public boolean mkdirs(String dirs) throws IOException {
         ContainerDescriptor container = pigContext.getDfs().asContainer(dirs);
         container.create();
         return true;
     }
     
+    /** 
+     * List the contents of a directory.
+     * @param dir name of directory to list
+     * @return array of strings, one for each file name
+     * @throws IOException
+     */
     public String[] listPaths(String dir) throws IOException {
         Collection<String> allPaths = new ArrayList<String>();
         ContainerDescriptor container = pigContext.getDfs().asContainer(dir);
@@ -738,12 +899,19 @@ public class PigServer {
         return allPaths.toArray(type);
     }
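
Taken together these helpers allow simple file system manipulation from a driver program; a sketch with hypothetical paths:

    server.mkdirs("/data/in");
    if (server.existsFile("/data/in/stale")) {
        server.deleteFile("/data/in/stale");
    }
    server.renameFile("/data/in/tmp", "/data/in/final");
    for (String path : server.listPaths("/data/in")) {
        System.out.println(path);
    }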
     
+    /**
+     * Not currently implemented; always returns 0.
+     */
     public long totalHadoopTimeSpent() {
 //      TODO FIX Need to uncomment this with the right logic
 //        return MapReduceLauncher.totalHadoopTimeSpent;
         return 0L;
     }
   
+    /**
+     * Return a map containing the logical plan associated with each alias.
+     * @return map of alias names to logical plans
+     */
     public Map<String, LogicalPlan> getAliases() {
         Map<String, LogicalPlan> aliasPlans = new HashMap<String, LogicalPlan>();
         for(LogicalOperator op:  currDAG.getAliases().keySet()) {
@@ -771,6 +939,10 @@ public class PigServer {
         FileLocalizer.deleteTempFiles();
     }
 
+    /**
+     * Get the set of all current aliases.
+     * @return set of alias names
+     */
     public Set<String> getAliasKeySet() {
         return currDAG.getAliasOp().keySet();
     }

Modified: hadoop/pig/trunk/src/org/apache/pig/PigToStream.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/PigToStream.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/PigToStream.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/PigToStream.java Wed Apr 14 23:44:16 2010
@@ -19,6 +19,8 @@ package org.apache.pig;
 
 import java.io.IOException;
 
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
 import org.apache.pig.data.Tuple;
 
 /**
@@ -29,16 +31,21 @@ import org.apache.pig.data.Tuple;
  * a common protocol for data exchange between Pig runtime and streaming 
  * executables.
  * 
- * Typically, user implements this interface for a particular type of 
+ * Typically, a user implements this interface for a particular type of 
  * stream command and specifies the implementation class in the Pig DEFINE
  * statement. 
- * 
+ * @since Pig 0.7 
  */
+@InterfaceAudience.Public
+@InterfaceStability.Stable
 public interface PigToStream {
     
     /**
      * Given a tuple, produce an array of bytes to be passed to the streaming
      * executable.
+     * @param t Tuple to serialize
+     * @return Serialized form of the tuple
+     * @throws IOException
      */
     public byte[] serialize(Tuple t) throws IOException;
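
A minimal sketch of an implementation that writes tab-delimited text (the class name and delimiter are illustrative, not part of this patch):

    import java.io.IOException;

    import org.apache.pig.PigToStream;
    import org.apache.pig.data.Tuple;

    public class TabDelimitedPigToStream implements PigToStream {

        @Override
        public byte[] serialize(Tuple t) throws IOException {
            // Join the fields with tabs and terminate the record with a newline.
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < t.size(); i++) {
                if (i > 0) sb.append('\t');
                Object field = t.get(i);
                sb.append(field == null ? "" : field.toString());
            }
            sb.append('\n');
            return sb.toString().getBytes();
        }
    }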
 

Modified: hadoop/pig/trunk/src/org/apache/pig/ResourceSchema.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/ResourceSchema.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/ResourceSchema.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/ResourceSchema.java Wed Apr 14 23:44:16 2010
@@ -25,11 +25,21 @@ import java.util.List;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
 import org.apache.pig.data.DataType;
 import org.apache.pig.impl.logicalLayer.FrontendException;
 import org.apache.pig.impl.logicalLayer.schema.Schema;
 import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
 
+/**
+ * A representation of a schema used to communicate with load and store functions.  This is
+ * separate from {@link Schema}, which is an internal Pig representation of a schema.
+ * @since Pig 0.7
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Stable
 public class ResourceSchema implements Serializable {
 
     private static final long serialVersionUID = 1L;
@@ -65,10 +75,17 @@ public class ResourceSchema implements S
         // nested tuples and bags will have their own schema
         private ResourceSchema schema; 
 
+        /**
+         * Construct an empty field schema.
+         */
         public ResourceFieldSchema() {
             
         }
         
+        /**
+         * Construct using a {@link org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema} as the template.
+         * @param fieldSchema fieldSchema to copy from
+         */
         public ResourceFieldSchema(FieldSchema fieldSchema) {
             type = fieldSchema.type;
             name = fieldSchema.alias;
@@ -90,36 +107,73 @@ public class ResourceSchema implements S
             }
         }
         
+        /**
+         * Get the name of this field.
+         * @return name
+         */
         public String getName() {
             return name;
         }
+
+        /**
+         * Set the name of this field.
+         * @param name new name
+         * @return this
+         */
         public ResourceFieldSchema setName(String name) {
             this.name = name;
             return this;
         }
         
+        /**
+         * Get the type of this field.
+         * @return type, as one of the static final byte constants in {@link DataType}
+         */
         public byte getType() {
             return type;
         }
     
+        /**
+         * Set the type of this field
+         * @param type new type
+         * @return this
+         */
         public ResourceFieldSchema setType(byte type) {
             this.type = type;
             return this;
         }
         
+        /**
+         * Get a free form text description of this field.
+         * @return description
+         */
         public String getDescription() {
             return description;
         }
      
+        /**
+         * Set the description
+         * @param description new description
+         * @return this
+         */
         public ResourceFieldSchema setDescription(String description) {
             this.description = description;
             return this;
         }
 
+        /**
+         * Get the schema for this field.  Only fields of type tuple or bag should have a schema.
+         * @return schema
+         */
         public ResourceSchema getSchema() {
             return schema;
         }
 
+        /**
+         * Set the schema for this field.  Only fields of type tuple or bag should have a schema.
+         * @param schema new schema
+         * @return this
+         */
         public ResourceFieldSchema setSchema(ResourceSchema schema) throws 
         IOException {
             validateSchema(schema);
@@ -127,9 +181,6 @@ public class ResourceSchema implements S
             return this;
         }
                 
-        /**
-         * @param schema
-         */
         private void validateSchema(ResourceSchema schema) throws IOException {
             if(type == DataType.BAG && schema != null) {
                 ResourceFieldSchema[] subFields = schema.getFields();
@@ -173,10 +224,17 @@ public class ResourceSchema implements S
     }
 
 
+    /**
+     * Construct an empty ResourceSchema.
+     */
     public ResourceSchema() {
         
     }
     
+    /**
+     * Construct a ResourceSchema from a {@link Schema}
+     * @param pigSchema Schema to use
+     */
     public ResourceSchema(Schema pigSchema) {
         List<FieldSchema> pigSchemaFields = pigSchema.getFields();
         fields = new ResourceFieldSchema[pigSchemaFields.size()];
@@ -185,6 +243,13 @@ public class ResourceSchema implements S
         }        
     }
     
+    /**
+     * Only for use by Pig internal code.
+     * Construct a ResourceSchema from a {@link Schema}
+     * @param pigSchema Schema to use
+     * @param sortInfo information on how data is sorted
+     */
+    @InterfaceAudience.Private
     public ResourceSchema(Schema pigSchema, SortInfo sortInfo) {
         this(pigSchema);
         if (sortInfo!=null && sortInfo.getSortColInfoList().size()!=0) {
@@ -206,6 +271,10 @@ public class ResourceSchema implements S
         }
     }
     
+    /**
+     * Get the version of this schema.
+     * @return version
+     */
     public int getVersion() {
         return version;
     }
@@ -215,10 +284,18 @@ public class ResourceSchema implements S
         return this;
     }
     
+    /**
+     * Get field schema for each field
+     * @return array of field schemas.
+     */
     public ResourceFieldSchema[] getFields() {
         return fields;
     }
     
+    /**
+     * Get all field names.
+     * @return array of field names
+     */
     public String[] fieldNames() {
         String[] names = new String[fields.length];
         for (int i=0; i<fields.length; i++) {
@@ -227,32 +304,73 @@ public class ResourceSchema implements S
         return names;
     }
     
+    /**
+     * Set all the fields.  If the fields passed in are null they will be silently
+     * ignored.
+     * @param fields to use as fields in this schema
+     * @return this
+     */
     public ResourceSchema setFields(ResourceFieldSchema[] fields) {
         if (fields != null)
             this.fields = Arrays.copyOf(fields, fields.length);
         return this;
     }
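
As a sketch, a load function could assemble a two-column schema with the chained setters (field names and types are illustrative; assumes imports of ResourceSchema, ResourceSchema.ResourceFieldSchema and org.apache.pig.data.DataType):

    ResourceFieldSchema name = new ResourceFieldSchema()
        .setName("name").setType(DataType.CHARARRAY);
    ResourceFieldSchema age = new ResourceFieldSchema()
        .setName("age").setType(DataType.INTEGER);

    ResourceSchema schema = new ResourceSchema()
        .setFields(new ResourceFieldSchema[] { name, age });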
     
+    /**
+     * Get the sort keys for this data.
+     * @return array of ints.  Each integer in the array represents the field number.  So if the
+     * schema of the data is (a, b, c, d) and the data is sorted on c, b, the returned sort keys
+     * will be [2, 1].  Field numbers are zero based.  If the data is not sorted a zero length
+     * array will be returned.
+     */
     public int[] getSortKeys() {
         return sortKeys;
     }
     
+    /**
+     * Set the sort keys for this data.  If the sort keys passed in are null they will be
+     * silently ignored.
+     * @param sortKeys  Each integer in the array represents the field number.  So if the
+     * schema of the data is (a, b, c, d) and the data is sorted on c, b, the sort keys
+     * should be [2, 1].  Field numbers are zero based.
+     * @return this
+     */
     public  ResourceSchema setSortKeys(int[] sortKeys) {
         if (sortKeys != null)
             this.sortKeys = Arrays.copyOf(sortKeys, sortKeys.length);
         return this;
     }
     
+    /**
+     * Get order for sort keys.
+     * @return array of Order.  This array will be the same length as the int[] array returned by
+     * {@link #getSortKeys}.
+     */
     public Order[] getSortKeyOrders() {
         return sortKeyOrders;
     }
     
+    /**
+     * Set the order for each sort key.  If the orders passed in are null they will be
+     * silently ignored.
+     * @param sortKeyOrders array of Order.  Should be the same length as int[] passed to 
+     * {@link #setSortKeys}.
+     * @return this
+     */
     public ResourceSchema setSortKeyOrders(Order[] sortKeyOrders) {
         if (sortKeyOrders != null) 
             this.sortKeyOrders = Arrays.copyOf(sortKeyOrders, sortKeyOrders.length);
         return this;
     } 
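
Continuing the javadoc's example, for a schema (a, b, c, d) sorted on c descending and then b ascending, the sort metadata would be populated as below (assuming the Order enum provides ASCENDING and DESCENDING values):

    schema.setSortKeys(new int[] { 2, 1 });  // c, then b; field numbers are zero based
    schema.setSortKeyOrders(new Order[] { Order.DESCENDING, Order.ASCENDING });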
             
+    /**
+     * Test whether two ResourceSchemas are the same.  Two schemas are said to be the same if they
+     * are both null, or if they have the same number of fields and for each field the name and
+     * type are the same.  For fields that may have schemas (i.e. tuples) both schemas must be
+     * equal.  Field descriptions are not used in testing equality.
+     * @return true if equal according to the above definition, otherwise false.
+     */
     public static boolean equals(ResourceSchema rs1, ResourceSchema rs2) {
         if (rs1 == null) {
             return rs2 == null ? true : false;

Modified: hadoop/pig/trunk/src/org/apache/pig/ResourceStatistics.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/ResourceStatistics.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/ResourceStatistics.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/ResourceStatistics.java Wed Apr 14 23:44:16 2010
@@ -20,6 +20,17 @@ package org.apache.pig;
 import java.io.Serializable;
 import java.util.Arrays;
 
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
+
+/**
+ * A class that represents statistics about data to be loaded or stored.  It is marked unstable
+ * because Pig does very little statistics collection at this point.  If and when that
+ * functionality is added it is expected that this interface will change.
+ * @since Pig 0.7
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Unstable
 public class ResourceStatistics implements Cloneable {
 
     /* Getters intentionally return mutable arrays instead of copies,
@@ -38,6 +49,9 @@ public class ResourceStatistics implemen
     public Long avgRecordSize;
     public ResourceFieldStatistics[] fields = new ResourceFieldStatistics[0];
 
+    /**
+     * Statistics for a given field in the data.
+     */
     public static class ResourceFieldStatistics implements Serializable {
 
         public static final long serialVersionUID = 1L;
@@ -46,28 +60,34 @@ public class ResourceStatistics implemen
 
         public Long numDistinctValues;  // number of distinct values represented in this field
 
-        // We need some way to represent a histogram of values in the field,
-        // as those will be useful.  However, we can't count on being
-        // able to hold such histograms in memory.  Have to figure out
-        // how they can be kept on disk and represented here.
-
-        // for now.. don't create so many buckets you can't hold them in memory
-        
-        // an ordered array of the most common values, 
-        // in descending order of frequency
+        /**
+         * We need some way to represent a histogram of values in the field,
+         * as those will be useful.  However, we can't count on being
+         * able to hold such histograms in memory.  Have to figure out
+         * how they can be kept on disk and represented here.
+         *
+         * For now, don't create so many buckets that you can't hold them in memory.
+         *
+         * an ordered array of the most common values, 
+         * in descending order of frequency
+         */
         public Object[] mostCommonValues = new Object[0];
         
-        // an array that matches the mostCommonValues array, and lists
-        // the frequencies of those values as a fraction (0 through 1) of
-        // the total number of records
+        /**
+         * an array that matches the mostCommonValues array, and lists
+         * the frequencies of those values as a fraction (0 through 1) of
+         * the total number of records
+         */
         public float[] mostCommonValuesFreq = new float[0];
         
-        // an ordered array of values, from min val to max val
-        // such that the number of records with values 
-        // between valueHistogram[i] and and valueHistogram[i+1] is
-        // roughly equal for all values of i.
-        // NOTE: if mostCommonValues is non-empty, the values in that array
-        // should not be included in the histogram. Adjust accordingly.
+        /**
+         * an ordered array of values, from min val to max val
+         * such that the number of records with values 
+         * between valueHistogram[i] and valueHistogram[i+1] is
+         * roughly equal for all values of i.
+         * NOTE: if mostCommonValues is non-empty, the values in that array
+         * should not be included in the histogram. Adjust accordingly.
+         */
         public Object[] valueHistogram = new Object[0];
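
Because the members are public, a loader that has statistics available populates them directly; a sketch with made-up numbers:

    ResourceStatistics stats = new ResourceStatistics();
    stats.avgRecordSize = 128L;  // bytes per record

    ResourceFieldStatistics f = new ResourceFieldStatistics();
    f.numDistinctValues = 1000L;
    f.mostCommonValues = new Object[] { "foo", "bar" };
    f.mostCommonValuesFreq = new float[] { 0.10f, 0.05f };
    stats.fields = new ResourceFieldStatistics[] { f };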
 
         
@@ -254,4 +274,4 @@ public class ResourceStatistics implemen
     public Object clone() throws CloneNotSupportedException {
         return super.clone();
       }
-}
\ No newline at end of file
+}

Modified: hadoop/pig/trunk/src/org/apache/pig/SortColInfo.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/SortColInfo.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/SortColInfo.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/SortColInfo.java Wed Apr 14 23:44:16 2010
@@ -19,16 +19,17 @@ package org.apache.pig;
 
 import java.io.Serializable;
 
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
 import org.apache.pig.impl.util.Utils;
 
 /**
  * A class representing information about a sort column in {@link SortInfo} 
  */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
 public class SortColInfo implements Serializable {
     
-    /**
-     * 
-     */
     private static final long serialVersionUID = 1L;
 
     // name of sort column

Modified: hadoop/pig/trunk/src/org/apache/pig/SortInfo.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/SortInfo.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/SortInfo.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/SortInfo.java Wed Apr 14 23:44:16 2010
@@ -21,12 +21,17 @@ package org.apache.pig;
 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.List;
+
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
 import org.apache.pig.impl.util.Utils;
 
 /**
  * Class to communicate sort column information based on 
  * order by statment's sort columns and schema
  */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
 public class SortInfo implements Serializable {
     
     /**
@@ -39,14 +44,14 @@ public class SortInfo implements Seriali
     List<SortColInfo> sortColInfoList;
     
     /**
-     * @param sortColInfoList
+     * @param sortColInfoList list of SortColInfo, one for each sort column
      */
     public SortInfo(List<SortColInfo> sortColInfoList){
         this.sortColInfoList = sortColInfoList;
     }
 
     /**
-     * @return the sortColInfoList
+     * @return the list of SortColInfo for this data
      */
     public List<SortColInfo> getSortColInfoList() {
         return new ArrayList<SortColInfo>(sortColInfoList);
@@ -66,7 +71,8 @@ public class SortInfo implements Seriali
     }   
 
     /**
-     * @return the isGloballySorted
+     * @return true if the data is globally sorted, false if it is sorted only within
+     * each part file.
      */
     public boolean isGloballySorted() {
         return isGloballySorted;

Modified: hadoop/pig/trunk/src/org/apache/pig/StandAloneParser.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/StandAloneParser.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/StandAloneParser.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/StandAloneParser.java Wed Apr 14 23:44:16 2010
@@ -32,6 +32,7 @@ import org.apache.pig.impl.logicalLayer.
 import org.apache.pig.impl.logicalLayer.LogicalPlan;
 import org.apache.pig.impl.logicalLayer.LogicalOperator;
 
+@Deprecated
 public class StandAloneParser {
     
     private static final Log log = LogFactory.getLog(StandAloneParser.class);

Modified: hadoop/pig/trunk/src/org/apache/pig/StoreFunc.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/StoreFunc.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/StoreFunc.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/StoreFunc.java Wed Apr 14 23:44:16 2010
@@ -24,17 +24,19 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.OutputFormat;
 import org.apache.hadoop.mapreduce.RecordWriter;
+
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
 import org.apache.pig.data.Tuple;
 import org.apache.pig.impl.util.UDFContext;
 
 
 /**
-* This abstract class is used to implement functions to write records
-* from a dataset.
-* 
-*
+* StoreFuncs take records from Pig's processing and store them into a data store.  Most frequently
+* this is an HDFS file, but it could also be an HBase instance, RDBMS, etc.
 */
-
+@InterfaceAudience.Public
+@InterfaceStability.Stable
 public abstract class StoreFunc implements StoreFuncInterface {
 
     /**
@@ -51,7 +53,6 @@ public abstract class StoreFunc implemen
      * in the script, this would be the home directory - 
      * <pre>/user/<username> </pre>
      * @return the absolute location based on the arguments passed
-     * @throws IOException 
      * @throws IOException if the conversion is not possible
      */
     @Override
@@ -96,7 +97,8 @@ public abstract class StoreFunc implemen
      * check that a given schema is acceptable to it.  For example, it
      * can check that the correct partition keys are included;
      * a storage function to be written directly to an OutputFormat can
-     * make sure the schema will translate in a well defined way.  
+     * make sure the schema will translate in a well defined way.  Default implementation
+     * is a no-op.
      * @param s to be checked
      * @throws IOException if this schema is not acceptable.  It should include
      * a detailed error message indicating what is wrong with the schema.
@@ -108,15 +110,14 @@ public abstract class StoreFunc implemen
 
     /**
      * Initialize StoreFunc to write data.  This will be called during
-     * execution before the call to putNext.
+     * execution on the backend before the call to putNext.
      * @param writer RecordWriter to use.
      * @throws IOException if an exception occurs during initialization
      */
     public abstract void prepareToWrite(RecordWriter writer) throws IOException;
 
     /**
-     * Write a tuple the output stream to which this instance was
-     * previously bound.
+     * Write a tuple to the data store.
      * 
      * @param t the tuple to store.
      * @throws IOException if an exception occurs during the write
@@ -128,7 +129,10 @@ public abstract class StoreFunc implemen
      * pass a unique signature to the {@link StoreFunc} which it can use to store
      * information in the {@link UDFContext} which it needs to store between
      * various method invocations in the front end and back end. This method 
-     * will be called before other methods in {@link StoreFunc}.
+     * will be called before other methods in {@link StoreFunc}.  This is necessary
+     * because in a Pig Latin script with multiple stores, the different
+     * instances of store functions need to be able to find their (and only their)
+     * data in the UDFContext object.  The default implementation is a no-op.
      * @param signature a unique signature to identify this StoreFunc
      */
     @Override
@@ -140,7 +144,7 @@ public abstract class StoreFunc implemen
      * This method will be called by Pig if the job which contains this store
      * fails. Implementations can clean up output locations in this method to
      * ensure that no incorrect/incomplete results are left in the output location.
-     * The implementation in {@link StoreFunc} deletes the output location if it
+     * The default implementation deletes the output location if it
      * is a {@link FileSystem} location.
      * @param location Location returned by 
      * {@link StoreFunc#relToAbsPathForStoreLocation(String, Path)}
@@ -155,9 +159,10 @@ public abstract class StoreFunc implemen
     }
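
Putting the contract together, a minimal concrete StoreFunc might look like the sketch below (TextOutputFormat and Tuple.toDelimitedString are assumed to fit the use case; error handling is abbreviated):

    import java.io.IOException;

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.io.WritableComparable;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.OutputFormat;
    import org.apache.hadoop.mapreduce.RecordWriter;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
    import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
    import org.apache.pig.StoreFunc;
    import org.apache.pig.data.Tuple;

    public class TabStorage extends StoreFunc {

        private RecordWriter<WritableComparable, Text> writer;

        @Override
        public OutputFormat getOutputFormat() throws IOException {
            return new TextOutputFormat<WritableComparable, Text>();
        }

        @Override
        public void setStoreLocation(String location, Job job) throws IOException {
            FileOutputFormat.setOutputPath(job, new Path(location));
        }

        @Override
        @SuppressWarnings("unchecked")
        public void prepareToWrite(RecordWriter writer) throws IOException {
            this.writer = writer;  // called on the backend before any putNext
        }

        @Override
        public void putNext(Tuple t) throws IOException {
            try {
                // One tab-delimited line per tuple.
                writer.write(null, new Text(t.toDelimitedString("\t")));
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
        }
    }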
     
     /**
-     * Implementation for {@link #cleanupOnFailure(String, Job)}
-     * @param location
-     * @param job
+     * Implementation for {@link #cleanupOnFailure(String, Job)}.  This removes a file
+     * from HDFS.
+     * @param location file name (or URI) of file to remove
+     * @param job Hadoop job, used to access the appropriate file system.
      * @throws IOException
      */
     public static void cleanupOnFailureImpl(String location, Job job)