You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by ga...@apache.org on 2010/04/15 01:44:17 UTC
svn commit: r934242 [1/2] - in /hadoop/pig/trunk: ./ src/
src/org/apache/pig/ src/org/apache/pig/classification/
Author: gates
Date: Wed Apr 14 23:44:16 2010
New Revision: 934242
URL: http://svn.apache.org/viewvc?rev=934242&view=rev
Log:
PIG-1370: Marking Pig interface for org.apache.pig package.
Added:
hadoop/pig/trunk/src/org/apache/pig/classification/
hadoop/pig/trunk/src/org/apache/pig/classification/InterfaceAudience.java
hadoop/pig/trunk/src/org/apache/pig/classification/InterfaceStability.java
Modified:
hadoop/pig/trunk/CHANGES.txt
hadoop/pig/trunk/src/org/apache/pig/Accumulator.java
hadoop/pig/trunk/src/org/apache/pig/Algebraic.java
hadoop/pig/trunk/src/org/apache/pig/CollectableLoadFunc.java
hadoop/pig/trunk/src/org/apache/pig/ComparisonFunc.java
hadoop/pig/trunk/src/org/apache/pig/EvalFunc.java
hadoop/pig/trunk/src/org/apache/pig/Expression.java
hadoop/pig/trunk/src/org/apache/pig/FileInputLoadFunc.java
hadoop/pig/trunk/src/org/apache/pig/FileSplitComparable.java
hadoop/pig/trunk/src/org/apache/pig/FilterFunc.java
hadoop/pig/trunk/src/org/apache/pig/FuncSpec.java
hadoop/pig/trunk/src/org/apache/pig/IndexableLoadFunc.java
hadoop/pig/trunk/src/org/apache/pig/LoadCaster.java
hadoop/pig/trunk/src/org/apache/pig/LoadFunc.java
hadoop/pig/trunk/src/org/apache/pig/LoadMetadata.java
hadoop/pig/trunk/src/org/apache/pig/LoadPushDown.java
hadoop/pig/trunk/src/org/apache/pig/Main.java
hadoop/pig/trunk/src/org/apache/pig/OrderedLoadFunc.java
hadoop/pig/trunk/src/org/apache/pig/PigServer.java
hadoop/pig/trunk/src/org/apache/pig/PigToStream.java
hadoop/pig/trunk/src/org/apache/pig/ResourceSchema.java
hadoop/pig/trunk/src/org/apache/pig/ResourceStatistics.java
hadoop/pig/trunk/src/org/apache/pig/SortColInfo.java
hadoop/pig/trunk/src/org/apache/pig/SortInfo.java
hadoop/pig/trunk/src/org/apache/pig/StandAloneParser.java
hadoop/pig/trunk/src/org/apache/pig/StoreFunc.java
hadoop/pig/trunk/src/org/apache/pig/StoreFuncInterface.java
hadoop/pig/trunk/src/org/apache/pig/StoreMetadata.java
hadoop/pig/trunk/src/org/apache/pig/StreamToPig.java
hadoop/pig/trunk/src/overview.html
Modified: hadoop/pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/CHANGES.txt?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/CHANGES.txt (original)
+++ hadoop/pig/trunk/CHANGES.txt Wed Apr 14 23:44:16 2010
@@ -24,6 +24,8 @@ INCOMPATIBLE CHANGES
IMPROVEMENTS
+PIG-1370: Marking Pig interface for org.apache.pig package (gates)
+
PIG-1354: UDFs for dynamic invocation of simple Java methods (dvryaboy)
PIG-1316: TextLoader should use Bzip2TextInputFormat for bzip files so that
Modified: hadoop/pig/trunk/src/org/apache/pig/Accumulator.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/Accumulator.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/Accumulator.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/Accumulator.java Wed Apr 14 23:44:16 2010
@@ -19,12 +19,27 @@ package org.apache.pig;
import java.io.IOException;
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.data.Tuple;
+/**
+ * An interface that allows UDFs that take a bag to accumulate tuples in chunks rather than take
+ * the whole set at once. This is intended for UDFs that do not need to see all of the tuples
+ * together but cannot be used with the combiner. This lowers the memory needs, avoiding the need
+ * to spill large bags, and thus speeds up the query. An example is something like session analysis.
+ * It cannot be used with the combiner because all its inputs must first be ordered. But it does
+ * not need to see all the tuples at once. UDF implementors might also choose to implement this
+ * interface so that if other UDFs in the FOREACH implement it, it can be used.
+ * @since Pig 0.6
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Stable
public interface Accumulator <T> {
/**
- * Pass tuples to the UDF. You can retrive DataBag by calling b.get(index).
- * Each DataBag may contain 0 to many tuples for current key
+ * Pass tuples to the UDF.
+ * @param b A tuple containing a single field, which is a bag. The bag will contain the set
+ * of tuples being passed to the UDF in this iteration.
*/
public void accumulate(Tuple b) throws IOException;
Modified: hadoop/pig/trunk/src/org/apache/pig/Algebraic.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/Algebraic.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/Algebraic.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/Algebraic.java Wed Apr 14 23:44:16 2010
@@ -15,42 +15,51 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.pig;
-
-/**
+package org.apache.pig;
+
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
+
+/**
* An interface to declare that an EvalFunc's
* calculation can be decomposed into intitial, intermediate, and final steps.
- * More formally, suppose we have to compute an function f over a bag X. In general, we need to know the entire X
- * before we can make any progress on f. However, some functions are <i>algebraic</i> e.g. SUM. In
- * these cases, you can apply some initital function f_init on subsets of X to get partial results.
- * You can then combine partial results from different subsets of X using an intermediate function
- * f_intermed. To get the final answers, several partial results can be combined by invoking a final
- * f_final function. For the function SUM, f_init, f_intermed, and f_final are all SUM.
- *
- * See the code for builtin AVG to get a better idea of how algebraic works.
- *
- * When eval functions implement this interface, it is a hint to the system to try and compute
- * partial results early which causes queries to run faster.
- *
- */
-public interface Algebraic{
-
- /**
- *
- * @return A string to instatiate f_init. f_init should be an eval func
- */
- public String getInitial();
-
- /**
- *
- * @return A string to instantiate f_intermed. f_intermed should be an eval func
- */
- public String getIntermed();
-
- /**
- *
- * @return A string to instantiate f_final. f_final should be an eval func parametrized by
- * the same datum as the eval func implementing this interface
- */
- public String getFinal();
-}
+ * More formally, suppose we have to compute a function f over a bag X. In general, we need to know the entire X
+ * before we can make any progress on f. However, some functions are <i>algebraic</i> e.g. SUM. In
+ * these cases, you can apply some initial function f_init on subsets of X to get partial results.
+ * You can then combine partial results from different subsets of X using an intermediate function
+ * f_intermed. To get the final answers, several partial results can be combined by invoking a final
+ * f_final function. For the function SUM, f_init, f_intermed, and f_final are all SUM.
+ *
+ * See the code for builtin AVG to get a better idea of how algebraic works.
+ *
+ * When eval functions implement this interface, Pig will attempt to use MapReduce's combiner.
+ * The initial function will be called in the map phase and be passed a single tuple. The
+ * intermediate function will be called 0 or more times in the combiner phase. And the final
+ * function will be called once in the reduce phase. It is important that the results be the same
+ * whether the intermediate function is called 0, 1, or more times. Hadoop makes no guarantees
+ * about how many times the combiner will be called in a job.
+ *
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Stable
+public interface Algebraic{
+
+ /**
+ * Get the initial function.
+ * @return A function name of f_init. f_init should be an eval func.
+ */
+ public String getInitial();
+
+ /**
+ * Get the intermediate function.
+ * @return A function name of f_intermed. f_intermed should be an eval func.
+ */
+ public String getIntermed();
+
+ /**
+ * Get the final function.
+ * @return A function name of f_final. f_final should be an eval func parametrized by
+ * the same datum as the eval func implementing this interface.
+ */
+ public String getFinal();
+}
Modified: hadoop/pig/trunk/src/org/apache/pig/CollectableLoadFunc.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/CollectableLoadFunc.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/CollectableLoadFunc.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/CollectableLoadFunc.java Wed Apr 14 23:44:16 2010
@@ -19,18 +19,23 @@ package org.apache.pig;
import java.io.IOException;
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
+
/**
- * This interface implemented by {@link LoadFunc} implementations indicates to
+ * This interface, implemented by a {@link LoadFunc} implementation, indicates to
* Pig that it has the capability to load data such that all instances of a key
* will occur in same split.
* @since Pig 0.7
*/
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
public interface CollectableLoadFunc {
/**
- * When this method is called, Pig is communicating to Loader that it must
+ * When this method is called, Pig is communicating to the Loader that it must
* load data such that all instances of a key are in same split. Pig will
- * make no further checks at runtime to ensure whether contract is honored
+ * make no further checks at runtime to ensure whether the contract is honored
* or not.
* @throws IOException
*/
Modified: hadoop/pig/trunk/src/org/apache/pig/ComparisonFunc.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/ComparisonFunc.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/ComparisonFunc.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/ComparisonFunc.java Wed Apr 14 23:44:16 2010
@@ -21,12 +21,21 @@ import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
+
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PigProgressable;
import org.apache.pig.impl.io.NullableTuple;
+/**
+ * An abstract base class for custom order by comparator functions.
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Stable
+@Deprecated
public abstract class ComparisonFunc extends WritableComparator {
// If the comparison is a time consuming process
// this reporter must be used to report progress
@@ -36,6 +45,14 @@ public abstract class ComparisonFunc ext
super(NullableTuple.class, true);
}
+ /**
+ * Compare two tuples. Note that even though both args are given type of
+ * WritableComparable to match the WritableComparable interface, they
+ * must both be tuples.
+ * @param a first tuple
+ * @param b tuple to compare a to
+ * @return -1 if a < b, 1 if a > b, 0 if a = b
+ */
public int compare(WritableComparable a, WritableComparable b) {
// The incoming key will be in a NullableTuple. But the comparison
// function needs a tuple, so pull the tuple out.
@@ -44,19 +61,25 @@ public abstract class ComparisonFunc ext
/**
* This callback method must be implemented by all subclasses. Compares
- * its two arguments for order. Returns a negative integer, zero, or a
- * positive integer as the first argument is less than, equal to, or
- * greater than the second. The order of elements of the tuples correspond
+ * its two arguments for order. The order of elements of the tuples correspond
* to the fields specified in the order by clause.
* Same semantics as {@link java.util.Comparator}.
*
* @param t1 the first Tuple to be compared.
* @param t2 the second Tuple to be compared.
+ * @return Returns a negative integer, zero, or a positive integer as the first
+ * argument is less than, equal to, or greater than the second.
* @throws IOException
* @see java.util.Comparator
*/
abstract public int compare(Tuple t1, Tuple t2);
+ /**
+ * Set the reporter. If the comparison takes a long time the
+ * reporter should be called occasionally to avoid Hadoop timing out
+ * underneath. The default Hadoop timeout is 600 seconds.
+ * @param reporter Progress reporter
+ */
public void setReporter(PigProgressable reporter) {
this.reporter = reporter;
}
Modified: hadoop/pig/trunk/src/org/apache/pig/EvalFunc.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/EvalFunc.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/EvalFunc.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/EvalFunc.java Wed Apr 14 23:44:16 2010
@@ -26,6 +26,8 @@ import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.logicalLayer.FrontendException;
@@ -36,19 +38,35 @@ import org.apache.pig.backend.hadoop.exe
/**
* The class is used to implement functions to be applied to
- * a dataset. The function is applied to each Tuple in the set.
+ * fields in a dataset. The function is applied to each Tuple in the set.
* The programmer should not make assumptions about state maintained
- * between invocations of the invoke() method since the Pig runtime
+ * between invocations of the exec() method since the Pig runtime
* will schedule and localize invocations based on information provided
* at runtime. The programmer also should not make assumptions about when or
* how many times the class will be instantiated, since it may be instantiated
* multiple times in both the front and back end.
*/
+@InterfaceAudience.Public
+@InterfaceStability.Stable
public abstract class EvalFunc<T> {
- // UDFs must use this to report progress
- // if the exec is taking more that 300 ms
+ /**
+ * Reporter to send heartbeats to Hadoop. If exec will take more than
+ * a few seconds, {@link PigProgressable#progress} should be called
+ * occasionally to avoid timeouts. Default Hadoop timeout is 600 seconds.
+ */
protected PigProgressable reporter;
+
+ /**
+ * Logging object. Log calls made on the front end will be sent to
+ * pig's log on the client. Log calls made on the backend will be
+ * sent to stdout and can be seen in the Hadoop logs.
+ */
protected Log log = LogFactory.getLog(getClass());
+
+ /**
+ * Logger for aggregating warnings. Any warnings to be sent to the user
+ * should be logged to this via {@link PigLogger#warn}.
+ */
protected PigLogger pigLogger;
private static int nextSchemaId; // for assigning unique ids to UDF columns
@@ -62,6 +80,9 @@ public abstract class EvalFunc<T> {
return alias;
}
+ /**
+ * Return type of this instance of EvalFunc.
+ */
protected Type returnType;
public EvalFunc(){
@@ -113,16 +134,31 @@ public abstract class EvalFunc<T> {
}
}
+ /**
+ * Get the Type that this EvalFunc returns.
+ * @return Type
+ */
public Type getReturnType(){
return returnType;
}
// report that progress is being made (otherwise hadoop times out after 600 seconds working on one outer tuple)
+ /**
+ * Utility method to allow UDF to report progress. If exec will take more than
+ * a few seconds, {@link PigProgressable#progress} should be called
+ * occasionally to avoid timeouts. Default Hadoop timeout is 600 seconds.
+ */
public final void progress() {
if (reporter != null) reporter.progress();
else warn("No reporter object provided to UDF.", PigWarning.PROGRESS_REPORTER_NOT_PROVIDED);
}
+ /**
+ * Issue a warning. Warning messages are aggregated and reported to
+ * the user.
+ * @param msg String message of the warning
+ * @param warningEnum type of warning
+ */
public final void warn(String msg, Enum warningEnum) {
if(pigLogger != null) pigLogger.warn(this, msg, warningEnum);
else log.warn("No logger object provided to UDF: " + this.getClass().getName() + ". " + msg);
@@ -130,7 +166,7 @@ public abstract class EvalFunc<T> {
/**
* Placeholder for cleanup to be performed at the end. User defined functions can override.
- *
+ * Default implementation is a no-op.
*/
public void finish(){}
@@ -150,6 +186,9 @@ public abstract class EvalFunc<T> {
abstract public T exec(Tuple input) throws IOException;
/**
+ * Report the schema of the output of this UDF. Pig will make use of
+ * this in error checking, optimization, and planning. The schema
+ * of input data to this UDF is provided.
* @param input Schema of the input
* @return Schema of the output
*/
@@ -163,6 +202,7 @@ public abstract class EvalFunc<T> {
* asynchronously.
* @return true if the function can be executed asynchronously.
*/
+ @Deprecated
public boolean isAsynchronous(){
return false;
}
@@ -173,13 +213,32 @@ public abstract class EvalFunc<T> {
}
+ /**
+ * Set the reporter. Called by Pig to provide a reference of
+ * the reporter to the UDF.
+ * @param reporter Hadoop reporter
+ */
public final void setReporter(PigProgressable reporter) {
this.reporter = reporter;
}
/**
- * @return A List containing FuncSpec objects representing the Function class
- * which can handle the inputs corresponding to the schema in the objects
+ * Allow a UDF to specify type specific implementations of itself. For example,
+ * an implementation of arithmetic sum might have int and float implementations,
+ * since integer arithmetic performs much better than floating point arithmetic. Pig's
+ * typechecker will call this method and, using the returned list plus the schema
+ * of the function's input data, decide which implementation of the UDF to use.
+ * @return A List containing FuncSpec objects representing the EvalFunc class
+ * which can handle the inputs corresponding to the schema in the objects. Each
+ * FuncSpec should be constructed with a schema that describes the input for that
+ * implementation. For example, the sum function above would return two elements in its
+ * list:
+ * <ol>
+ * <li>FuncSpec(this.getClass().getName(), new Schema(new Schema.FieldSchema(null, DataType.DOUBLE)))
+ * <li>FuncSpec(IntSum.class.getName(), new Schema(new Schema.FieldSchema(null, DataType.INTEGER)))
+ * </ol>
+ * This would indicate that the main implementation is used for doubles, and the special
+ * implementation IntSum is used for ints.
*/
public List<FuncSpec> getArgToFuncMapping() throws FrontendException{
return null;
@@ -189,6 +248,11 @@ public abstract class EvalFunc<T> {
return pigLogger;
}
+ /**
+ * Set the PigLogger object. Called by Pig to provide a reference
+ * to the UDF.
+ * @param pigLogger PigLogger object.
+ */
public final void setPigLogger(PigLogger pigLogger) {
this.pigLogger = pigLogger;
}
Modified: hadoop/pig/trunk/src/org/apache/pig/Expression.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/Expression.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/Expression.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/Expression.java Wed Apr 14 23:44:16 2010
@@ -17,10 +17,15 @@
*/
package org.apache.pig;
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
/**
- * A class to communicate Filter expressions to LoadFuncs
+ * A class to communicate Filter expressions to LoadFuncs.
+ * @since Pig 0.7
*/
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
public abstract class Expression {
// Operator type
Modified: hadoop/pig/trunk/src/org/apache/pig/FileInputLoadFunc.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/FileInputLoadFunc.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/FileInputLoadFunc.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/FileInputLoadFunc.java Wed Apr 14 23:44:16 2010
@@ -22,11 +22,17 @@ import org.apache.hadoop.io.WritableComp
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
+
/**
* This class provides an implementation of OrderedLoadFunc interface
* which can be optionally re-used by LoadFuncs that use FileInputFormat, by
* having this as a super class
+ * @since Pig 0.7
*/
+@InterfaceAudience.Public
+@InterfaceStability.Evolving // Since we haven't done outer join for merge join yet
public abstract class FileInputLoadFunc extends LoadFunc implements OrderedLoadFunc {
@Override
Modified: hadoop/pig/trunk/src/org/apache/pig/FileSplitComparable.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/FileSplitComparable.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/FileSplitComparable.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/FileSplitComparable.java Wed Apr 14 23:44:16 2010
@@ -23,14 +23,18 @@ import java.io.IOException;
import java.io.Serializable;
import org.apache.hadoop.io.WritableComparable;
-import org.apache.pig.data.DataReaderWriter;
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
+import org.apache.pig.data.DataReaderWriter;
/**
- * This class can be used to represent a relative position in a file.
- * compareTo(other) function of WritaleComparable interface is used to compare
- * position of different objects of this class.
+ * This class represents a relative position in a file. It records a filename
+ * and an offset. This allows Pig to order FileSplits.
+ * @since Pig 0.7
*/
+@InterfaceAudience.Public
+@InterfaceStability.Evolving // Since we haven't done outer join for merge join yet
public class FileSplitComparable implements WritableComparable<FileSplitComparable>, Serializable{
private static final long serialVersionUID = 1L;
@@ -111,4 +115,4 @@ public class FileSplitComparable impleme
return false;
return true;
}
-}
\ No newline at end of file
+}
Modified: hadoop/pig/trunk/src/org/apache/pig/FilterFunc.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/FilterFunc.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/FilterFunc.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/FilterFunc.java Wed Apr 14 23:44:16 2010
@@ -19,14 +19,18 @@ package org.apache.pig;
import java.io.IOException;
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.data.Tuple;
+@InterfaceAudience.Public
+@InterfaceStability.Stable
public abstract class FilterFunc extends EvalFunc<Boolean> {
/**
* Placeholder for cleanup to be performed at the end. User defined functions can override.
- *
+ * Default implementation is a no-op.
*/
public void finish(){}
}
Modified: hadoop/pig/trunk/src/org/apache/pig/FuncSpec.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/FuncSpec.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/FuncSpec.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/FuncSpec.java Wed Apr 14 23:44:16 2010
@@ -23,12 +23,16 @@ import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.impl.logicalLayer.schema.Schema;
/**
- * Class to represent a UDF specification - essentially
- * encapsulates the class name and the arguments to the constructor
+ * Class to represent a UDF specification.
+ * Encapsulates the class name and the arguments to the constructor.
*/
+@InterfaceAudience.Public
+@InterfaceStability.Stable
public class FuncSpec implements Serializable, Cloneable {
private static final long serialVersionUID = 2L;
@@ -38,7 +42,8 @@ public class FuncSpec implements Seriali
/**
* @param className the name of the class for the udf
- * @param ctorArg the argument for the constructor for the above class
+ * @param ctorArg the argument to pass to the constructor for the above class.
+ * Constructors can only take strings.
*/
public FuncSpec(String className, String ctorArg) {
this.className = className;
@@ -48,7 +53,8 @@ public class FuncSpec implements Seriali
/**
* @param className the name of the class for the udf
- * @param ctorArgs the arguments for the constructor for the above class
+ * @param ctorArgs the arguments to pass to the constructor for the above class.
+ * Constructors can only take strings.
*/
public FuncSpec(String className, String[] ctorArgs) {
this.className = className;
@@ -57,7 +63,8 @@ public class FuncSpec implements Seriali
/**
* @param className the name of the class for the udf
- * @param ctorArgs the arguments for the constructor for the above class
+ * @param ctorArgs the arguments to pass to the constructor for the above class.
+ * Constructors can only take strings.
* @param inputArgsSchema schema for input args taken by this Function
*/
public FuncSpec(String className, String[] ctorArgs, Schema inputArgsSchema) {
@@ -94,6 +101,10 @@ public class FuncSpec implements Seriali
this.inputArgsSchema = inputArgsSchema;
}
+ /**
+ * Parse the class name out of a function specification string.
+ * @return name of the class.
+ */
public static String getClassNameFromSpec(String funcSpec){
int paren = funcSpec.indexOf('(');
if (paren!=-1)
@@ -102,6 +113,10 @@ public class FuncSpec implements Seriali
return funcSpec;
}
+ /**
+ * Get the argument values passed to the func spec.
+ * @return argument values. Format will be arg1, arg2, ... )
+ */
public static String getArgStringFromSpec(String funcSpec){
int paren = funcSpec.indexOf('(');
if (paren!=-1)
@@ -111,7 +126,7 @@ public class FuncSpec implements Seriali
}
/**
- * Function to parse the arguments from a function specification argument list
+ * Parse the argument values out of a function specification string.
* @param argString should be of the form "'arg1', 'arg2', ..."
* @return List of the different argument strings
*/
Modified: hadoop/pig/trunk/src/org/apache/pig/IndexableLoadFunc.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/IndexableLoadFunc.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/IndexableLoadFunc.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/IndexableLoadFunc.java Wed Apr 14 23:44:16 2010
@@ -20,36 +20,46 @@ package org.apache.pig;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
+
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.data.Tuple;
/**
* This class is intended for use by LoadFunc implementations
* which have an internal index for sorted data and can use the index
- * to support merge join in pig. Interaction with the index
- * is abstracted away by the methods in this interface which the pig
+ * to support merge join in Pig. Interaction with the index
+ * is abstracted away by the methods in this interface which the Pig
* runtime will call in a particular sequence to get the records it
* needs to perform the merge based join.
*
- * The sequence of calls made from the pig runtime are:
- * {@link LoadFunc#setUDFContextSignature(String)}
- * {@link IndexableLoadFunc#initialize(Configuration)}
- * {@link LoadFunc#setLocation(String, org.apache.hadoop.mapreduce.Job)}
- * {@link IndexableLoadFunc#seekNear(Tuple)}
- * A series of IndexableLoadFunc.getNext(); calls to perform the join
- * IndexableLoadFunc.close();
+ * The sequence of calls made from the Pig runtime are:
+ * <ol>
+ * <li>{@link LoadFunc#setUDFContextSignature(String)}
+ * <li>{@link IndexableLoadFunc#initialize(Configuration)}
+ * <li>{@link LoadFunc#setLocation(String, org.apache.hadoop.mapreduce.Job)}
+ * <li>{@link IndexableLoadFunc#seekNear(Tuple)}
+ * <li>{@link LoadFunc#getNext} called multiple times to retrieve data and perform the join
+ * <li>{@link IndexableLoadFunc#close}
+ * </ol>
+ * @since Pig 0.6
*
*/
+@InterfaceAudience.Public
+@InterfaceStability.Evolving // Set to evolving because we don't have this working with outer join
+ // yet, and we may need to change it some for that.
public interface IndexableLoadFunc {
/**
- * This method is called by pig run time to allow the
+ * This method is called by Pig run time to allow the
* IndexableLoadFunc to perform any initialization actions
* @param conf The job configuration object
+ * @throws IOException
*/
public abstract void initialize(Configuration conf) throws IOException;
/**
- * This method is called by the pig runtime to indicate
+ * This method is called by the Pig runtime to indicate
* to the LoadFunc to position its underlying input stream
* near the keys supplied as the argument. Specifically:
* 1) if the keys are present in the input stream, the loadfunc
@@ -86,10 +96,10 @@ public interface IndexableLoadFunc {
/**
- * A method called by the pig runtime to give an opportunity
+ * A method called by the Pig runtime to give an opportunity
* for implementations to perform cleanup actions like closing
* the underlying input stream. This is necessary since while
- * performing a join the pig run time may determine than no further
+ * performing a join the Pig run time may determine that no further
* join is possible with remaining records and may indicate to the
* IndexableLoader to cleanup by calling this method.
*
Modified: hadoop/pig/trunk/src/org/apache/pig/LoadCaster.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/LoadCaster.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/LoadCaster.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/LoadCaster.java Wed Apr 14 23:44:16 2010
@@ -1,5 +1,4 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
+/* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
@@ -21,65 +20,74 @@ import java.io.IOException;
import java.util.Map;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
/**
- *
+ * An interface that provides cast implementations for load functions. For casts between
+ * bytearray objects and internal types, Pig relies on the load function that loaded the data to
+ * provide the cast. This is because Pig does not understand the binary representation of the
+ * data and thus cannot cast it. This interface provides functions to cast from bytearray to each
+ * of Pig's internal types.
+ * @since Pig 0.7
*/
+@InterfaceAudience.Public
+@InterfaceStability.Evolving // Because we still don't have the map casts quite right
public interface LoadCaster {
/**
- * Cast data from bytes to long value.
- * @param b byte array to be cast.
+ * Cast data from bytearray to long value.
+ * @param b bytearray to be cast.
* @return Long value.
* @throws IOException if the value cannot be cast.
*/
public Long bytesToLong(byte[] b) throws IOException;
/**
- * Cast data from bytes to float value.
- * @param b byte array to be cast.
+ * Cast data from bytearray to float value.
+ * @param b bytearray to be cast.
* @return Float value.
* @throws IOException if the value cannot be cast.
*/
public Float bytesToFloat(byte[] b) throws IOException;
/**
- * Cast data from bytes to double value.
- * @param b byte array to be cast.
+ * Cast data from bytearray to double value.
+ * @param b bytearray to be cast.
* @return Double value.
* @throws IOException if the value cannot be cast.
*/
public Double bytesToDouble(byte[] b) throws IOException;
/**
- * Cast data from bytes to integer value.
- * @param b byte array to be cast.
+ * Cast data from bytearray to integer value.
+ * @param b bytearray to be cast.
* @return Double value.
* @throws IOException if the value cannot be cast.
*/
public Integer bytesToInteger(byte[] b) throws IOException;
/**
- * Cast data from bytes to chararray value.
- * @param b byte array to be cast.
+ * Cast data from bytearray to chararray value.
+ * @param b bytearray to be cast.
* @return String value.
* @throws IOException if the value cannot be cast.
*/
public String bytesToCharArray(byte[] b) throws IOException;
/**
- * Cast data from bytes to map value.
- * @param b byte array to be cast.
+ * Cast data from bytearray to map value.
+ * @param b bytearray to be cast.
* @return Map value.
* @throws IOException if the value cannot be cast.
*/
public Map<String, Object> bytesToMap(byte[] b) throws IOException;
/**
- * Cast data from bytes to tuple value.
- * @param b byte array to be cast.
+ * Cast data from bytearray to tuple value.
+ * @param b bytearray to be cast.
* @param fieldSchema field schema for the output tuple
* @return Tuple value.
* @throws IOException if the value cannot be cast.
@@ -87,8 +95,8 @@ public interface LoadCaster {
public Tuple bytesToTuple(byte[] b, ResourceFieldSchema fieldSchema) throws IOException;
/**
- * Cast data from bytes to bag value.
- * @param b byte array to be cast.
+ * Cast data from bytearray to bag value.
+ * @param b bytearray to be cast.
* @param fieldSchema field schema for the output bag
* @return Bag value.
* @throws IOException if the value cannot be cast.
Modified: hadoop/pig/trunk/src/org/apache/pig/LoadFunc.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/LoadFunc.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/LoadFunc.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/LoadFunc.java Wed Apr 14 23:44:16 2010
@@ -28,6 +28,9 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
+
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.LoadPushDown.RequiredFieldList;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.builtin.Utf8StorageConverter;
@@ -37,9 +40,18 @@ import org.apache.pig.impl.util.UDFConte
/**
- * <code>LoadFunc</code> provides functions directly associated with reading
- * records from data set.
+ * A LoadFunc loads data into Pig. It can read from an HDFS file or other source.
+ * LoadFunc is tightly coupled to Hadoop's {@link org.apache.hadoop.mapreduce.InputFormat}.
+ * LoadFuncs sit atop an InputFormat and translate from the keys and values of Hadoop
+ * to Pig's tuples.
+ * <p>
+ * LoadFunc contains the basic features needed by the majority of load functions. For
+ * more advanced functionality there are separate interfaces that a load function
+ * can implement. See {@link LoadCaster}, {@link LoadMetadata}, {@link LoadPushDown},
+ * {@link OrderedLoadFunc}, {@link CollectableLoadFunc}, and {@link IndexableLoadFunc}.
*/
+@InterfaceAudience.Public
+@InterfaceStability.Stable
public abstract class LoadFunc {
/**
Modified: hadoop/pig/trunk/src/org/apache/pig/LoadMetadata.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/LoadMetadata.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/LoadMetadata.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/LoadMetadata.java Wed Apr 14 23:44:16 2010
@@ -22,11 +22,17 @@ import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
+
/**
* This interface defines how to retrieve metadata related to data to be loaded.
* If a given loader does not implement this interface, it will be assumed that it
* is unable to provide metadata about the associated data.
+ * @since Pig 0.7
*/
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
public interface LoadMetadata {
/**
@@ -80,7 +86,7 @@ public interface LoadMetadata {
* will only contain references to fields given as partition keys in
* getPartitionKeys. So if the implementation returns null in
* {@link #getPartitionKeys(String, Job)}, then this method is not
- * called by pig runtime. This method is also not called by the pig runtime
+ * called by Pig runtime. This method is also not called by the Pig runtime
* if there are no partition filter conditions.
* @param partitionFilter that describes filter for partitioning
* @throws IOException if the filter is not compatible with the storage
Modified: hadoop/pig/trunk/src/org/apache/pig/LoadPushDown.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/LoadPushDown.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/LoadPushDown.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/LoadPushDown.java Wed Apr 14 23:44:16 2010
@@ -21,13 +21,18 @@ import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.impl.logicalLayer.FrontendException;
/**
* This interface defines how to communicate to Pig what functionality can
* be pushed into the loader. If a given loader does not implement this interface
* it will be assumed that it is unable to accept any functionality for push down.
+ * @since Pig 0.7
*/
+@InterfaceAudience.Public
+@InterfaceStability.Evolving
public interface LoadPushDown {
/**
@@ -54,10 +59,17 @@ public interface LoadPushDown {
* the input are required. If the loader function cannot make use of this
* information, it is free to ignore it by returning an appropriate Response
* @param requiredFieldList RequiredFieldList indicating which columns will be needed.
+ * @return Indicates which fields will be returned
+ * @throws FrontendException
*/
public RequiredFieldResponse pushProjection(RequiredFieldList
requiredFieldList) throws FrontendException;
+ /**
+ * Describes a field that is required to execute a script.
+ */
+ @InterfaceAudience.Public
+ @InterfaceStability.Evolving
public static class RequiredField implements Serializable {
private static final long serialVersionUID = 1L;
@@ -152,6 +164,11 @@ public interface LoadPushDown {
}
}
+ /**
+ * List of fields that Pig knows to be required to execute a script.
+ */
+ @InterfaceAudience.Public
+ @InterfaceStability.Evolving
public static class RequiredFieldList implements Serializable {
private static final long serialVersionUID = 1L;
@@ -160,6 +177,7 @@ public interface LoadPushDown {
private List<RequiredField> fields = new ArrayList<RequiredField>();
/**
+ * Set the list of required fields.
* @param fields
*/
public RequiredFieldList(List<RequiredField> fields) {
@@ -167,6 +185,7 @@ public interface LoadPushDown {
}
/**
+ * Get all required fields as a list.
* @return the required fields - this will be null if all fields are
* required
*/
@@ -194,12 +213,21 @@ public interface LoadPushDown {
return result.toString();
}
+ /**
+ * Add a field to the list of required fields.
+ * @param rf required field to add to the list.
+ */
public void add(RequiredField rf)
{
fields.add(rf);
}
}
+ /**
+ * Indicates whether the loader will return the requested fields or all fields.
+ */
+ @InterfaceAudience.Public
+ @InterfaceStability.Evolving
public static class RequiredFieldResponse {
// the loader should pass true if it will return data containing
// only the List of RequiredFields in that order. false if it
@@ -210,20 +238,24 @@ public interface LoadPushDown {
this.requiredFieldRequestHonored = requiredFieldRequestHonored;
}
- // true if the loader will return data containing only the List of
- // RequiredFields in that order. false if the loader will return all
- // fields in the data
+ /**
+ * Indicates whether the loader will return only the requested fields or all fields.
+ * @return true if only requested fields will be returned, false if all fields will be
+ * returned.
+ */
public boolean getRequiredFieldResponse() {
return requiredFieldRequestHonored;
}
- // the loader should pass true if the it will return data containing
- // only the List of RequiredFields in that order. false if the it
- // will return all fields in the data
+ /**
+ * Set whether the loader will return only the requested fields or all fields.
+ * @param honored if true only requested fields will be returned, else all fields will be
+ * returned.
+ */
public void setRequiredFieldResponse(boolean honored) {
requiredFieldRequestHonored = honored;
}
}
-}
\ No newline at end of file
+}
Modified: hadoop/pig/trunk/src/org/apache/pig/Main.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/Main.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/Main.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/Main.java Wed Apr 14 23:44:16 2010
@@ -17,9 +17,24 @@
*/
package org.apache.pig;
-import java.io.*;
-import java.util.*;
-import java.util.jar.*;
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.InputStreamReader;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.StringReader;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Properties;
+import java.util.jar.Attributes;
+import java.util.jar.JarFile;
+import java.util.jar.Manifest;
import java.text.ParseException;
import jline.ConsoleReader;
@@ -32,6 +47,9 @@ import org.apache.commons.logging.LogFac
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
+
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.ExecType;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.io.FileLocalizer;
@@ -46,6 +64,11 @@ import org.apache.pig.impl.util.LogUtils
import org.apache.pig.tools.timer.PerformanceTimerFactory;
import org.apache.pig.tools.parameters.ParameterSubstitutionPreprocessor;
+/**
+ * Main class for Pig engine.
+ */
+@InterfaceAudience.LimitedPrivate({"Oozie"})
+@InterfaceStability.Stable
public class Main
{
@@ -61,7 +84,7 @@ public class Main
/**
* The Main-Class for the Pig Jar that will provide a shell and setup a classpath appropriate
- * for executing Jar files.
+ * for executing Jar files. Warning, this method calls System.exit().
*
* @param args
* -jar can be used to add additional jar files (colon separated). - will start a
@@ -542,6 +565,9 @@ private static String getVersionString()
}
}
+/**
+ * Print usage string.
+ */
public static void usage()
{
System.out.println("\n"+getVersionString()+"\n");
Modified: hadoop/pig/trunk/src/org/apache/pig/OrderedLoadFunc.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/OrderedLoadFunc.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/OrderedLoadFunc.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/OrderedLoadFunc.java Wed Apr 14 23:44:16 2010
@@ -23,21 +23,31 @@ import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
/**
* Implementing this interface indicates to Pig that a given loader
- * can be used for MergeJoin. The position as represented by the
+ * can be used for MergeJoin. It does not mean the data itself is ordered,
+ * but rather that the splits returned by the underlying InputFormat
+ * can be ordered to match the order of the data they are loading. For
+ * example, file splits have a natural order (that of the file they are
+ * from) while splits of an RDBMS table do not (since tables have no inherent order).
+ * The position as represented by the
* WritableComparable object is stored in the index created by
- * MergeJoin sampling MR job to get an ordered sequence of splits.
- * This is necessary when the sort key spans multiple splits.
+ * a MergeJoin sampling MapReduce job to get an ordered sequence of splits.
+ * It is necessary to read splits in order during a merge join to ensure
+ * data is being read according to the sort order.
* @since Pig 0.7
*/
+@InterfaceAudience.Public
+@InterfaceStability.Evolving // Since we haven't done outer join for merge join yet
public interface OrderedLoadFunc {
/**
* The WritableComparable object returned will be used to compare
* the position of different splits in an ordered stream
- * @param split
+ * @param split An InputSplit from the InputFormat underlying this loader.
* @return WritableComparable representing the position of the split in input
* @throws IOException
*/
Modified: hadoop/pig/trunk/src/org/apache/pig/PigServer.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/PigServer.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/PigServer.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/PigServer.java Wed Apr 14 23:44:16 2010
@@ -37,19 +37,26 @@ import java.util.Set;
import java.util.Stack;
import org.apache.hadoop.mapreduce.Job;
+
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
-import org.apache.pig.impl.plan.PlanException;
+
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+
import org.apache.pig.backend.datastorage.ContainerDescriptor;
import org.apache.pig.backend.datastorage.DataStorage;
import org.apache.pig.backend.datastorage.ElementDescriptor;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.executionengine.ExecJob;
import org.apache.pig.backend.executionengine.ExecJob.JOB_STATUS;
+import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
+import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
+import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStore;
import org.apache.pig.builtin.BinStorage;
import org.apache.pig.builtin.PigStorage;
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.experimental.logical.LogicalPlanMigrationVistor;
@@ -75,9 +82,7 @@ import org.apache.pig.impl.logicalLayer.
import org.apache.pig.impl.logicalLayer.parser.QueryParser;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.validators.LogicalPlanValidationExecutor;
-import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil;
-import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
-import org.apache.pig.backend.hadoop.executionengine.physicalLayer.relationalOperators.POStore;
+import org.apache.pig.impl.plan.PlanException;
import org.apache.pig.impl.plan.CompilationMessageCollector;
import org.apache.pig.impl.plan.DependencyOrderWalker;
import org.apache.pig.impl.plan.DepthFirstWalker;
@@ -97,17 +102,24 @@ import org.apache.pig.tools.grunt.GruntP
/**
*
- * This class is the program's connection to Pig. Typically a program will create a PigServer
+ * A class for Java programs to connect to Pig. Typically a program will create a PigServer
* instance. The programmer then registers queries using registerQuery() and
* retrieves results using openIterator() or store(). After doing so, the
* shutdown() method should be called to free any resources used by the current
* PigServer instance. Not doing so could result in a memory leak.
*
*/
+@InterfaceAudience.Public
+@InterfaceStability.Stable
public class PigServer {
private final Log log = LogFactory.getLog(getClass());
+ /**
+ * Given a string, determine the exec type.
+ * @param str accepted values are 'local', 'mapreduce', and 'mapred'
+ * @return exectype as ExecType
+ */
public static ExecType parseExecType(String str) throws IOException {
String normStr = str.toLowerCase();
@@ -159,10 +171,23 @@ public class PigServer {
return ""+(++scopeCounter);
}
+ /**
+ * @param execTypeString can be 'mapreduce' or 'local'. Local mode will
+ * use Hadoop's local job runner to execute the job on the local machine.
+ * Mapreduce mode will connect to a cluster to execute the job.
+ * @throws ExecException
+ * @throws IOException
+ */
public PigServer(String execTypeString) throws ExecException, IOException {
this(parseExecType(execTypeString));
}
+ /**
+ * @param execType execution type to start the engine. Local mode will
+ * use Hadoop's local job runner to execute the job on the local machine.
+ * Mapreduce mode will connect to a cluster to execute the job.
+ * @throws ExecException
+ */
public PigServer(ExecType execType) throws ExecException {
this(execType, PropertiesUtil.loadPropertiesFromFile());
}
@@ -215,16 +240,26 @@ public class PigServer {
return pigContext;
}
+ /**
+ * Set the logging level to DEBUG.
+ */
public void debugOn() {
Logger.getLogger("org.apache.pig").setLevel(Level.DEBUG);
pigContext.getLog4jProperties().setProperty("log4j.logger.org.apache.pig", Level.DEBUG.toString());
}
+ /**
+ * Set the logging level to the default.
+ */
public void debugOff() {
Logger.getLogger("org.apache.pig").setLevel(pigContext.getDefaultLogLevel());
pigContext.getLog4jProperties().setProperty("log4j.logger.org.apache.pig", pigContext.getDefaultLogLevel().toString());
}
+ /**
+ * Set the default parallelism for this job
+ * @param p default number of reducers to use for this job.
+ */
public void setDefaultParallel(int p) {
pigContext.defaultParallel = p;
}
@@ -272,6 +307,7 @@ public class PigServer {
/**
* Submits a batch of Pig commands for execution.
*
+ * @return list of jobs being executed
* @throws FrontendException
* @throws ExecException
*/
@@ -294,7 +330,6 @@ public class PigServer {
* Discards a batch of Pig commands.
*
* @throws FrontendException
- * @throws ExecException
*/
public void discardBatch() throws FrontendException {
if (currDAG == null || !isBatchOn()) {
@@ -324,8 +359,8 @@ public class PigServer {
* @param function - the new function alias to define.
* @param functionSpec - the name of the function and any arguments.
* It should have the form: classname('arg1', 'arg2', ...)
+ * @deprecated Use {@link #registerFunction(String, FuncSpec)}
*/
- @Deprecated
public void registerFunction(String function, String functionSpec) {
registerFunction(function, new FuncSpec(functionSpec));
}
@@ -420,7 +455,7 @@ public class PigServer {
* @param query
* a Pig Latin expression to be evaluated.
* @param startLine
- * line number of the query within the whold script
+ * line number of the query within the whole script
* @throws IOException
*/
public void registerQuery(String query, int startLine) throws IOException {
@@ -439,10 +474,24 @@ public class PigServer {
return graph.getPlan(alias);
}
+ /**
+ * Register a query with the Pig runtime. The query is parsed and registered, but it is not
+ * executed until it is needed. Equivalent to calling {@link #registerQuery(String, int)}
+ * with startLine set to 1.
+ *
+ * @param query
+ * a Pig Latin expression to be evaluated.
+ * @throws IOException
+ */
public void registerQuery(String query) throws IOException {
registerQuery(query, 1);
}
+ /**
+ * Register a query with the Pig runtime. The query will be read from the indicated file.
+ * @param fileName file to read query from.
+ * @throws IOException
+ */
public void registerScript(String fileName) throws IOException {
try {
GruntParser grunt = new GruntParser(new FileReader(new File(fileName)));
@@ -460,10 +509,22 @@ public class PigServer {
}
}
+ /**
+ * Intended to be used by unit tests only.
+ * Print a list of all aliases in the current Pig Latin script. Output is written to
+ * System.out.
+ * @throws FrontendException
+ */
public void printAliases () throws FrontendException {
System.out.println("aliases: " + currDAG.getAliasOp().keySet());
}
+ /**
+ * Write the schema for an alias to System.out.
+ * @param alias Alias whose schema will be written out
+ * @return Schema of alias dumped
+ * @throws IOException
+ */
public Schema dumpSchema(String alias) throws IOException{
try {
LogicalPlan lp = getPlanFromAlias(alias, "describe");
@@ -479,17 +540,42 @@ public class PigServer {
}
}
+ /**
+ * Set the name of the job. This name will get translated to mapred.job.name.
+ * @param name of job
+ */
public void setJobName(String name){
currDAG.setJobName(name);
}
+ /**
+ * Set Hadoop job priority. This value will get translated to mapred.job.priority.
+ * @param priority valid values are found in {@link org.apache.hadoop.mapred.JobPriority}
+ */
public void setJobPriority(String priority){
currDAG.setJobPriority(priority);
}
/**
- * Forces execution of query (and all queries from which it reads), in order to materialize
- * result
+ * Executes a Pig Latin script up to and including indicated alias. That is, if a user does:
+ * <pre>
+ * PigServer server = new PigServer();
+ * server.registerQuery("A = load 'foo';");
+ * server.registerQuery("B = filter A by $0 > 0;");
+ * server.registerQuery("C = order B by $1;");
+ * </pre>
+ * Then
+ * <pre>
+ * server.openIterator("B");
+ * </pre>
+ * filtered but unsorted data will be returned. If instead a user does
+ * <pre>
+ * server.openIterator("C");
+ * </pre>
+ * filtered and sorted data will be returned.
+ * @param id Alias to open iterator for
+ * @return iterator of tuples returned from the script
+ * @throws IOException
*/
public Iterator<Tuple> openIterator(String id) throws IOException {
try {
@@ -525,19 +611,61 @@ public class PigServer {
}
/**
- * Store an alias into a file
+ * Executes a Pig Latin script up to and including indicated alias and stores the resulting
+ * records into a file. That is, if a user does:
+ * <pre>
+ * PigServer server = new PigServer();
+ * server.registerQuery("A = load 'foo';");
+ * server.registerQuery("B = filter A by $0 > 0;");
+ * server.registerQuery("C = order B by $1;");
+ * </pre>
+ * Then
+ * <pre>
+ * server.store("B", "bar");
+ * </pre>
+ * filtered but unsorted data will be stored to the file <tt>bar</tt>. If instead a user does
+ * <pre>
+ * server.store("C", "bar");
+ * </pre>
+ * filtered and sorted data will be stored to the file <tt>bar</tt>.
+ * Equivalent to calling {@link #store(String, String, String)} with
+ * <tt>org.apache.pig.builtin.PigStorage</tt> as the store function.
* @param id The alias to store
* @param filename The file to which to store to
+ * @return {@link ExecJob} containing information about this job
* @throws IOException
*/
-
public ExecJob store(String id, String filename) throws IOException {
return store(id, filename, PigStorage.class.getName() + "()"); // SFPig is the default store function
}
/**
- * forces execution of query (and all queries from which it reads), in order to store result in file
- */
+ * Executes a Pig Latin script up to and including indicated alias and stores the resulting
+ * records into a file. That is, if a user does:
+ * <pre>
+ * PigServer server = new PigServer();
+ * server.registerQuery("A = load 'foo';");
+ * server.registerQuery("B = filter A by $0 > 0;");
+ * server.registerQuery("C = order B by $1;");
+ * </pre>
+ * Then
+ * <pre>
+ * server.store("B", "bar", "mystorefunc");
+ * </pre>
+ * filtered but unsorted data will be stored to the file <tt>bar</tt> using
+ * <tt>mystorefunc</tt>. If instead a user does
+ * <pre>
+ * server.store("C", "bar", "mystorefunc");
+ * </pre>
+ * filtered and sorted data will be stored to the file <tt>bar</tt> using
+ * <tt>mystorefunc</tt>.
+ * <p>
+ * @param id The alias to store
+ * @param filename The file to which to store to
+ * @param func store function to use
+ * @return {@link ExecJob} containing information about this job
+ * @throws IOException
+ */
public ExecJob store(
String id,
String filename,
@@ -597,7 +725,9 @@ public class PigServer {
/**
* Provide information on how a pig query will be executed.
* @param alias Name of alias to explain.
- * @param format Format in which the explain should be printed
+ * @param format Format in which the explain should be printed. If text, then the plan will
+ * be printed in plain text. Otherwise, the execution plan will be printed in
+ * <a href="http://en.wikipedia.org/wiki/DOT_language">DOT</a> format.
* @param verbose Controls the amount of information printed
* @param markAsExecute When set will treat the explain like a
* call to execute in the respoect that all the pending stores are
@@ -702,28 +832,59 @@ public class PigServer {
return length * replication;
}
+ /**
+ * Test whether a file exists.
+ * @param filename to test
+ * @return true if file exists, false otherwise
+ * @throws IOException
+ */
public boolean existsFile(String filename) throws IOException {
ElementDescriptor elem = pigContext.getDfs().asElement(filename);
return elem.exists();
}
+ /**
+ * Delete a file.
+ * @param filename to delete
+ * @return true
+ * @throws IOException
+ */
public boolean deleteFile(String filename) throws IOException {
ElementDescriptor elem = pigContext.getDfs().asElement(filename);
elem.delete();
return true;
}
+ /**
+ * Rename a file.
+ * @param source file to rename
+ * @param target new file name
+ * @return true
+ * @throws IOException
+ */
public boolean renameFile(String source, String target) throws IOException {
pigContext.rename(source, target);
return true;
}
+ /**
+ * Make a directory.
+ * @param dirs directory to make
+ * @return true
+ * @throws IOException
+ */
public boolean mkdirs(String dirs) throws IOException {
ContainerDescriptor container = pigContext.getDfs().asContainer(dirs);
container.create();
return true;
}
+ /**
+ * List the contents of a directory.
+ * @param dir name of directory to list
+ * @return array of strings, one for each file name
+ * @throws IOException
+ */
public String[] listPaths(String dir) throws IOException {
Collection<String> allPaths = new ArrayList<String>();
ContainerDescriptor container = pigContext.getDfs().asContainer(dir);
@@ -738,12 +899,19 @@ public class PigServer {
return allPaths.toArray(type);
}
+ /**
+ * Does not work at the moment.
+ */
public long totalHadoopTimeSpent() {
// TODO FIX Need to uncomment this with the right logic
// return MapReduceLauncher.totalHadoopTimeSpent;
return 0L;
}
+ /**
+ * Return a map containing the logical plan associated with each alias.
+ * @return map
+ */
public Map<String, LogicalPlan> getAliases() {
Map<String, LogicalPlan> aliasPlans = new HashMap<String, LogicalPlan>();
for(LogicalOperator op: currDAG.getAliases().keySet()) {
@@ -771,6 +939,10 @@ public class PigServer {
FileLocalizer.deleteTempFiles();
}
+ /**
+ * Get the set of all current aliases.
+ * @return set
+ */
public Set<String> getAliasKeySet() {
return currDAG.getAliasOp().keySet();
}
Modified: hadoop/pig/trunk/src/org/apache/pig/PigToStream.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/PigToStream.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/PigToStream.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/PigToStream.java Wed Apr 14 23:44:16 2010
@@ -19,6 +19,8 @@ package org.apache.pig;
import java.io.IOException;
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.data.Tuple;
/**
@@ -29,16 +31,21 @@ import org.apache.pig.data.Tuple;
* a common protocol for data exchange between Pig runtime and streaming
* executables.
*
- * Typically, user implements this interface for a particular type of
+ * Typically, a user implements this interface for a particular type of
* stream command and specifies the implementation class in the Pig DEFINE
* statement.
- *
+ * @since Pig 0.7
*/
+@InterfaceAudience.Public
+@InterfaceStability.Stable
public interface PigToStream {
/**
* Given a tuple, produce an array of bytes to be passed to the streaming
* executable.
+ * @param t Tuple to serialize
+ * @return Serialized form of the tuple
+ * @throws IOException
*/
public byte[] serialize(Tuple t) throws IOException;
Modified: hadoop/pig/trunk/src/org/apache/pig/ResourceSchema.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/ResourceSchema.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/ResourceSchema.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/ResourceSchema.java Wed Apr 14 23:44:16 2010
@@ -25,11 +25,21 @@ import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.data.DataType;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
+/**
+ * A representation of a schema used to communicate with load and store functions. This is
+ * separate from {@link Schema}, which is an internal Pig representation of a schema.
+ * @since Pig 0.7
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Stable
public class ResourceSchema implements Serializable {
private static final long serialVersionUID = 1L;
@@ -65,10 +75,17 @@ public class ResourceSchema implements S
// nested tuples and bags will have their own schema
private ResourceSchema schema;
+ /**
+ * Construct an empty field schema.
+ */
public ResourceFieldSchema() {
}
+ /**
+ * Construct using a {@link org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema} as the template.
+ * @param fieldSchema fieldSchema to copy from
+ */
public ResourceFieldSchema(FieldSchema fieldSchema) {
type = fieldSchema.type;
name = fieldSchema.alias;
@@ -90,36 +107,73 @@ public class ResourceSchema implements S
}
}
+ /**
+ * Get the name of this field.
+ * @return name
+ */
public String getName() {
return name;
}
+
+ /**
+ * Set the name of this field.
+ * @param name new name
+ * @return this
+ */
public ResourceFieldSchema setName(String name) {
this.name = name;
return this;
}
+ /**
+ * Get the type of this field.
+ * @return type, as a {@link DataType} static final byte
+ */
public byte getType() {
return type;
}
+ /**
+ * Set the type of this field
+ * @param type new type
+ * @return this
+ */
public ResourceFieldSchema setType(byte type) {
this.type = type;
return this;
}
+ /**
+ * Get a free form text description of this field.
+ * @return description
+ */
public String getDescription() {
return description;
}
+ /**
+ * Set the description
+ * @param description new description
+ * @return this
+ */
public ResourceFieldSchema setDescription(String description) {
this.description = description;
return this;
}
+ /**
+ * Get the schema for this field. Only fields of type tuple should have a schema.
+ * @return schema
+ */
public ResourceSchema getSchema() {
return schema;
}
+ /**
+ * Set the schema for this field. Only fields of type tuple should have a schema.
+ * @param schema new schema
+ * @return this
+ */
public ResourceFieldSchema setSchema(ResourceSchema schema) throws
IOException {
validateSchema(schema);
@@ -127,9 +181,6 @@ public class ResourceSchema implements S
return this;
}
- /**
- * @param schema
- */
private void validateSchema(ResourceSchema schema) throws IOException {
if(type == DataType.BAG && schema != null) {
ResourceFieldSchema[] subFields = schema.getFields();
@@ -173,10 +224,17 @@ public class ResourceSchema implements S
}
+ /**
+ * Construct an empty ResourceSchema.
+ */
public ResourceSchema() {
}
+ /**
+ * Construct a ResourceSchema from a {@link Schema}
+ * @param pigSchema Schema to use
+ */
public ResourceSchema(Schema pigSchema) {
List<FieldSchema> pigSchemaFields = pigSchema.getFields();
fields = new ResourceFieldSchema[pigSchemaFields.size()];
@@ -185,6 +243,13 @@ public class ResourceSchema implements S
}
}
+ /**
+ * Only for use by Pig internal code.
+ * Construct a ResourceSchema from a {@link Schema}
+ * @param pigSchema Schema to use
+ * @param sortInfo information on how data is sorted
+ */
+ @InterfaceAudience.Private
public ResourceSchema(Schema pigSchema, SortInfo sortInfo) {
this(pigSchema);
if (sortInfo!=null && sortInfo.getSortColInfoList().size()!=0) {
@@ -206,6 +271,10 @@ public class ResourceSchema implements S
}
}
+ /**
+ * Get the version of this schema.
+ * @return version
+ */
public int getVersion() {
return version;
}
@@ -215,10 +284,18 @@ public class ResourceSchema implements S
return this;
}
+ /**
+ * Get field schema for each field
+ * @return array of field schemas.
+ */
public ResourceFieldSchema[] getFields() {
return fields;
}
+ /**
+ * Get all field names.
+ * @return array of field names
+ */
public String[] fieldNames() {
String[] names = new String[fields.length];
for (int i=0; i<fields.length; i++) {
@@ -227,32 +304,73 @@ public class ResourceSchema implements S
return names;
}
+ /**
+ * Set all the fields. If the supplied array is null the call is silently ignored and the
+ * existing fields are left unchanged; otherwise a copy of the array is stored.
+ * @param fields to use as fields in this schema
+ * @return this
+ */
public ResourceSchema setFields(ResourceFieldSchema[] fields) {
if (fields != null)
this.fields = Arrays.copyOf(fields, fields.length);
return this;
}
+ /**
+ * Get the sort keys for this data.
+ * @return array of ints. Each integer in the array represents the field number. So if the
+ * schema of the data is (a, b, c, d) and the data is sorted on c, b, the returned sort keys
+ * will be [2, 1]. Field numbers are zero based. If the data is not sorted a zero length
+ * array will be returned.
+ */
public int[] getSortKeys() {
return sortKeys;
}
+ /**
+ * Set the sort keys for this data. If the supplied array is null the call is silently
+ * ignored and the existing sort keys are left unchanged.
+ * @param sortKeys Each integer in the array represents the field number. So if the
+ * schema of the data is (a, b, c, d) and the data is sorted on c, b, the sort keys
+ * should be [2, 1]. Field numbers are zero based.
+ * @return this
+ */
public ResourceSchema setSortKeys(int[] sortKeys) {
if (sortKeys != null)
this.sortKeys = Arrays.copyOf(sortKeys, sortKeys.length);
return this;
}
+ /**
+ * Get order for sort keys.
+ * @return array of Order. This array will be the same length as the int[] array returned by
+ * {@link #getSortKeys}.
+ */
public Order[] getSortKeyOrders() {
return sortKeyOrders;
}
+ /**
+ * Set the order for each sort key. If the supplied array is null the call is silently
+ * ignored and the existing order is left unchanged.
+ * @param sortKeyOrders array of Order. Should be the same length as int[] passed to
+ * {@link #setSortKeys}.
+ * @return this
+ */
public ResourceSchema setSortKeyOrders(Order[] sortKeyOrders) {
if (sortKeyOrders != null)
this.sortKeyOrders = Arrays.copyOf(sortKeyOrders, sortKeyOrders.length);
return this;
}
+ /**
+ * Test whether two ResourceSchemas are the same. Two schemas are said to be the same if they
+ * are both null or
+ * have the same number of fields and for each field the name, type are the same. For fields
+ * that may have schemas (i.e. tuples) both schemas must be equal. Field
+ * descriptions are not used in testing equality.
+ * @return true if equal according to the above definition, otherwise false.
+ */
public static boolean equals(ResourceSchema rs1, ResourceSchema rs2) {
if (rs1 == null) {
return rs2 == null ? true : false;
Modified: hadoop/pig/trunk/src/org/apache/pig/ResourceStatistics.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/ResourceStatistics.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/ResourceStatistics.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/ResourceStatistics.java Wed Apr 14 23:44:16 2010
@@ -20,6 +20,17 @@ package org.apache.pig;
import java.io.Serializable;
import java.util.Arrays;
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
+
+/**
+ * A class that represents statistics about data to be loaded or stored. It is marked unstable
+ * because Pig does very little statistics collection at this point. If and when that
+ * functionality is added it is expected that this interface will change.
+ * @since Pig 0.7
+ */
+@InterfaceAudience.Public
+@InterfaceStability.Unstable
public class ResourceStatistics implements Cloneable {
/* Getters intentionally return mutable arrays instead of copies,
@@ -38,6 +49,9 @@ public class ResourceStatistics implemen
public Long avgRecordSize;
public ResourceFieldStatistics[] fields = new ResourceFieldStatistics[0];
+ /**
+ * Statistics for a given field in the data.
+ */
public static class ResourceFieldStatistics implements Serializable {
public static final long serialVersionUID = 1L;
@@ -46,28 +60,34 @@ public class ResourceStatistics implemen
public Long numDistinctValues; // number of distinct values represented in this field
- // We need some way to represent a histogram of values in the field,
- // as those will be useful. However, we can't count on being
- // able to hold such histograms in memory. Have to figure out
- // how they can be kept on disk and represented here.
-
- // for now.. don't create so many buckets you can't hold them in memory
-
- // an ordered array of the most common values,
- // in descending order of frequency
+ /**
+ * We need some way to represent a histogram of values in the field,
+ * as those will be useful. However, we can't count on being
+ * able to hold such histograms in memory. Have to figure out
+ * how they can be kept on disk and represented here.
+ *
+ * for now.. don't create so many buckets you can't hold them in memory
+ *
+ * an ordered array of the most common values,
+ * in descending order of frequency
+ */
public Object[] mostCommonValues = new Object[0];
- // an array that matches the mostCommonValues array, and lists
- // the frequencies of those values as a fraction (0 through 1) of
- // the total number of records
+ /**
+ * an array that matches the mostCommonValues array, and lists
+ * the frequencies of those values as a fraction (0 through 1) of
+ * the total number of records
+ */
public float[] mostCommonValuesFreq = new float[0];
- // an ordered array of values, from min val to max val
- // such that the number of records with values
- // between valueHistogram[i] and and valueHistogram[i+1] is
- // roughly equal for all values of i.
- // NOTE: if mostCommonValues is non-empty, the values in that array
- // should not be included in the histogram. Adjust accordingly.
+ /**
+ * an ordered array of values, from min val to max val
+ * such that the number of records with values
+ * between valueHistogram[i] and valueHistogram[i+1] is
+ * roughly equal for all values of i.
+ * NOTE: if mostCommonValues is non-empty, the values in that array
+ * should not be included in the histogram. Adjust accordingly.
+ */
public Object[] valueHistogram = new Object[0];
@@ -254,4 +274,4 @@ public class ResourceStatistics implemen
public Object clone() throws CloneNotSupportedException {
return super.clone();
}
-}
\ No newline at end of file
+}
Modified: hadoop/pig/trunk/src/org/apache/pig/SortColInfo.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/SortColInfo.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/SortColInfo.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/SortColInfo.java Wed Apr 14 23:44:16 2010
@@ -19,16 +19,17 @@ package org.apache.pig;
import java.io.Serializable;
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.impl.util.Utils;
/**
* A class representing information about a sort column in {@link SortInfo}
*/
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
public class SortColInfo implements Serializable {
- /**
- *
- */
private static final long serialVersionUID = 1L;
// name of sort column
Modified: hadoop/pig/trunk/src/org/apache/pig/SortInfo.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/SortInfo.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/SortInfo.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/SortInfo.java Wed Apr 14 23:44:16 2010
@@ -21,12 +21,17 @@ package org.apache.pig;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
+
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.impl.util.Utils;
/**
* Class to communicate sort column information based on
* order by statment's sort columns and schema
*/
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
public class SortInfo implements Serializable {
/**
@@ -39,14 +44,14 @@ public class SortInfo implements Seriali
List<SortColInfo> sortColInfoList;
/**
- * @param sortColInfoList
+ * @param sortColInfoList list of sortColInfo, one for each field in the data
*/
public SortInfo(List<SortColInfo> sortColInfoList){
this.sortColInfoList = sortColInfoList;
}
/**
- * @return the sortColInfoList
+ * @return a copy of the list of sortColInfo for this data
*/
public List<SortColInfo> getSortColInfoList() {
return new ArrayList<SortColInfo>(sortColInfoList);
@@ -66,7 +71,8 @@ public class SortInfo implements Seriali
}
/**
- * @return the isGloballySorted
+ * @return true if the data is globally sorted, false if it is sorted
+ * only within each part file.
*/
public boolean isGloballySorted() {
return isGloballySorted;
Modified: hadoop/pig/trunk/src/org/apache/pig/StandAloneParser.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/StandAloneParser.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/StandAloneParser.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/StandAloneParser.java Wed Apr 14 23:44:16 2010
@@ -32,6 +32,7 @@ import org.apache.pig.impl.logicalLayer.
import org.apache.pig.impl.logicalLayer.LogicalPlan;
import org.apache.pig.impl.logicalLayer.LogicalOperator;
+@Deprecated
public class StandAloneParser {
private static final Log log = LogFactory.getLog(StandAloneParser.class);
Modified: hadoop/pig/trunk/src/org/apache/pig/StoreFunc.java
URL: http://svn.apache.org/viewvc/hadoop/pig/trunk/src/org/apache/pig/StoreFunc.java?rev=934242&r1=934241&r2=934242&view=diff
==============================================================================
--- hadoop/pig/trunk/src/org/apache/pig/StoreFunc.java (original)
+++ hadoop/pig/trunk/src/org/apache/pig/StoreFunc.java Wed Apr 14 23:44:16 2010
@@ -24,17 +24,19 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
+
+import org.apache.pig.classification.InterfaceAudience;
+import org.apache.pig.classification.InterfaceStability;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.util.UDFContext;
/**
-* This abstract class is used to implement functions to write records
-* from a dataset.
-*
-*
+* StoreFuncs take records from Pig's processing and store them into a data store. Most frequently
+* this is an HDFS file, but it could also be an HBase instance, RDBMS, etc.
*/
-
+@InterfaceAudience.Public
+@InterfaceStability.Stable
public abstract class StoreFunc implements StoreFuncInterface {
/**
@@ -51,7 +53,6 @@ public abstract class StoreFunc implemen
* in the script, this would be the home directory -
* <pre>/user/<username> </pre>
* @return the absolute location based on the arguments passed
- * @throws IOException
* @throws IOException if the conversion is not possible
*/
@Override
@@ -96,7 +97,8 @@ public abstract class StoreFunc implemen
* check that a given schema is acceptable to it. For example, it
* can check that the correct partition keys are included;
* a storage function to be written directly to an OutputFormat can
- * make sure the schema will translate in a well defined way.
+ * make sure the schema will translate in a well defined way. Default implementation
+ * is a no-op.
* @param s to be checked
* @throws IOException if this schema is not acceptable. It should include
* a detailed error message indicating what is wrong with the schema.
@@ -108,15 +110,14 @@ public abstract class StoreFunc implemen
/**
* Initialize StoreFunc to write data. This will be called during
- * execution before the call to putNext.
+ * execution on the backend before the call to putNext.
* @param writer RecordWriter to use.
* @throws IOException if an exception occurs during initialization
*/
public abstract void prepareToWrite(RecordWriter writer) throws IOException;
/**
- * Write a tuple the output stream to which this instance was
- * previously bound.
+ * Write a tuple to the data store.
*
* @param t the tuple to store.
* @throws IOException if an exception occurs during the write
@@ -128,7 +129,10 @@ public abstract class StoreFunc implemen
* pass a unique signature to the {@link StoreFunc} which it can use to store
* information in the {@link UDFContext} which it needs to store between
* various method invocations in the front end and back end. This method
- * will be called before other methods in {@link StoreFunc}.
+ * will be called before other methods in {@link StoreFunc}. This is necessary
+ * because in a Pig Latin script with multiple stores, the different
+ * instances of store functions need to be able to find their (and only their)
+ * data in the UDFContext object. The default implementation is a no-op.
* @param signature a unique signature to identify this StoreFunc
*/
@Override
@@ -140,7 +144,7 @@ public abstract class StoreFunc implemen
* This method will be called by Pig if the job which contains this store
* fails. Implementations can clean up output locations in this method to
* ensure that no incorrect/incomplete results are left in the output location.
- * The implementation in {@link StoreFunc} deletes the output location if it
+ * The default implementation deletes the output location if it
* is a {@link FileSystem} location.
* @param location Location returned by
* {@link StoreFunc#relToAbsPathForStoreLocation(String, Path)}
@@ -155,9 +159,10 @@ public abstract class StoreFunc implemen
}
/**
- * Implementation for {@link #cleanupOnFailure(String, Job)}
- * @param location
- * @param job
+ * Implementation for {@link #cleanupOnFailure(String, Job)}. This removes a file
+ * from HDFS.
+ * @param location file name (or URI) of file to remove
+ * @param job Hadoop job, used to access the appropriate file system.
* @throws IOException
*/
public static void cleanupOnFailureImpl(String location, Job job)