You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by ol...@apache.org on 2010/10/21 01:08:40 UTC
svn commit: r1025789 - in /pig/trunk: CHANGES.txt build.xml
src/docs/src/documentation/content/xdocs/pigunit.xml
test/org/apache/pig/pigunit/PigTest.java
test/org/apache/pig/test/pigunit/TestPigTest.java
Author: olga
Date: Wed Oct 20 23:08:39 2010
New Revision: 1025789
URL: http://svn.apache.org/viewvc?rev=1025789&view=rev
Log:
PIG-1600: Docs update (romainr via olgan)
Modified:
pig/trunk/CHANGES.txt
pig/trunk/build.xml
pig/trunk/src/docs/src/documentation/content/xdocs/pigunit.xml
pig/trunk/test/org/apache/pig/pigunit/PigTest.java
pig/trunk/test/org/apache/pig/test/pigunit/TestPigTest.java
Modified: pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1025789&r1=1025788&r2=1025789&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Wed Oct 20 23:08:39 2010
@@ -42,6 +42,8 @@ PIG-1249: Safe-guards against misconfigu
IMPROVEMENTS
+PIG-1600: Docs update (romainr via olgan)
+
PIG-1632: The core jar in the tarball contains the kitchen sink (eli via olgan)
PIG-1617: 'group all' should always use one reducer (thejas)
Modified: pig/trunk/build.xml
URL: http://svn.apache.org/viewvc/pig/trunk/build.xml?rev=1025789&r1=1025788&r2=1025789&view=diff
==============================================================================
--- pig/trunk/build.xml (original)
+++ pig/trunk/build.xml Wed Oct 20 23:08:39 2010
@@ -669,8 +669,11 @@
<target depends="compile-test" name="pigunit-jar" description="create the pigunit jar file">
<echo> *** Creating pigunit.jar ***</echo>
<jar destfile="${pigunit.jarfile}">
- <fileset dir="${test.build.classes}/org/apache/pig/pigunit/"/>
- <zipfileset src="${ivy.lib.dir}/${commons-lang.jarfile}" />
+ <fileset dir="${test.build.classes}">
+ <include name="**/org/apache/pig/pigunit/**"/>
+ <include name="**/org/apache/pig/test/Util.**"/>
+ </fileset>
+ <zipfileset src="${ivy.lib.dir}/${commons-lang.jarfile}" />
</jar>
</target>
Modified: pig/trunk/src/docs/src/documentation/content/xdocs/pigunit.xml
URL: http://svn.apache.org/viewvc/pig/trunk/src/docs/src/documentation/content/xdocs/pigunit.xml?rev=1025789&r1=1025788&r2=1025789&view=diff
==============================================================================
--- pig/trunk/src/docs/src/documentation/content/xdocs/pigunit.xml (original)
+++ pig/trunk/src/docs/src/documentation/content/xdocs/pigunit.xml Wed Oct 20 23:08:39 2010
@@ -20,202 +20,175 @@
<section>
<title>Overview</title>
- <p>The goal is to provide a simple xUnit framework that enables our Pig scripts to be easily:
+ <p>PigUnit is a simple xUnit framework that enables you to easily test your Pig scripts.
+ With
+ PigUnit you can perform unit testing, regression testing, and rapid prototyping.
+ No cluster
+ set up is required if you run Pig in local mode.
</p>
- <ol>
- <li>
- <p>unit tested</p>
- </li>
- <li>
- <p>regression tested</p>
- </li>
- <li>
- <p>quickly prototyped</p>
- </li>
- </ol>
-
- <p>No cluster set up is required.</p>
</section>
<section>
<title>PigUnit Example</title>
- <p>Computing top queries, specifying the input data and expected output of the script.</p>
- <p>Java test</p>
- <source>
+ <p>We want to compute the top N most common queries.
+ The Pig script is basic and very
+ similar to the Query Phrase Popularity in the Pig tutorial.
+ It
+ expects as input a file of
+ queries and a parameter n
+ (n is 2 in our case in order to do a top 2).
+ </p>
+ <p>Setting up a test for this script is simple because the arguments and the input data are
+ each specified by an array of text. The expected output of the
+ script is specified the same way and
+ is compared to the actual result of executing the Pig script.
+ </p>
+ <p>
+ Many examples are available in the
+ <a
+ href="http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/pigunit/TestPigTest.java"
+ >PigUnit tests</a>
+ .
+ </p>
+
+ <section>
+ <title>Java test</title>
+ <source>
@Test
- public void testTop3Queries() {
+ public void testTop2Queries() {
String[] args = {
- "n=3",
+ "n=2",
};
- test = new PigTest("top_queries.pig", args);
-
+
+ PigTest test = new PigTest("top_queries.pig", args);
+
String[] input = {
- "yahoo\t10",
- "twitter\t7",
- "facebook\t10",
- "yahoo\t15",
- "facebook\t5",
- ....
+ "yahoo",
+ "yahoo",
+ "yahoo",
+ "twitter",
+ "facebook",
+ "facebook",
+ "linkedin",
};
-
+
String[] output = {
- "(yahoo,25L)",
- "(facebook,15L)",
- "(twitter,7L)",
+ "(yahoo,3)",
+ "(facebook,2)",
};
-
+
test.assertOutput("data", input, "queries_limit", output);
}
- </source>
- <p>top_queries.pig</p>
- <source>
+</source>
+ </section>
+
+ <section>
+ <title>top_queries.pig</title>
+ <source>
data =
- LOAD '$input'
- AS (query:CHARARRAY, count:INT);
+ LOAD 'input'
+ AS (query:CHARARRAY);
- ...
+queries_group =
+ GROUP data
+ BY query;
-queries_sum =
+queries_count =
FOREACH queries_group
GENERATE
group AS query,
- SUM(queries.count) AS count;
+ COUNT(data) AS total;
- ...
+queries_ordered =
+ ORDER queries_count
+ BY total DESC, query;
-queries_limit = LIMIT queries_ordered $n;
+queries_limit =
+ LIMIT queries_ordered $n;
-STORE queries_limit INTO '$output';
+STORE queries_limit INTO 'output';
</source>
+ </section>
- <p>You just need two jar files in your classpath:</p>
- <ol>
- <li>pig.jar</li>
- <li>pigunit.jar</li>
- </ol>
+ <section>
+ <title>Run</title>
+ <p>Then the test can be executed by JUnit (or any other Java testing framework). It
+ requires:
+ </p>
+ <ol>
+ <li>pig.jar</li>
+ <li>pigunit.jar</li>
+ </ol>
+
+ <p>The test takes about 25 seconds to run and should pass.
+ If it fails (for example, if you change the
+ parameter n to n=3),
+ a diff of the expected and actual output is displayed:
+ </p>
+
+ <source>
+junit.framework.ComparisonFailure: null expected:<...ahoo,3)
+(facebook,2)[]> but was:<...ahoo,3)
+(facebook,2)[
+(linkedin,1)]>
+ at junit.framework.Assert.assertEquals(Assert.java:81)
+ at junit.framework.Assert.assertEquals(Assert.java:87)
+ at org.apache.pig.pigunit.PigTest.assertEquals(PigTest.java:272)
+</source>
+ </section>
+ </section>
+
+ <section>
+ <title>Running in Local Mode</title>
<p>
- Many examples are available in the
- <a
- href="http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/pigunit/TestPigTest.java"
- >PigUnit tests</a>.
+ Pig runs in local mode by default.
+ Local mode is fast and enables you to use your local file
+ system as the HDFS cluster.
+ Local mode does not require a real cluster but a new local one is
+ created each time.
</p>
</section>
<section>
- <title>Cluster</title>
-
- <p>They are 2 main modes:</p>
- <ol>
- <li>LOCAL</li>
- <li>MAPREDUCE</li>
- </ol>
-
- <section>
- <title>LOCAL</title>
- <p>
- This is using the local mode of Pig.
- It will be used by default.
- </p>
-
- <p>It will go fast and use your local file system as a HDFS cluster.</p>
- </section>
-
+ <title>Running in Mapreduce Mode</title>
+ <p>Pig also runs in mapreduce mode.
+ This mode requires you to use a Hadoop cluster.
+ The cluster
+ you select must be specified in the CLASSPATH
+ (similar to the HADOOP_CONF_DIR variable).
+ </p>
- <section>
- <title>MAPREDUCE</title>
- <p>This is using a real Hadoop cluster.
- The cluster selected will be the first specified in
- the CLASSPATH (same
- way as the HADOOP_CONF_DIR variable works). You
- can also choose to have
- a test cluster automatically
- starting/stopping or you cab reuse an already
- running cluster.
- </p>
+ <p>Note that PigUnit comes with a standalone MiniCluster that
+ can be started
+ externally with:
+ </p>
- <section>
- <title>On demand cluster</title>
- <p>
- The default mode is using a local MiniCluster that is started at the very beginning
- and
- shutdown automatically at the end of the test run.
- No setup needed which is really
- helpful. The cluster will contain no data each time it is
- started, but data can be
- copied
- to it as shown in the examples.
-
- You can select this mode by setting the Java property
- <code>"pigunit.exectype.minicluster"</code>
- to "true".
- </p>
- <p>It can be set in Java or on the command line:</p>
- <ol>
- <li>
- <code>System.setProperty("pigunit.exectype.cluster", "true");</code>
- </li>
- <li>
- <code>-Dpigunit.exectype.cluste=true</code>
- </li>
- </ol>
- <p>
- The
- <code>HADOOP_CONF_DIR</code>
- path will be
- <code>~/pigtest/conf</code>
- and it will be required in the CLASSPATH.
- The path to the log directory is set by the
- Java property
- <code>"hadoop.log.dir"</code>
- (default is "/tmp/pigunit").
- </p>
- </section>
-
- <section>
- <title>Existing cluster</title>
- <p>
- If
- <code>"pigunit.exectype.cluster"</code>
- property is set, the first xml configuration of an Hadoop cluster found in the
- CLASSPATH
- will be used.
-
- Notice that PigUnit comes with a standalone MiniCluster that
- can be started
- externally with:
- </p>
- <source>
+ <source>
java -cp .../pig.jar:.../pigunit.jar org.apache.pig.pigunit.MiniClusterRunner
</source>
- <p>This is really useful when doing some prototyping in order to have a test cluster
- ready.</p>
- </section>
- </section>
+ <p>This is useful when doing some prototyping in order to have a test cluster
+ ready.
+ </p>
</section>
<section>
- <title>Building</title>
- <p>In order to compile pigunit.jar, go in pig trunk:</p>
+ <title>Building PigUnit</title>
+ <p>To compile PigUnit (pigunit.jar), run this command from the Pig trunk:</p>
<source>
-$pig_trunk ant compile-test
-$pig_trunk ant
$pig_trunk ant pigunit-jar
</source>
</section>
<section>
- <title>Troubleshooting</title>
- <p>Common problems</p>
+ <title>Troubleshooting Tips</title>
+ <p>Common problems you may encounter are discussed below.</p>
<section>
- <title>CLASSPATH in MAPREDUCE mode</title>
- <p>When used in MAPREDUCE mode, do not forget the HADOOP_CONF_DIR of your cluster in
- your
- CLASSPATH.</p>
+ <title>Classpath in Mapreduce mode</title>
+ <p>When using PigUnit in mapreduce mode, be sure to include the $HADOOP_CONF_DIR of the
+ cluster in your CLASSPATH.</p>
<p>
- It is
- <code>~/pigtest/conf</code>
- by default
+ The default value is ~/pigtest/conf.
</p>
<source>
org.apache.pig.backend.executionengine.ExecException: ERROR 4010: Cannot find hadoop configurations in classpath (neither hadoop-site.xml nor core-site.xml was found in the classpath).If you plan to use local mode, please put -x local option in command line
@@ -223,7 +196,7 @@ org.apache.pig.backend.executionengine.E
</section>
<section>
- <title>UDF jars not found</title>
+ <title>UDF jars Not Found</title>
<p>This error means that you are missing some jars in your test environment.</p>
<source>
WARN util.JarManager: Couldn't find the jar for org.apache.pig.piggybank.evaluation.string.LOWER, skip it
@@ -231,10 +204,9 @@ WARN util.JarManager: Couldn't find the
</section>
<section>
- <title>STORING data</title>
- <p>Currently pig is dropping all the STORE/DUMP commands but you can tell PigUnit to
- keep
- them and execute the script.</p>
+ <title>Storing data</title>
+ <p>Pig currently drops all STORE and DUMP commands. You can tell PigUnit to keep the
+ commands and execute the script:</p>
<source>
test = new PigTest(PIG_SCRIPT, args);
test.unoverride("STORE");
@@ -244,26 +216,23 @@ test.runScript();
<section>
<title>Cache archive</title>
- <p>It works, your test environment will need to have the cache archive options
- specified by
- Java properties or in an additional XML configuration in its
- CLASSPATH.</p>
- <p>If you use a local cluster, you will need to set the required environment
- variables before
- starting it, e.g.</p>
+ <p>For cache archive to work, your test environment needs to have the cache archive options
+ specified by Java properties or in an additional XML configuration in its CLASSPATH.</p>
+ <p>If you use a local cluster, you need to set the required environment variables before
+ starting it:</p>
<source>export LD_LIBRARY_PATH=/home/path/to/lib</source>
</section>
</section>
<section>
- <title>Future</title>
+ <title>Future Enhancements</title>
<p>Improvements and other components based on PigUnit could be built later.</p>
- <p>We could build on top of PigTest a PigTestCase and PigTestSuite in order to have:</p>
+ <p>For example, we could build a PigTestCase and PigTestSuite on top of PigTest to:</p>
<ol>
- <li>notion of workspaces for each test</li>
- <li>removing the boiler plate code appearing when there is more than one test
- methods</li>
- <li>standalone utility that reads test configuration and generates a test report...</li>
+ <li>Add the notion of workspaces for each test.</li>
+ <li>Remove the boilerplate code that appears when there is more than one test method.</li>
+ <li>Add a standalone utility that reads test configurations and generates a test report.
+ </li>
</ol>
</section>
</body>
Modified: pig/trunk/test/org/apache/pig/pigunit/PigTest.java
URL: http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/pigunit/PigTest.java?rev=1025789&r1=1025788&r2=1025789&view=diff
==============================================================================
--- pig/trunk/test/org/apache/pig/pigunit/PigTest.java (original)
+++ pig/trunk/test/org/apache/pig/pigunit/PigTest.java Wed Oct 20 23:08:39 2010
@@ -36,7 +36,6 @@ import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.pigunit.pig.PigServer;
-import org.apache.pig.test.MiniCluster;
import org.apache.pig.tools.parameters.ParameterSubstitutionPreprocessor;
import org.apache.pig.tools.parameters.ParseException;
@@ -61,7 +60,6 @@ public class PigTest {
private static PigServer pig;
private static Cluster cluster;
private static final Logger LOG = Logger.getLogger(PigTest.class);
- private static final String EXEC_MINI_CLUSTER = "pigunit.exectype.minicluster";
private static final String EXEC_CLUSTER = "pigunit.exectype.cluster";
/**
@@ -121,14 +119,7 @@ public class PigTest {
*/
public static Cluster getCluster() throws ExecException {
if (cluster == null) {
- LOG.info("Using mini cluster mode");
- if (System.getProperties().containsKey(EXEC_MINI_CLUSTER)) {
- if (! System.getProperties().containsKey("hadoop.log.dir")) {
- System.setProperty("hadoop.log.dir", "/tmp/pigunit");
- }
- MiniCluster.buildCluster();
- pig = new PigServer(ExecType.MAPREDUCE);
- } else if (System.getProperties().containsKey(EXEC_CLUSTER)) {
+ if (System.getProperties().containsKey(EXEC_CLUSTER)) {
LOG.info("Using cluster mode");
pig = new PigServer(ExecType.MAPREDUCE);
} else {
@@ -149,6 +140,8 @@ public class PigTest {
* @throws ParseException The pig script could not have all its variables substituted.
*/
protected void registerScript() throws IOException, ParseException {
+ PigTest.getCluster();
+
BufferedReader pigIStream = new BufferedReader(new StringReader(this.originalTextPigScript));
StringWriter pigOStream = new StringWriter();
@@ -156,7 +149,7 @@ public class PigTest {
ps.genSubstitutedFile(pigIStream, pigOStream, args, argFiles);
String substitutedPig = pigOStream.toString();
- System.out.println(substitutedPig);
+ LOG.info(substitutedPig);
File f = File.createTempFile("tmp", "pigunit");
PrintWriter pw = new PrintWriter(f);
Modified: pig/trunk/test/org/apache/pig/test/pigunit/TestPigTest.java
URL: http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/pigunit/TestPigTest.java?rev=1025789&r1=1025788&r2=1025789&view=diff
==============================================================================
--- pig/trunk/test/org/apache/pig/test/pigunit/TestPigTest.java (original)
+++ pig/trunk/test/org/apache/pig/test/pigunit/TestPigTest.java Wed Oct 20 23:08:39 2010
@@ -34,7 +34,7 @@ import org.junit.Test;
* <ul>
* <li>pig.jar</li>
* <li>pigunit.jar</li>
- * <li>hadoop_conf_dir to current/future cluster if not using LOCAL mode</li>
+ * <li>$HADOOP_CONF_DIR to current/future cluster if not using LOCAL mode</li>
* </ul>
*/
public class TestPigTest {