You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pig.apache.org by ol...@apache.org on 2010/10/21 01:08:40 UTC
svn commit: r1025789 - in /pig/trunk: CHANGES.txt build.xml src/docs/src/documentation/content/xdocs/pigunit.xml test/org/apache/pig/pigunit/PigTest.java test/org/apache/pig/test/pigunit/TestPigTest.java

Author: olga
Date: Wed Oct 20 23:08:39 2010
New Revision: 1025789

URL: http://svn.apache.org/viewvc?rev=1025789&view=rev
Log:
PIG-1600: Docs update (romainr via olgan)

Modified:
    pig/trunk/CHANGES.txt
    pig/trunk/build.xml
    pig/trunk/src/docs/src/documentation/content/xdocs/pigunit.xml
    pig/trunk/test/org/apache/pig/pigunit/PigTest.java
    pig/trunk/test/org/apache/pig/test/pigunit/TestPigTest.java

Modified: pig/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/pig/trunk/CHANGES.txt?rev=1025789&r1=1025788&r2=1025789&view=diff
==============================================================================
--- pig/trunk/CHANGES.txt (original)
+++ pig/trunk/CHANGES.txt Wed Oct 20 23:08:39 2010
@@ -42,6 +42,8 @@ PIG-1249: Safe-guards against misconfigu
 
 IMPROVEMENTS
 
+PIG-1600: Docs update (romainr via olgan)
+
 PIG-1632: The core jar in the tarball contains the kitchen sink (eli via olgan)
 
 PIG-1617: 'group all' should always use one reducer (thejas)

Modified: pig/trunk/build.xml
URL: http://svn.apache.org/viewvc/pig/trunk/build.xml?rev=1025789&r1=1025788&r2=1025789&view=diff
==============================================================================
--- pig/trunk/build.xml (original)
+++ pig/trunk/build.xml Wed Oct 20 23:08:39 2010
@@ -669,8 +669,11 @@
     <target depends="compile-test" name="pigunit-jar" description="create the pigunit jar file">
         <echo> *** Creating pigunit.jar ***</echo>
       <jar destfile="${pigunit.jarfile}">
-        <fileset dir="${test.build.classes}/org/apache/pig/pigunit/"/>
-      	<zipfileset src="${ivy.lib.dir}/${commons-lang.jarfile}" />
+          <fileset dir="${test.build.classes}">
+              <include name="**/org/apache/pig/pigunit/**"/>
+              <include name="**/org/apache/pig/test/Util.**"/>
+          </fileset>
+      	  <zipfileset src="${ivy.lib.dir}/${commons-lang.jarfile}" />
       </jar>
     </target>
 

Modified: pig/trunk/src/docs/src/documentation/content/xdocs/pigunit.xml
URL: http://svn.apache.org/viewvc/pig/trunk/src/docs/src/documentation/content/xdocs/pigunit.xml?rev=1025789&r1=1025788&r2=1025789&view=diff
==============================================================================
--- pig/trunk/src/docs/src/documentation/content/xdocs/pigunit.xml (original)
+++ pig/trunk/src/docs/src/documentation/content/xdocs/pigunit.xml Wed Oct 20 23:08:39 2010
@@ -20,202 +20,175 @@
 
     <section>
       <title>Overview</title>
-      <p>The goal is to provide a simple xUnit framework that enables our Pig scripts to be easily:
+      <p>PigUnit is a simple xUnit framework that enables you to easily test your Pig scripts.
+        With
+        PigUnit you can perform unit testing, regression testing, and rapid prototyping.
+        No cluster
+        set up is required if you run Pig in local mode.
       </p>
-      <ol>
-        <li>
-          <p>unit tested</p>
-        </li>
-        <li>
-          <p>regression tested</p>
-        </li>
-        <li>
-          <p>quickly prototyped</p>
-        </li>
-      </ol>
-
-      <p>No cluster set up is required.</p>
     </section>
 
     <section>
       <title>PigUnit Example</title>
-      <p>Computing top queries, specifying the input data and expected output of the script.</p>
-      <p>Java test</p>
-      <source>
+      <p>We want to compute a top N of the most common queries.
+        The Pig script is basic and very
+        similar to the Query Phrase Popularity in the Pig tutorial.
+        It
+        expects as input a file of
+        queries and a parameter n
+        (n is 2 in our case in order to do a top 2). 
+      </p>
+      <p>Setting up a test for this script is simple as the argument and the input data are
+        specified by just two arrays of text. It is the same for the expected output of the
+        script
+        that will be compared to the actual result of the execution of the Pig script. 
+      </p>
+      <p>
+        Many examples are available in the
+        <a
+          href="http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/pigunit/TestPigTest.java"
+        >PigUnit tests</a>
+        .
+      </p>
+
+      <section>
+        <title>Java test</title>
+        <source>
   @Test
-  public void testTop3Queries() {
+  public void testTop2Queries() {
     String[] args = {
-        "n=3",        
+        "n=2",
         };
-    test = new PigTest("top_queries.pig", args);
-
+ 
+    PigTest test = new PigTest("top_queries.pig", args);
+ 
     String[] input = {
-        "yahoo\t10",
-        "twitter\t7",
-        "facebook\t10",
-        "yahoo\t15",
-        "facebook\t5",
-        ....
+        "yahoo",
+        "yahoo",
+        "yahoo",
+        "twitter",
+        "facebook",
+        "facebook",
+        "linkedin",
     };
-
+ 
     String[] output = {
-        "(yahoo,25L)",
-        "(facebook,15L)",
-        "(twitter,7L)",
+        "(yahoo,3)",
+        "(facebook,2)",
     };
-
+ 
     test.assertOutput("data", input, "queries_limit", output);
   }
- </source>
-      <p>top_queries.pig</p>
-      <source>
+</source>
+      </section>
+
+      <section>
+        <title>top_queries.pig</title>
+        <source>
 data =
-    LOAD '$input'
-    AS (query:CHARARRAY, count:INT);
+    LOAD 'input'
+    AS (query:CHARARRAY);
      
-    ... 
+queries_group =
+    GROUP data
+    BY query; 
     
-queries_sum = 
+queries_count = 
     FOREACH queries_group 
     GENERATE 
         group AS query, 
-        SUM(queries.count) AS count;
+        COUNT(data) AS total;
         
-    ...
+queries_ordered =
+    ORDER queries_count
+    BY total DESC, query;
             
-queries_limit = LIMIT queries_ordered $n;
+queries_limit =
+    LIMIT queries_ordered $n;
 
-STORE queries_limit INTO '$output';
+STORE queries_limit INTO 'output';
 </source>
+      </section>
 
-      <p>You just need two jar files in your classpath:</p>
-      <ol>
-        <li>pig.jar</li>
-        <li>pigunit.jar</li>
-      </ol>
+      <section>
+        <title>Run</title>
 
+        <p>Then the test can be executed by JUnit (or any other Java testing framework). It
+          requires:
+        </p>
+        <ol>
+          <li>pig.jar</li>
+          <li>pigunit.jar</li>
+        </ol>
+
+        <p>It takes about 25s to run and should pass.
+          In case of error (for example, if you change the
+          parameter n to n=3),
+          the diff of output is displayed:
+        </p>
+
+        <source>
+junit.framework.ComparisonFailure: null expected:&lt;...ahoo,3)
+(facebook,2)[]&gt; but was:&lt;...ahoo,3)
+(facebook,2)[
+(linkedin,1)]&gt;
+        at junit.framework.Assert.assertEquals(Assert.java:81)
+        at junit.framework.Assert.assertEquals(Assert.java:87)
+        at org.apache.pig.pigunit.PigTest.assertEquals(PigTest.java:272)
+</source>
+      </section>
+    </section>
+
+    <section>
+      <title>Running in Local Mode</title>
       <p>
-        Many examples are available in the
-        <a
-          href="http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/pigunit/TestPigTest.java"
-        >PigUnit tests</a>.
+        Pig runs in local mode by default.
+        Local mode is fast and enables you to use your local file
+        system as the HDFS cluster.
+        Local mode does not require a real cluster but a new local one is
+        created each time. 
       </p>
     </section>
 
     <section>
-      <title>Cluster</title>
-
-      <p>They are 2 main modes:</p>
-      <ol>
-        <li>LOCAL</li>
-        <li>MAPREDUCE</li>
-      </ol>
-
-      <section>
-        <title>LOCAL</title>
-        <p>
-          This is using the local mode of Pig.
-          It will be used by default.
-        </p>
-
-        <p>It will go fast and use your local file system as a HDFS cluster.</p>
-      </section>
-
+      <title>Running in Mapreduce Mode</title>
+      <p>Pig also runs in mapreduce mode.
+        This mode requires you to use a Hadoop cluster.
+        The cluster
+        you select must be specified in the CLASSPATH
+        (similar to the HADOOP_CONF_DIR variable).
+      </p>
 
-      <section>
-        <title>MAPREDUCE</title>
-        <p>This is using a real Hadoop cluster.
-          The cluster selected will be the first specified in
-          the CLASSPATH (same
-          way as the HADOOP_CONF_DIR variable works). You
-          can also choose to have
-          a test cluster automatically
-          starting/stopping or you cab reuse an already
-          running cluster.
-        </p>
+      <p>Notice that PigUnit comes with a standalone MiniCluster that
+        can be started
+        externally with:
+      </p>
 
-        <section>
-          <title>On demand cluster</title>
-          <p>
-            The default mode is using a local MiniCluster that is started at the very beginning
-            and
-            shutdown automatically at the end of the test run.
-            No setup needed which is really
-            helpful. The cluster will contain no data each time it is
-            started, but data can be
-            copied
-            to it as shown in the examples.
-
-            You can select this mode by setting the Java property
-            <code>"pigunit.exectype.minicluster"</code>
-            to "true".
-            </p>
-            <p>It can be set in Java or on the command line:</p>
-            <ol>
-              <li>
-                <code>System.setProperty("pigunit.exectype.cluster", "true");</code>
-              </li>
-              <li>
-                <code>-Dpigunit.exectype.cluste=true</code>
-              </li>
-            </ol>
-            <p>
-            The
-            <code>HADOOP_CONF_DIR</code>
-            path will be
-            <code>~/pigtest/conf</code>
-            and it will be required in the CLASSPATH.
-            The path to the log directory is set by the
-            Java property
-            <code>"hadoop.log.dir"</code>
-            (default is "/tmp/pigunit").
-          </p>
-        </section>
-
-        <section>
-          <title>Existing cluster</title>
-          <p>
-            If
-            <code>"pigunit.exectype.cluster"</code>
-            property is set, the first xml configuration of an Hadoop cluster found in the
-            CLASSPATH
-            will be used.
-
-            Notice that PigUnit comes with a standalone MiniCluster that
-            can be started
-            externally with:
-          </p>
-          <source>
+      <source>
 java -cp .../pig.jar:.../pigunit.jar org.apache.pig.pigunit.MiniClusterRunner
 </source>
-          <p>This is really useful when doing some prototyping in order to have a test cluster
-            ready.</p>
-        </section>
-      </section>
+      <p>This is useful when doing some prototyping in order to have a test cluster
+        ready.
+     </p>
     </section>
 
     <section>
-      <title>Building</title>
-      <p>In order to compile pigunit.jar, go in pig trunk:</p>
+      <title>Building PigUnit</title>
+      <p>To compile PigUnit (pigunit.jar), run this command from the Pig trunk:</p>
       <source>
-$pig_trunk ant compile-test
-$pig_trunk ant
 $pig_trunk ant pigunit-jar   
 </source>
     </section>
 
     <section>
-      <title>Troubleshooting</title>
-      <p>Common problems</p>
+      <title>Troubleshooting Tips</title>
+      <p>Common problems you may encounter are discussed below.</p>
       <section>
-        <title>CLASSPATH in MAPREDUCE mode</title>
-        <p>When used in MAPREDUCE mode, do not forget the HADOOP_CONF_DIR of your cluster in
-          your
-          CLASSPATH.</p>
+        <title>Classpath in Mapreduce mode</title>
+        <p>When using PigUnit in mapreduce mode, be sure to include the $HADOOP_CONF_DIR of the
+          cluster in your CLASSPATH.</p>
         <p>
-          It is
-          <code>~/pigtest/conf</code>
-          by default
+          The default value is ~/pigtest/conf.
         </p>
         <source>
 org.apache.pig.backend.executionengine.ExecException: ERROR 4010: Cannot find hadoop configurations in classpath (neither hadoop-site.xml nor core-site.xml was found in the classpath).If you plan to use local mode, please put -x local option in command line
@@ -223,7 +196,7 @@ org.apache.pig.backend.executionengine.E
       </section>
 
       <section>
-        <title>UDF jars not found</title>
+        <title>UDF jars Not Found</title>
         <p>This error means that you are missing some jars in your test environment.</p>
         <source>
 WARN util.JarManager: Couldn't find the jar for org.apache.pig.piggybank.evaluation.string.LOWER, skip it
@@ -231,10 +204,9 @@ WARN util.JarManager: Couldn't find the 
       </section>
 
       <section>
-        <title>STORING data</title>
-        <p>Currently pig is dropping all the STORE/DUMP commands but you can tell PigUnit to
-          keep
-          them and execute the script.</p>
+        <title>Storing data</title>
+        <p>Pig currently drops all STORE and DUMP commands. You can tell PigUnit to keep the
+          commands and execute the script:</p>
         <source>
 test = new PigTest(PIG_SCRIPT, args);   
 test.unoverride("STORE");
@@ -244,26 +216,23 @@ test.runScript();
 
       <section>
         <title>Cache archive</title>
-        <p>It works, your test environment will need to have the cache archive options
-          specified by
-          Java properties or in an additional XML configuration in its
-          CLASSPATH.</p>
-        <p>If you use a local cluster, you will need to set the required environment
-          variables before
-          starting it, e.g.</p>
+        <p>For cache archive to work, your test environment needs to have the cache archive options
+          specified by Java properties or in an additional XML configuration in its CLASSPATH.</p>
+        <p>If you use a local cluster, you need to set the required environment variables before
+          starting it:</p>
         <source>export LD_LIBRARY_PATH=/home/path/to/lib</source>
       </section>
     </section>
 
     <section>
-      <title>Future</title>
+      <title>Future Enhancements</title>
       <p>Improvement and other components based on PigUnit that could be built later.</p>
-      <p>We could build on top of PigTest a PigTestCase and PigTestSuite in order to have:</p>
+      <p>For example, we could build a PigTestCase and PigTestSuite on top of PigTest to:</p>
       <ol>
-        <li>notion of workspaces for each test</li>
-        <li>removing the boiler plate code appearing when there is more than one test
-          methods</li>
-        <li>standalone utility that reads test configuration and generates a test report...</li>
+        <li>Add the notion of workspaces for each test.</li>
+        <li>Remove the boilerplate code that appears when there is more than one test method.</li>
+        <li>Add a standalone utility that reads test configurations and generates a test report.
+        </li>
       </ol>
     </section>
   </body>

Modified: pig/trunk/test/org/apache/pig/pigunit/PigTest.java
URL: http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/pigunit/PigTest.java?rev=1025789&r1=1025788&r2=1025789&view=diff
==============================================================================
--- pig/trunk/test/org/apache/pig/pigunit/PigTest.java (original)
+++ pig/trunk/test/org/apache/pig/pigunit/PigTest.java Wed Oct 20 23:08:39 2010
@@ -36,7 +36,6 @@ import org.apache.pig.data.DataType;
 import org.apache.pig.data.Tuple;
 import org.apache.pig.impl.logicalLayer.schema.Schema;
 import org.apache.pig.pigunit.pig.PigServer;
-import org.apache.pig.test.MiniCluster;
 import org.apache.pig.tools.parameters.ParameterSubstitutionPreprocessor;
 import org.apache.pig.tools.parameters.ParseException;
 
@@ -61,7 +60,6 @@ public class PigTest {
   private static PigServer pig;
   private static Cluster cluster;
   private static final Logger LOG = Logger.getLogger(PigTest.class);
-  private static final String EXEC_MINI_CLUSTER = "pigunit.exectype.minicluster";
   private static final String EXEC_CLUSTER = "pigunit.exectype.cluster";
 
   /**
@@ -121,14 +119,7 @@ public class PigTest {
    */
   public static Cluster getCluster() throws ExecException {
     if (cluster == null) {
-      LOG.info("Using mini cluster mode");
-      if (System.getProperties().containsKey(EXEC_MINI_CLUSTER)) {
-        if (! System.getProperties().containsKey("hadoop.log.dir")) {
-          System.setProperty("hadoop.log.dir", "/tmp/pigunit");
-        }
-        MiniCluster.buildCluster();
-        pig = new PigServer(ExecType.MAPREDUCE);
-      } else if (System.getProperties().containsKey(EXEC_CLUSTER)) {
+      if (System.getProperties().containsKey(EXEC_CLUSTER)) {
         LOG.info("Using cluster mode");
         pig = new PigServer(ExecType.MAPREDUCE);
       } else {
@@ -149,6 +140,8 @@ public class PigTest {
    * @throws ParseException The pig script could not have all its variables substituted.
    */
   protected void registerScript() throws IOException, ParseException {
+    PigTest.getCluster();
+
     BufferedReader pigIStream = new BufferedReader(new StringReader(this.originalTextPigScript));
     StringWriter pigOStream = new StringWriter();
 
@@ -156,7 +149,7 @@ public class PigTest {
     ps.genSubstitutedFile(pigIStream, pigOStream, args, argFiles);
 
     String substitutedPig = pigOStream.toString();
-    System.out.println(substitutedPig);
+    LOG.info(substitutedPig);
 
     File f = File.createTempFile("tmp", "pigunit");
     PrintWriter pw = new PrintWriter(f);

Modified: pig/trunk/test/org/apache/pig/test/pigunit/TestPigTest.java
URL: http://svn.apache.org/viewvc/pig/trunk/test/org/apache/pig/test/pigunit/TestPigTest.java?rev=1025789&r1=1025788&r2=1025789&view=diff
==============================================================================
--- pig/trunk/test/org/apache/pig/test/pigunit/TestPigTest.java (original)
+++ pig/trunk/test/org/apache/pig/test/pigunit/TestPigTest.java Wed Oct 20 23:08:39 2010
@@ -34,7 +34,7 @@ import org.junit.Test;
  * <ul>
  *   <li>pig.jar</li>
  *   <li>pigunit.jar</li>
- *   <li>hadoop_conf_dir to current/future cluster if not using LOCAL mode</li>
+ *   <li>$HADOOP_CONF_DIR to current/future cluster if not using LOCAL mode</li>
  * </ul>
  */
 public class TestPigTest {