Posted to mapreduce-commits@hadoop.apache.org by am...@apache.org on 2010/09/13 07:30:29 UTC

svn commit: r996420 - in /hadoop/mapreduce/trunk: ./ src/docs/src/documentation/content/xdocs/ src/tools/org/apache/hadoop/tools/rumen/

Author: amareshwari
Date: Mon Sep 13 05:30:29 2010
New Revision: 996420

URL: http://svn.apache.org/viewvc?rev=996420&view=rev
Log:
MAPREDUCE-1918. Adds documentation to Rumen. Contributed by Amar Kamat

Added:
    hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/rumen.xml
    hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/package-info.java
Modified:
    hadoop/mapreduce/trunk/CHANGES.txt
    hadoop/mapreduce/trunk/build.xml
    hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/site.xml
    hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/ClusterStory.java
    hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/DeskewedJobTraceReader.java
    hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/JobHistoryParserFactory.java
    hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/Node.java
    hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/TaskAttemptInfo.java

Modified: hadoop/mapreduce/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/CHANGES.txt?rev=996420&r1=996419&r2=996420&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/CHANGES.txt (original)
+++ hadoop/mapreduce/trunk/CHANGES.txt Mon Sep 13 05:30:29 2010
@@ -293,6 +293,8 @@ Trunk (unreleased changes)
     when gridmix.output.directory is not defined. (Ravi Gummadi via
     amareshwari)
 
+    MAPREDUCE-1918. Adds documentation to Rumen. (Amar Kamat via amareshwari)
+
 Release 0.21.0 - Unreleased
 
   INCOMPATIBLE CHANGES

Modified: hadoop/mapreduce/trunk/build.xml
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/build.xml?rev=996420&r1=996419&r2=996420&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/build.xml (original)
+++ hadoop/mapreduce/trunk/build.xml Mon Sep 13 05:30:29 2010
@@ -945,6 +945,7 @@
       >
         <packageset dir="${mapred.src.dir}"/>
     	<packageset dir="${examples.dir}"/>
+    	<packageset dir="${tools.src}"/>
 
        <packageset dir="src/contrib/data_join/src/java"/>
        <packageset dir="src/contrib/gridmix/src/java"/>
@@ -967,6 +968,7 @@
 
        <group title="Packages" packages="org.apache.*"/>
        <group title="Libraries" packages="org.apache.hadoop.mapred.lib*:org.apache.hadoop.mapreduce.lib*"/>
+       <group title="Tools" packages="org.apache.hadoop.tools"/>
        <group title="Examples" packages="org.apache.hadoop.examples*"/>
 
        <group title="contrib: DataJoin" packages="org.apache.hadoop.contrib.utils.join*"/>
@@ -1011,6 +1013,7 @@
        <packageset dir="src/contrib/gridmix/src/java"/>
        <packageset dir="src/contrib/index/src/java"/>
        <packageset dir="src/contrib/streaming/src/java"/>
+       <packageset dir="${tools.src}"/>
 	
         <link href="${javadoc.link.java}"/>
 
@@ -1027,6 +1030,7 @@
 
        <group title="Packages" packages="org.apache.*"/>
        <group title="Libraries" packages="org.apache.hadoop.mapred.lib*:org.apache.hadoop.mapreduce.lib*"/>
+       <group title="Tools" packages="org.apache.hadoop.tools"/>
        <group title="Examples" packages="org.apache.hadoop.examples*"/>
 
        <group title="contrib: DataJoin" packages="org.apache.hadoop.contrib.utils.join*"/>
@@ -1048,6 +1052,7 @@
        </doclet>
        <packageset dir="src/java"/>
        <packageset dir="src/tools"/>
+       <packageset dir="${tools.src}"/>
        <classpath >
          <path refid="classpath" />
          <path refid="jdiff-classpath" />
@@ -1081,6 +1086,7 @@
        </doclet>
        <packageset dir="src/java"/>
        <packageset dir="src/tools"/>
+       <packageset dir="${tools.src}"/>
        <classpath >
          <path refid="classpath" />
          <path refid="jdiff-classpath"/>

Added: hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/rumen.xml
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/rumen.xml?rev=996420&view=auto
==============================================================================
--- hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/rumen.xml (added)
+++ hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/rumen.xml Mon Sep 13 05:30:29 2010
@@ -0,0 +1,442 @@
+<?xml version="1.0"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+
+<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
+
+<document>
+
+<header>
+  <title>Rumen</title>
+</header>
+
+<body>
+  <!--
+    Overview [What is Rumen and why is it needed?]
+  -->
+  <section id="overview">
+    <title>Overview</title>
+    
+    <p><em>Rumen</em> is a data extraction and analysis tool built for
+       <em>Apache Hadoop</em>. <em>Rumen</em> mines <em>JobHistory</em> logs to 
+       extract meaningful data and stores it in an easily-parsed, condensed 
+       format or <em>digest</em>. The raw trace data from MapReduce logs are 
+       often insufficient for simulation, emulation, and benchmarking, as these 
+       tools often attempt to measure conditions that did not occur in the 
+       source data. For example, if a task ran locally in the raw trace data 
+       but a simulation of the scheduler elects to run that task on a remote 
+       rack, the simulator requires a runtime its input cannot provide. 
+       To fill in these gaps, Rumen performs a statistical analysis of the 
+       digest to estimate the variables the trace doesn't supply. Rumen traces 
+       drive both Gridmix (a benchmark of Hadoop MapReduce clusters) and Mumak 
+       (a simulator for the JobTracker).
+    </p>
+
+    <!--
+      Why is Rumen needed?
+    --> 
+    <section>     
+      <title>Motivation</title>
+      
+      <ul>
+        <li>Extracting meaningful data from <em>JobHistory</em> logs is a common
+            task for any tool built to work on <em>MapReduce</em>. Writing a
+            custom tool that is so tightly coupled with the <em>MapReduce</em>
+            framework is tedious; hence there is a need for a built-in tool
+            that performs the framework-level task of log parsing and
+            analysis. Such a tool also insulates external systems that depend
+            on job history from changes to the job history format.
+        </li>
+        <li>Performing statistical analysis of various attributes of a 
+            <em>MapReduce Job</em>, such as <em>task runtimes</em> and 
+            <em>task failures</em>, is another common need of benchmarking 
+            and simulation tools. <em>Rumen</em> generates 
+            <a href="http://en.wikipedia.org/wiki/Cumulative_distribution_function">
+              <em>Cumulative Distribution Functions (CDF)</em>
+            </a> for the Map/Reduce task runtimes. 
+            The runtime CDF can be used to extrapolate the runtime of 
+            incomplete, missing, and synthetic tasks; a minimal sketch of 
+            this idea follows this list. Similarly, a CDF is also computed 
+            for the total number of successful tasks for every attempt.
+        </li>
+      </ul>
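+      <p>A minimal sketch of how such a runtime CDF can be used (plain Java, 
+         illustrative only; this is not the <em>Rumen</em> API): given the 
+         sorted runtimes of completed tasks, the empirical CDF can be 
+         inverted at a random quantile to synthesize a runtime for a missing 
+         or synthetic task.</p>
+      <source>
+// Illustrative sketch only -- not the Rumen API.
+long[] runtimes = {12000L, 15000L, 15500L, 21000L, 60000L}; // sorted, in ms
+java.util.Random rng = new java.util.Random();
+double quantile = rng.nextDouble();               // uniform in [0, 1)
+int index = (int) (quantile * runtimes.length);   // invert the empirical CDF
+long synthesizedRuntime = runtimes[index];
+      </source>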
+    </section>
+
+    <!--
+      Basic high level view of components
+    -->
+    <section>  
+      <title>Components</title>
+      
+      <p><em>Rumen</em> consists of two components:</p>
+      
+      <ul>
+        <li><em>Trace Builder</em> : 
+            Converts <em>JobHistory</em> logs into an easily-parsed format.
+            Currently <code>TraceBuilder</code> outputs the trace in 
+            <a href="http://www.json.org/"><em>JSON</em></a> 
+            format.   
+        </li>
+        <li><em>Folder </em>: 
+            A utility to scale the input trace. A trace obtained from
+            <em>TraceBuilder</em> simply summarizes the jobs in the 
+            input folders and files. The time-span within which all the jobs in 
+            a given trace finish can be considered as the trace runtime. 
+            <em>Folder</em> can be used to scale the runtime of a trace.
+            Decreasing the trace runtime might involve dropping some jobs from 
+            the input trace and scaling down the runtime of remaining jobs. 
+            Increasing the trace runtime might involve adding some dummy jobs to
+            the resulting trace and scaling up the runtime of individual jobs.
+       </li>
+                 
+      </ul>
+    </section>
+  </section>    
+
+  <!--
+    Usage [How to run Rumen? What are the various configuration parameters?]
+  -->
+  <section id="usage">
+    <title>How to use <em>Rumen</em>?</title>
+    
+    <p>Converting <em>JobHistory</em> logs into a desired job-trace consists of 
+       two steps:</p>
+    <ol>
+      <li>Extracting information into an intermediate format</li>
+      <li>Adjusting the job-trace obtained from the intermediate trace to 
+          have the desired properties.</li>
+    </ol>
+       
+    <note>Extracting information from <em>JobHistory</em> logs is a one-time
+          operation. The resulting so-called <em>Gold Trace</em> can be reused
+          to generate traces with desired values of properties such as 
+          <code>output-duration</code>, <code>concentration</code> etc.
+    </note>
+       
+    <p><em>Rumen</em> provides two basic commands:</p>
+     <ul>
+       <li><code>TraceBuilder</code></li>
+       <li><code>Folder</code></li>
+     </ul>
+       
+    <p>The first step is to generate the <em>Gold Trace</em> by running 
+       <code>TraceBuilder</code> on a job-history folder. 
+       The output of <code>TraceBuilder</code> is a job-trace file (and an 
+       optional cluster-topology file). If the output needs to be scaled, the 
+       <code>Folder</code> utility can then fold the trace to the desired 
+       length. The remainder of this section explains these utilities in 
+       detail.
+    </p>
+    
+    <note>Examples in this section assume that certain libraries are present 
+          in the java CLASSPATH. See the <em>Dependencies</em> section in the 
+          Appendix for more details.
+    </note>
+    <!--
+     TraceBuilder command
+    -->
+    <section>
+      <title>Trace Builder</title>
+      
+      <p><code>Command:</code></p>
+      <source>java org.apache.hadoop.tools.rumen.TraceBuilder [options] [jobtrace-output] [topology-output] [input]</source>
+      
+      <p>This command invokes the <code>TraceBuilder</code> utility of 
+         <em>Rumen</em>. It converts the JobHistory files into a series of JSON 
+         objects and writes them to the <code>[jobtrace-output]</code> file. 
+         It also extracts the cluster layout (topology) and writes it to the
+         <code>[topology-output]</code> file. 
+         <code>[input]</code> represents a space-separated list of JobHistory 
+         files and folders.
+      </p>
+         
+         <note>1) Input and output to <code>TraceBuilder</code> are expected to
+               be fully qualified FileSystem paths. Use '<em>file://</em>' 
+               to specify files on the <code>local</code> FileSystem and 
+               '<em>hdfs://</em>' to specify files on HDFS. Since input files 
+               and folders are FileSystem paths, they can be globbed, which is 
+               useful for specifying multiple file paths with a single 
+               pattern; see the example after these notes.
+         </note>
+         <note>
+               2) TraceBuilder does not recursively scan the input folder for
+               job history files. Only the files that are directly placed under 
+               the input folder will be considered for generating the trace.
+         </note>
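+      <p>For instance, since inputs are FileSystem paths, a glob pattern 
+         (the paths below are hypothetical) can select a subset of the 
+         history files in a single run:</p>
+      <source>java org.apache.hadoop.tools.rumen.TraceBuilder file:///home/user/job-trace.json file:///home/user/topology.output hdfs://namenode/mapred/history/done/job_201009*</source>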
+      
+      <p>Cluster topology is used as follows:</p>
+      <ul>
+        <li>To reconstruct the splits and make sure that the 
+            distances/latencies seen in the actual run are modeled correctly.
+        </li>
+        <li>To extrapolate splits information for tasks with missing splits
+            details or synthetically generated tasks.
+        </li>
+      </ul>
+      
+      <p><code>Options:</code></p>
+      <table>
+        <tr>
+          <th> Parameter</th>
+          <th> Description</th>
+          <th> Notes </th>
+        </tr>
+        <tr>
+          <td><code>-demuxer</code></td>
+          <td>Used to read the jobhistory files. The default is 
+              <code>DefaultInputDemuxer</code>.</td>
+          <td>The demuxer decides how the input file maps to job history file(s). 
+              Job history logs and job configuration files are typically small 
+              files, and can be stored more effectively when embedded in a
+              container file format like SequenceFile or TFile. To support such 
+              use cases, one can specify a customized Demuxer class that can 
+              extract individual job history logs and job configuration files 
+              from the source files. An example invocation follows the table.
+          </td>
+        </tr>
+      </table>
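+      <p>For example, if the job history logs were packed into container 
+         files, a customized demuxer could be supplied on the command line 
+         (the demuxer class name below is hypothetical):</p>
+      <source>java org.apache.hadoop.tools.rumen.TraceBuilder -demuxer org.example.MyTFileDemuxer file:///home/user/job-trace.json file:///home/user/topology.output file:///home/user/packed-history</source>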
+      
+      <section>
+        <title>Example</title>
+        <source>java org.apache.hadoop.tools.rumen.TraceBuilder file:///home/user/job-trace.json file:///home/user/topology.output file:///home/user/logs/history/done</source>
+        <p></p>
+        <p>This will analyze all the jobs in 
+         <code>/home/user/logs/history/done</code> stored on the 
+         <code>local</code> FileSystem and output the jobtraces in 
+         <code>/home/user/job-trace.json</code> along with topology 
+         information in <code>/home/user/topology.output</code>.
+        </p>
+      </section>
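+      <p>For reference, an abridged sketch of a single job object in the 
+         JSON trace (field names inferred from the <code>LoggedJob</code> 
+         accessors; real traces carry many more fields):</p>
+      <source>
+{
+  "jobID" : "job_201009130530_0001",
+  "submitTime" : 1284355829000,
+  "finishTime" : 1284356429000,
+  "outcome" : "SUCCESS",
+  "mapTasks" : [ ... ],
+  "reduceTasks" : [ ... ],
+  "otherTasks" : [ ... ]
+}
+      </source>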
+    </section>
+
+  <!--
+   Folder command
+  -->
+  <section>
+      <title>Folder</title>
+      
+      <p><code>Command</code>:</p>
+      <source>java org.apache.hadoop.tools.rumen.Folder [options] [input] [output]</source>
+      
+      <note>Input and output to <code>Folder</code> are expected to be fully 
+            qualified FileSystem paths. Use '<em>file://</em>' to specify 
+            files on the <code>local</code> FileSystem and '<em>hdfs://</em>' to
+            specify files on HDFS.
+         </note>
+      
+      <p>This command invokes the <code>Folder</code> utility of 
+         <em>Rumen</em>. Folding essentially means that the output duration of 
+         the resulting trace is fixed and job timelines are adjusted 
+         to respect the final output duration. For example, folding a 10-hour 
+         trace into a 1-hour output duration overlays the jobs onto a 1-hour 
+         window, so at the default concentration only about one job in ten is 
+         retained.
+      </p>
+      
+      <p></p>
+      <p><code>Options:</code></p>
+      <table>
+        <tr>
+          <th> Parameter</th>
+          <th> Description</th>
+          <th> Notes </th>
+        </tr>
+        <tr>
+          <td><code>-input-cycle</code></td>
+          <td>Defines the basic unit of time for the folding operation. There is
+              no default value for <code>input-cycle</code>. 
+              <strong>Input cycle must be provided</strong>.
+          </td>
+          <td>'<code>-input-cycle 10m</code>' 
+              implies that the whole trace run will now be sliced at 10-minute 
+              intervals. Basic operations are done on these 10-minute chunks. 
+              Note that <em>Rumen</em> understands various time units such as 
+              <em>m (minutes), h (hours), and d (days)</em>.
+          </td>
+        </tr>
+        <tr>
+          <td><code>-output-duration</code></td>
+          <td>This parameter defines the final runtime of the trace. 
+              The default value is <strong>1 hour</strong>.
+          </td>
+          <td>'<code>-output-duration 30m</code>' 
+              implies that the resulting trace will have a maximum runtime of 
+              30 minutes. All the jobs in the input trace file will be folded 
+              and scaled to fit this window.
+          </td>
+        </tr>
+        <tr>
+          <td><code>-concentration</code></td>
+          <td>Set the concentration of the resulting trace. Default value is 
+              <strong>1</strong>.
+          </td>
+          <td>If the total runtime of the resulting trace is less than the total
+              runtime of the input trace, the resulting trace contains
+              fewer jobs than the input trace. This 
+              essentially means that the output is diluted. To increase the 
+              density of jobs, set the concentration to a higher value.</td>
+        </tr>
+        <tr>
+          <td><code>-debug</code></td>
+          <td>Run the Folder in debug mode. By default it is set to 
+              <strong>false</strong>.</td>
+          <td>In debug mode, the Folder will print additional statements for 
+              debugging. Also the intermediate files generated in the scratch 
+              directory will not be cleaned up.
+          </td>
+        </tr>
+        <tr>
+          <td><code>-seed</code></td>
+          <td>Initial seed for the random number generator. By default, a 
+              seed is generated and the seed value is reported back to the 
+              user for future use.
+          </td>
+          <td>If an initial seed is passed, the random number generator 
+              produces the same sequence of random numbers, i.e. the sequence 
+              remains the same across runs with the same seed, making runs 
+              reproducible. The Folder uses the random number generator to 
+              decide whether or not to emit a job. 
+          </td>
+        </tr>
+        <tr>
+          <td><code>-temp-directory</code></td>
+          <td>Temporary directory for the Folder. By default the <strong>output
+              folder's parent directory</strong> is used as the scratch space.
+          </td>
+          <td>This is the scratch space used by Folder.  All the 
+              temporary files are cleaned up in the end unless the Folder is run
+              in <code>debug</code> mode.</td>
+        </tr>
+        <tr>
+          <td><code>-skew-buffer-length</code></td>
+          <td>Enables <em>Folder</em> to tolerate skewed jobs.
+              The default buffer length is <strong>0</strong>.</td>
+          <td>'<code>-skew-buffer-length 100</code>' 
+              indicates that jobs appearing out of order within a window 
+              of size 100 will be emitted in order by the Folder. 
+              If a job appears out of order outside this window, the Folder
+              will bail out unless <code>-allow-missorting</code> is set.
+              <em>Folder</em> reports the maximum skew size seen in the 
+              input trace for future use.
+          </td>
+        </tr>
+        <tr>
+          <td><code>-allow-missorting</code></td>
+          <td>Enables <em>Folder</em> to tolerate out-of-order jobs. By default 
+              mis-sorting is not allowed.
+          </td>
+          <td>If mis-sorting is allowed, the <em>Folder</em> ignores 
+              out-of-order jobs that cannot be deskewed using a skew buffer of
+              the size specified via <code>-skew-buffer-length</code>. If 
+              mis-sorting is not allowed, the Folder bails out when the
+              skew buffer is incapable of tolerating the skew.
+          </td>
+        </tr>
+      </table>
+      
+      <section>
+      <title>Examples</title>
+      <section>
+        <title>Folding an input trace with 10 hours of total runtime to 
+               generate an output trace with 1 hour of total runtime</title>
+        <source>java org.apache.hadoop.tools.rumen.Folder -output-duration 1h  -input-cycle 20m  file:///home/user/job-trace.json file:///home/user/job-trace-1hr.json</source>
+        <p></p>
+        <p>If the folded jobs are out of order, then the command
+          will bail out. 
+        </p>
+      </section>
+      
+      <section>
+        <title>Folding an input trace with 10 hours of total runtime to 
+               generate an output trace with 1 hour of total runtime and 
+               tolerate some skewness
+        </title>
+        <source>java org.apache.hadoop.tools.rumen.Folder -output-duration 1h -input-cycle 20m  -allow-missorting -skew-buffer-length 100 file:///home/user/job-trace.json file:///home/user/job-trace-1hr.json</source>
+        <p></p>
+        <p>If the folded jobs are out of order, then at most
+          100 jobs will be de-skewed. If the 101<sup>st</sup> job is 
+          <em>out-of-order</em>, then the command will bail out.
+        </p>
+      </section>
+      <section>
+        <title>Folding an input trace with 10 hours of total runtime to 
+               generate an output trace with 1 hour of total runtime in debug 
+               mode
+        </title>
+        <source>java org.apache.hadoop.tools.rumen.Folder -output-duration 1h -input-cycle 20m  -debug -temp-directory file:///tmp/debug file:///home/user/job-trace.json file:///home/user/job-trace-1hr.json</source>
+        <p></p>
+        <p>This will fold the 10hr job-trace file 
+           <code>file:///home/user/job-trace.json</code> to finish within 1hr 
+           and use <code>file:///tmp/debug</code> as the temporary directory. 
+           The intermediate files in the temporary directory will not be cleaned
+           up.
+        </p>
+      </section>
+      
+      <section>
+        <title>Folding an input trace with 10 hours of total runtime to 
+               generate an output trace with 1 hour of total runtime with custom
+               concentration.
+        </title>
+        <source>java org.apache.hadoop.tools.rumen.Folder -output-duration 1h -input-cycle 20m  -concentration 2  file:///home/user/job-trace.json file:///home/user/job-trace-1hr.json</source>
+        <p></p>
+        <p>This will fold the 10hr job-trace file 
+           <code>file:///home/user/job-trace.json</code> to finish within 1hr 
+           with a concentration of 2. The first folding example above retains 
+           about 10% of the jobs; with a <em>concentration</em> of 2, about 
+           20% of the total input jobs will be retained.
+        </p>
+      </section>
+    </section>
+    </section>
+  </section>
+  
+  <!--
+    Appendix [Resources i.e ppts, jiras, definition etc]
+  -->
+  <section>
+    <title>Appendix</title>
+    
+    <section>
+      <title>Resources</title>
+      <p><a href="https://issues.apache.org/jira/browse/MAPREDUCE-751">MAPREDUCE-751</a> is the main JIRA that introduced <em>Rumen</em> to <em>MapReduce</em>. 
+         Look at the MapReduce <a href="https://issues.apache.org/jira/browse/MAPREDUCE/component/12313617">rumen-component</a> for further details.</p>
+    </section>
+    
+    <section>
+     <title>Dependencies</title>
+    <p><em>Rumen</em> expects certain library <em>JARs</em> to be present in 
+       the <em>CLASSPATH</em>. The required libraries are:</p>
+      <ul>
+        <li><code>Hadoop MapReduce Tools</code> (<code>hadoop-mapred-tools-{hadoop-version}.jar</code>)</li>
+        <li><code>Hadoop Common</code> (<code>hadoop-common-{hadoop-version}.jar</code>)</li>
+        <li><code>Apache Commons Logging</code> (<code>commons-logging-1.1.1.jar</code>)</li>
+        <li><code>Apache Commons CLI</code> (<code>commons-cli-1.2.jar</code>)</li>
+        <li><code>Jackson Mapper</code> (<code>jackson-mapper-asl-1.4.2.jar</code>)</li>
+        <li><code>Jackson Core</code> (<code>jackson-core-asl-1.4.2.jar</code>)</li>
+      </ul>
+      
+      <note>One simple way to run Rumen is via the 
+            '$HADOOP_HOME/bin/hadoop jar' command, which sets up the Hadoop 
+            CLASSPATH automatically, as in the example below.
+      </note>
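+      <p>For example (a sketch; the exact location of the tools JAR depends 
+         on the installation layout):</p>
+      <source>$HADOOP_HOME/bin/hadoop jar hadoop-mapred-tools-{hadoop-version}.jar org.apache.hadoop.tools.rumen.TraceBuilder file:///home/user/job-trace.json file:///home/user/topology.output file:///home/user/logs/history/done</source>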
+    </section>
+  </section>
+</body>
+</document>

Modified: hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/site.xml
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/site.xml?rev=996420&r1=996419&r2=996420&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/site.xml (original)
+++ hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/site.xml Mon Sep 13 05:30:29 2010
@@ -43,6 +43,7 @@ See http://forrest.apache.org/docs/linki
 		<vaidya    					label="Vaidya" 		href="vaidya.html"/>
 		<archives  				label="Hadoop Archives"     href="hadoop_archives.html"/>
 		<gridmix  				label="Gridmix"     href="gridmix.html"/>
+		<rumen  				label="Rumen"     href="rumen.html"/>
    </docs>
    
     <docs label="Schedulers">

Modified: hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/ClusterStory.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/ClusterStory.java?rev=996420&r1=996419&r2=996420&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/ClusterStory.java (original)
+++ hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/ClusterStory.java Mon Sep 13 05:30:29 2010
@@ -54,13 +54,13 @@ public interface ClusterStory {
   /**
    * Get {@link MachineNode} by its host name.
    * 
-   * @return The {@line MachineNode} with the same name. Or null if not found.
+   * @return The {@link MachineNode} with the same name. Or null if not found.
    */
   public MachineNode getMachineByName(String name);
   
   /**
    * Get {@link RackNode} by its name.
-   * @return The {@line RackNode} with the same name. Or null if not found.
+   * @return The {@link RackNode} with the same name. Or null if not found.
    */
   public RackNode getRackByName(String name);
 

Modified: hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/DeskewedJobTraceReader.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/DeskewedJobTraceReader.java?rev=996420&r1=996419&r2=996420&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/DeskewedJobTraceReader.java (original)
+++ hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/DeskewedJobTraceReader.java Mon Sep 13 05:30:29 2010
@@ -72,7 +72,7 @@ public class DeskewedJobTraceReader impl
    * 
    * @param reader
    *          the {@link JobTraceReader} that's being protected
-   * @param skewBufferSize
+   * @param skewBufferLength
    *          the number of late jobs that can precede a later out-of-order
    *          earlier job
    * @throws IOException

Modified: hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/JobHistoryParserFactory.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/JobHistoryParserFactory.java?rev=996420&r1=996419&r2=996420&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/JobHistoryParserFactory.java (original)
+++ hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/JobHistoryParserFactory.java Mon Sep 13 05:30:29 2010
@@ -38,7 +38,7 @@ public class JobHistoryParserFactory {
     throw new IOException("No suitable parser.");
   }
 
-  enum VersionDetector {
+  public enum VersionDetector {
     Hadoop20() {
 
       @Override

Modified: hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/Node.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/Node.java?rev=996420&r1=996419&r2=996420&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/Node.java (original)
+++ hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/Node.java Mon Sep 13 05:30:29 2010
@@ -24,7 +24,7 @@ import java.util.TreeSet;
 
 /**
  * {@link Node} represents a node in the cluster topology. A node can be a
- * {@MachineNode}, or a {@link RackNode}, etc.
+ * {@link MachineNode}, or a {@link RackNode}, etc.
  */
 public class Node implements Comparable<Node> {
   private static final SortedSet<Node> EMPTY_SET = 

Modified: hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/TaskAttemptInfo.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/TaskAttemptInfo.java?rev=996420&r1=996419&r2=996420&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/TaskAttemptInfo.java (original)
+++ hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/TaskAttemptInfo.java Mon Sep 13 05:30:29 2010
@@ -17,7 +17,6 @@
  */
 package org.apache.hadoop.tools.rumen;
 
-import org.apache.hadoop.mapred.TaskStatus;
 import org.apache.hadoop.mapred.TaskStatus.State;
 
 /**
@@ -38,7 +37,7 @@ public abstract class TaskAttemptInfo {
   }
 
   /**
-   * Get the final {@link TaskStatus.State} of the task-attempt.
+   * Get the final {@link State} of the task-attempt.
    * 
    * @return the final <code>State</code> of the task-attempt
    */

Added: hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/package-info.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/package-info.java?rev=996420&view=auto
==============================================================================
--- hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/package-info.java (added)
+++ hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/package-info.java Mon Sep 13 05:30:29 2010
@@ -0,0 +1,377 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** Rumen is a data extraction and analysis tool built for 
+ * <a href="http://hadoop.apache.org/">Apache Hadoop</a>. Rumen mines job history
+ * logs to extract meaningful data and stores it in an easily-parsed format.
+ * 
+ * The default output format of Rumen is <a href="http://www.json.org">JSON</a>.
+ * Rumen uses the <a href="http://jackson.codehaus.org/">Jackson</a> library to 
+ * create JSON objects.
+ * <br><br>
+ * 
+ * The following classes can be used to programmatically invoke Rumen:
+ * <ol>
+ *   <li>
+ *    {@link org.apache.hadoop.tools.rumen.JobConfigurationParser}<br>
+ *      A parser to parse and filter out interesting properties from job 
+ *      configuration.
+ *      
+ *      <br><br>
+ *      <i>Sample code</i>:
+ *      <pre>
+ *      <code>
+ *        // An example to parse and filter out job name
+ *        
+ *        String conf_filename = .. // assume the job configuration filename here
+ *        
+ *        // construct a list of interesting properties
+ *        List<String> interestedProperties = new ArrayList<String>();
+ *        interestedProperties.add("mapreduce.job.name");
+ *        
+ *        JobConfigurationParser jcp = 
+ *          new JobConfigurationParser(interestedProperties);
+ *
+ *        InputStream in = new FileInputStream(conf_filename);
+ *        Properties parsedProperties = jcp.parse(in);
+ *     </code>
+ *     </pre>
+ *     Some of the commonly used interesting properties are enumerated in 
+ *     {@link org.apache.hadoop.tools.rumen.JobConfPropertyNames}. <br><br>
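+ *     For instance (a sketch, assuming the enum exposes its candidate
+ *     property keys via a <code>getCandidates()</code> accessor):
+ *     <pre>
+ *     <code>
+ *       // requires java.util.Arrays
+ *       interestedProperties.addAll(
+ *           Arrays.asList(JobConfPropertyNames.QUEUE_NAMES.getCandidates()));
+ *     </code>
+ *     </pre>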
+ *     
+ *     <b>Note:</b>
+ *        A single instance of {@link org.apache.hadoop.tools.rumen.JobConfigurationParser} 
+ *        can be used to parse multiple job configuration files. 
+ *     
+ *   </li>
+ *   <li>
+ *    {@link org.apache.hadoop.tools.rumen.JobHistoryParser} <br>
+ *      A parser that parses job history files. It is an interface and actual 
+ *      implementations are defined as Enum in 
+ *      {@link org.apache.hadoop.tools.rumen.JobHistoryParserFactory}. Note that
+ *      {@link org.apache.hadoop.tools.rumen.RewindableInputStream}<br>
+ *      is a wrapper class around {@link java.io.InputStream} to make the input 
+ *      stream rewindable.
+ *      
+ *      <br>
+ *      <i>Sample code</i>:
+ *      <pre>
+ *      <code>
+ *        // An example to parse a current job history file, i.e. a job history
+ *        // file for which the version is known
+ *        
+ *        String filename = .. // assume the job history filename here
+ *        
+ *        InputStream in = new FileInputStream(filename);
+ *        
+ *        HistoryEvent event = null;
+ *        
+ *        JobHistoryParser parser = new CurrentJHParser(in);
+ *        
+ *        event = parser.nextEvent();
+ *        // process all the events
+ *        while (event != null) {
+ *          // ... process all event
+ *          event = parser.nextEvent();
+ *        }
+ *        
+ *        // close the parser and the underlying stream
+ *        parser.close();
+ *      </code>
+ *      </pre>
+ *      
+ *      {@link org.apache.hadoop.tools.rumen.JobHistoryParserFactory} provides a 
+ *      {@link org.apache.hadoop.tools.rumen.JobHistoryParserFactory#getParser(org.apache.hadoop.tools.rumen.RewindableInputStream)}
+ *      API to get a parser for parsing the job history file. Note that this
+ *      API can be used if the job history version is unknown.<br><br>
+ *      <i>Sample code</i>:
+ *      <pre>
+ *      <code>
+ *        // An example to parse a job history for which the version is not 
+ *        // known i.e using JobHistoryParserFactory.getParser()
+ *        
+ *        String filename = .. // assume the job history filename here
+ *        
+ *        InputStream in = new FileInputStream(filename);
+ *        RewindableInputStream ris = new RewindableInputStream(in);
+ *        
+ *        // JobHistoryParserFactory will check and return a parser that can
+ *        // parse the file
+ *        JobHistoryParser parser = JobHistoryParserFactory.getParser(ris);
+ *        
+ *        // now use the parser to parse the events
+ *        HistoryEvent event = parser.nextEvent();
+ *        while (event != null) {
+ *          // ... process the event
+ *          event = parser.nextEvent();
+ *        }
+ *        
+ *        parser.close();
+ *      </code>
+ *      </pre>
+ *      <b>Note:</b>
+ *        Create one instance to parse a job history log and close it after use.
+ *  </li>
+ *  <li>
+ *    {@link org.apache.hadoop.tools.rumen.TopologyBuilder}<br>
+ *      Builds the cluster topology based on the job history events. Every 
+ *      job history file consists of events. Each event can be represented using
+ *      {@link org.apache.hadoop.mapreduce.jobhistory.HistoryEvent}. 
+ *      These events can be passed to {@link org.apache.hadoop.tools.rumen.TopologyBuilder} using 
+ *      {@link org.apache.hadoop.tools.rumen.TopologyBuilder#process(org.apache.hadoop.mapreduce.jobhistory.HistoryEvent)}.
+ *      A cluster topology can be represented using {@link org.apache.hadoop.tools.rumen.LoggedNetworkTopology}.
+ *      Once all the job history events are processed, the cluster 
+ *      topology can be obtained using {@link org.apache.hadoop.tools.rumen.TopologyBuilder#build()}.
+ *      
+ *      <br><br>
+ *      <i>Sample code</i>:
+ *      <pre>
+ *      <code>
+ *        // Building topology for a job history file represented using 
+ *        // 'filename' and the corresponding configuration file represented 
+ *        // using 'conf_filename'
+ *        String filename = .. // assume the job history filename here
+ *        String conf_filename = .. // assume the job configuration filename here
+ *        
+ *        // open each stream from its matching file
+ *        InputStream jobConfInputStream = new FileInputStream(conf_filename);
+ *        InputStream jobHistoryInputStream = new FileInputStream(filename);
+ *        
+ *        TopologyBuilder tb = new TopologyBuilder();
+ *        
+ *        // construct a list of interesting properties
+ *        List<String> interestingProperties = new ArrayList<String>();
+ *        // add the interesting properties here
+ *        interestingProperties.add("mapreduce.job.name");
+ *        
+ *        JobConfigurationParser jcp = 
+ *          new JobConfigurationParser(interestingProperties);
+ *        
+ *        // parse the configuration file
+ *        tb.process(jcp.parse(jobConfInputStream));
+ *        
+ *        // read the job history file and pass it to the 
+ *        // TopologyBuilder.
+ *        JobHistoryParser parser = new CurrentJHParser(jobHistoryInputStream);
+ *        HistoryEvent e;
+ *        
+ *        // read and process all the job history events
+ *        while ((e = parser.nextEvent()) != null) {
+ *          tb.process(e);
+ *        }
+ *        
+ *        LoggedNetworkTopology topology = tb.build();
+ *      </code>
+ *      </pre>
+ *  </li>
+ *  <li>
+ *    {@link org.apache.hadoop.tools.rumen.JobBuilder}<br>
+ *      Summarizes a job history file.
+ *      {@link org.apache.hadoop.tools.rumen.TraceBuilder} provides  
+ *      {@link org.apache.hadoop.tools.rumen.TraceBuilder#extractJobID(String)} 
+ *      API for extracting job id from job history or job configuration files
+ *      which can be used for instantiating {@link org.apache.hadoop.tools.rumen.JobBuilder}. 
+ *      {@link org.apache.hadoop.tools.rumen.JobBuilder} generates a 
+ *      {@link org.apache.hadoop.tools.rumen.LoggedJob} object via 
+ *      {@link org.apache.hadoop.tools.rumen.JobBuilder#build()}. 
+ *      See {@link org.apache.hadoop.tools.rumen.LoggedJob} for more details.
+ *      
+ *      <br><br>
+ *      <i>Sample code</i>:
+ *      <pre>
+ *      <code>
+ *        // An example to summarize a current job history file 'filename'
+ *        // and the corresponding configuration file 'conf_filename'
+ *        
+ *        String filename = .. // assume the job history filename here
+ *        String conf_filename = .. // assume the job configuration filename here
+ *        
+ *        // open each stream from its matching file
+ *        InputStream jobConfInputStream = new FileInputStream(conf_filename);
+ *        InputStream jobHistoryInputStream = new FileInputStream(filename);
+ *        
+ *        String jobID = TraceBuilder.extractJobID(filename);
+ *        JobBuilder jb = new JobBuilder(jobID);
+ *        
+ *        // construct a list of interesting properties
+ *        List<String> interestingProperties = new ArrayList<String>();
+ *        // add the interesting properties here
+ *        interestingProperties.add("mapreduce.job.name");
+ *        
+ *        JobConfigurationParser jcp = 
+ *          new JobConfigurationParser(interestingProperties);
+ *        
+ *        // parse the configuration file
+ *        jb.process(jcp.parse(jobConfInputStream));
+ *        
+ *        // parse the job history file
+ *        JobHistoryParser parser = new CurrentJHParser(jobHistoryInputStream);
+ *        try {
+ *          HistoryEvent e;
+ *          // read and process all the job history events
+ *          while ((e = parser.nextEvent()) != null) {
+ *            jb.process(e);
+ *          }
+ *        } finally {
+ *          parser.close();
+ *        }
+ *        
+ *        LoggedJob job = jb.build();
+ *      </code>
+ *      </pre>
+ *     <b>Note:</b>
+ *       The order of parsing the job configuration file or job history file is 
+ *       not important. Create one instance to parse the history file and job 
+ *       configuration.
+ *   </li>
+ *   <li>
+ *    {@link org.apache.hadoop.tools.rumen.DefaultOutputter}<br>
+ *      Implements {@link org.apache.hadoop.tools.rumen.Outputter} and writes 
+ *      JSON object in text format to the output file. 
+ *      {@link org.apache.hadoop.tools.rumen.DefaultOutputter} can be 
+ *      initialized with the output filename.
+ *      
+ *      <br><br>
+ *      <i>Sample code</i>:  
+ *      <pre>
+ *      <code>
+ *        // An example to summarize a current job history file represented by
+ *        // 'filename' and the configuration filename represented using 
+ *        // 'conf_filename'. Also output the job summary to 'out.json' along 
+ *        // with the cluster topology to 'topology.json'.
+ *        
+ *        String filename = .. // assume the job history filename here
+ *        String conf_filename = .. // assume the job configuration filename here
+ *        
+ *        Configuration conf = new Configuration();
+ *        // 'do' is a reserved word in Java, so use a different variable name
+ *        DefaultOutputter outputter = new DefaultOutputter();
+ *        outputter.init("out.json", conf);
+ *        
+ *        InputStream jobConfInputStream = new FileInputStream(conf_filename);
+ *        InputStream jobHistoryInputStream = new FileInputStream(filename);
+ *        
+ *        // extract the job-id from the filename
+ *        String jobID = TraceBuilder.extractJobID(filename);
+ *        JobBuilder jb = new JobBuilder(jobID);
+ *        TopologyBuilder tb = new TopologyBuilder();
+ *        
+ *        // construct a list of interesting properties
+ *        List<String> interestingProperties = new ArrayList<String>();
+ *        // add the interesting properties here
+ *        interestingProperties.add("mapreduce.job.name");
+ *        
+ *        JobConfigurationParser jcp =
+ *          new JobConfigurationParser(interestingProperties);
+ *          
+ *        // parse the configuration file
+ *        tb.process(jcp.parse(jobConfInputStream));
+ *        
+ *        // read the job history file and pass it to the
+ *        // TopologyBuilder.
+ *        JobHistoryParser parser = new CurrentJHParser(jobHistoryInputStream);
+ *        HistoryEvent e;
+ *        while ((e = parser.nextEvent()) != null) {
+ *          jb.process(e);
+ *          tb.process(e);
+ *        }
+ *        
+ *        LoggedJob j = jb.build();
+ *        
+ *        // serialize the job summary in json (text) format
+ *        outputter.output(j);
+ *        
+ *        // close
+ *        outputter.close();
+ *        
+ *        outputter.init("topology.json", conf);
+ *        
+ *        // get the cluster topology using TopologyBuilder
+ *        LoggedNetworkTopology topology = tb.build();
+ *        
+ *        // serialize the cluster topology in json (text) format
+ *        outputter.output(topology);
+ *        
+ *        // close
+ *        outputter.close();
+ *      </code>
+ *      </pre>
+ *   </li>
+ *   <li>
+ *    {@link org.apache.hadoop.tools.rumen.JobTraceReader}<br>
+ *      A reader for reading {@link org.apache.hadoop.tools.rumen.LoggedJob} serialized using 
+ *      {@link org.apache.hadoop.tools.rumen.DefaultOutputter}. {@link org.apache.hadoop.tools.rumen.LoggedJob} 
+ *      provides various APIs for extracting job details. The following are 
+ *      the most commonly used ones:
+ *        <ul>
+ *          <li>{@link org.apache.hadoop.tools.rumen.LoggedJob#getMapTasks()} : Get the map tasks</li>
+ *          <li>{@link org.apache.hadoop.tools.rumen.LoggedJob#getReduceTasks()} : Get the reduce tasks</li>
+ *          <li>{@link org.apache.hadoop.tools.rumen.LoggedJob#getOtherTasks()} : Get the setup/cleanup tasks</li>
+ *          <li>{@link org.apache.hadoop.tools.rumen.LoggedJob#getOutcome()} : Get the job's outcome</li>
+ *          <li>{@link org.apache.hadoop.tools.rumen.LoggedJob#getSubmitTime()} : Get the job's submit time</li>
+ *          <li>{@link org.apache.hadoop.tools.rumen.LoggedJob#getFinishTime()} : Get the job's finish time</li>
+ *        </ul>
+ *        
+ *      <br><br>
+ *      <i>Sample code</i>:
+ *      <pre>
+ *      <code>
+ *        // An example to read job summary from a trace file 'out.json'.
+ *        JobTraceReader reader = new JobTraceReader("out.json");
+ *        LoggedJob job = reader.getNext();
+ *        while (job != null) {
+ *          // .... process job level information
+ *          for (LoggedTask task : job.getMapTasks()) {
+ *            // process all the map tasks in the job
+ *            for (LoggedTaskAttempt attempt : task.getAttempts()) {
+ *              // process all the map task attempts in the job
+ *            }
+ *          }
+ *          
+ *          // get the next job
+ *          job = reader.getNext();
+ *        }
+ *        reader.close();
+ *      </code>
+ *      </pre>         
+ *   </li>
+ *   <li>
+ *    {@link org.apache.hadoop.tools.rumen.ClusterTopologyReader}<br>
+ *      A reader to read {@link org.apache.hadoop.tools.rumen.LoggedNetworkTopology} serialized using 
+ *      {@link org.apache.hadoop.tools.rumen.DefaultOutputter}. {@link org.apache.hadoop.tools.rumen.ClusterTopologyReader} can be 
+ *      initialized using the serialized topology filename. 
+ *      {@link org.apache.hadoop.tools.rumen.ClusterTopologyReader#get()} can
+ *      be used to get the 
+ *      {@link org.apache.hadoop.tools.rumen.LoggedNetworkTopology}. 
+ *      
+ *      <br><br>
+ *      <i>Sample code</i>:
+ *      <pre>
+ *      <code>
+ *        // An example to read the cluster topology from a topology output file
+ *        // 'topology.json'
+ *        ClusterTopologyReader reader = new ClusterTopologyReader("topology.json");
+ *        LoggedNetworkTopology topology  = reader.get();
+ *        for (LoggedNetworkTopology t : topology.getChildren()) {
+ *          // process the cluster topology
+ *        }
+ *        reader.close();
+ *      </code>
+ *      </pre>
+ *   </li>
+ * </ol>     
+ */
+
+package org.apache.hadoop.tools.rumen;
\ No newline at end of file