You are viewing a plain text version of this content. The canonical link for it is here.
Posted to mapreduce-commits@hadoop.apache.org by am...@apache.org on 2010/09/13 07:30:29 UTC
svn commit: r996420 - in /hadoop/mapreduce/trunk: ./
src/docs/src/documentation/content/xdocs/
src/tools/org/apache/hadoop/tools/rumen/
Author: amareshwari
Date: Mon Sep 13 05:30:29 2010
New Revision: 996420
URL: http://svn.apache.org/viewvc?rev=996420&view=rev
Log:
MAPREDUCE-1918. Adds documentation to Rumen. Contributed by Amar Kamat
Added:
hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/rumen.xml
hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/package-info.java
Modified:
hadoop/mapreduce/trunk/CHANGES.txt
hadoop/mapreduce/trunk/build.xml
hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/site.xml
hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/ClusterStory.java
hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/DeskewedJobTraceReader.java
hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/JobHistoryParserFactory.java
hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/Node.java
hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/TaskAttemptInfo.java
Modified: hadoop/mapreduce/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/CHANGES.txt?rev=996420&r1=996419&r2=996420&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/CHANGES.txt (original)
+++ hadoop/mapreduce/trunk/CHANGES.txt Mon Sep 13 05:30:29 2010
@@ -293,6 +293,8 @@ Trunk (unreleased changes)
when gridmix.output.directory is not defined. (Ravi Gummadi via
amareshwari)
+ MAPREDUCE-1918. Adds documentation to Rumen. (Amar Kamat via amareshwari)
+
Release 0.21.0 - Unreleased
INCOMPATIBLE CHANGES
Modified: hadoop/mapreduce/trunk/build.xml
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/build.xml?rev=996420&r1=996419&r2=996420&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/build.xml (original)
+++ hadoop/mapreduce/trunk/build.xml Mon Sep 13 05:30:29 2010
@@ -945,6 +945,7 @@
>
<packageset dir="${mapred.src.dir}"/>
<packageset dir="${examples.dir}"/>
+ <packageset dir="${tools.src}"/>
<packageset dir="src/contrib/data_join/src/java"/>
<packageset dir="src/contrib/gridmix/src/java"/>
@@ -967,6 +968,7 @@
<group title="Packages" packages="org.apache.*"/>
<group title="Libraries" packages="org.apache.hadoop.mapred.lib*:org.apache.hadoop.mapreduce.lib*"/>
+ <group title="Tools" packages="org.apache.hadoop.tools"/>
<group title="Examples" packages="org.apache.hadoop.examples*"/>
<group title="contrib: DataJoin" packages="org.apache.hadoop.contrib.utils.join*"/>
@@ -1011,6 +1013,7 @@
<packageset dir="src/contrib/gridmix/src/java"/>
<packageset dir="src/contrib/index/src/java"/>
<packageset dir="src/contrib/streaming/src/java"/>
+ <packageset dir="${tools.src}"/>
<link href="${javadoc.link.java}"/>
@@ -1027,6 +1030,7 @@
<group title="Packages" packages="org.apache.*"/>
<group title="Libraries" packages="org.apache.hadoop.mapred.lib*:org.apache.hadoop.mapreduce.lib*"/>
+ <group title="Tools" packages="org.apache.hadoop.tools"/>
<group title="Examples" packages="org.apache.hadoop.examples*"/>
<group title="contrib: DataJoin" packages="org.apache.hadoop.contrib.utils.join*"/>
@@ -1048,6 +1052,7 @@
</doclet>
<packageset dir="src/java"/>
<packageset dir="src/tools"/>
+ <packageset dir="${tools.src}"/>
<classpath >
<path refid="classpath" />
<path refid="jdiff-classpath" />
@@ -1081,6 +1086,7 @@
</doclet>
<packageset dir="src/java"/>
<packageset dir="src/tools"/>
+ <packageset dir="${tools.src}"/>
<classpath >
<path refid="classpath" />
<path refid="jdiff-classpath"/>
Added: hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/rumen.xml
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/rumen.xml?rev=996420&view=auto
==============================================================================
--- hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/rumen.xml (added)
+++ hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/rumen.xml Mon Sep 13 05:30:29 2010
@@ -0,0 +1,442 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd">
+
+<document>
+
+<header>
+ <title>Rumen</title>
+</header>
+
+<body>
+ <!--
+ Overview [What is Rumen and why is it needed?]
+ -->
+ <section id="overview">
+ <title>Overview</title>
+
+ <p><em>Rumen</em> is a data extraction and analysis tool built for
+ <em>Apache Hadoop</em>. <em>Rumen</em> mines <em>JobHistory</em> logs to
+ extract meaningful data and stores it in an easily-parsed, condensed
+ format or <em>digest</em>. The raw trace data from MapReduce logs are
+ often insufficient for simulation, emulation, and benchmarking, as these
+ tools often attempt to measure conditions that did not occur in the
+ source data. For example, if a task ran locally in the raw trace data
+ but a simulation of the scheduler elects to run that task on a remote
+ rack, the simulator requires a runtime its input cannot provide.
+ To fill in these gaps, Rumen performs a statistical analysis of the
+ digest to estimate the variables the trace doesn't supply. Rumen traces
+ drive both Gridmix (a benchmark of Hadoop MapReduce clusters) and Mumak
+ (a simulator for the JobTracker).
+ </p>
+
+ <!--
+ Why is Rumen needed?
+ -->
+ <section>
+ <title>Motivation</title>
+
+ <ul>
+ <li>Extracting meaningful data from <em>JobHistory</em> logs is a common
+ task for any tool built to work on <em>MapReduce</em>. It
+ is tedious to write a custom tool which is so tightly coupled with
+ the <em>MapReduce</em> framework. Hence there is a need for a
+ built-in tool for performing framework level task of log parsing and
+ analysis. Such a tool would insulate external systems depending on
+ job history against the changes made to the job history format.
+ </li>
+ <li>Performing statistical analysis of various attributes of a
+ <em>MapReduce Job</em> such as <em>task runtimes, task failures
+ etc</em> is another common task that the benchmarking
+ and simulation tools might need. <em>Rumen</em> generates
+ <a href="http://en.wikipedia.org/wiki/Cumulative_distribution_function">
+ <em>Cumulative Distribution Functions (CDF)</em>
+ </a> for the Map/Reduce task runtimes.
+ Runtime CDF can be used for extrapolating the task runtime of
+ incomplete, missing and synthetic tasks. Similarly CDF is also
+ computed for the total number of successful tasks for every attempt.
+
+ </li>
+ </ul>
+ </section>
+
+ <!--
+ Basic high level view of components
+ -->
+ <section>
+ <title>Components</title>
+
+ <p><em>Rumen</em> consists of 2 components</p>
+
+ <ul>
+ <li><em>Trace Builder</em> :
+ Converts <em>JobHistory</em> logs into an easily-parsed format.
+ Currently <code>TraceBuilder</code> outputs the trace in
+ <a href="http://www.json.org/"><em>JSON</em></a>
+ format.
+ </li>
+ <li><em>Folder </em>:
+ A utility to scale the input trace. A trace obtained from
+ <em>TraceBuilder</em> simply summarizes the jobs in the
+ input folders and files. The time-span within which all the jobs in
+ a given trace finish can be considered as the trace runtime.
+ <em>Folder</em> can be used to scale the runtime of a trace.
+ Decreasing the trace runtime might involve dropping some jobs from
+ the input trace and scaling down the runtime of remaining jobs.
+ Increasing the trace runtime might involve adding some dummy jobs to
+ the resulting trace and scaling up the runtime of individual jobs.
+ </li>
+
+ </ul>
+ <p></p><p></p><p></p>
+ </section>
+ </section>
+
+ <!--
+ Usage [How to run Rumen? What are the various configuration parameters?]
+ -->
+ <section id="usage">
+ <title>How to use <em>Rumen</em>?</title>
+
+ <p>Converting <em>JobHistory</em> logs into a desired job-trace consists of
+ 2 steps</p>
+ <ol>
+ <li>Extracting information into an intermediate format</li>
+ <li>Adjusting the job-trace obtained from the intermediate trace to
+ have the desired properties.</li>
+ </ol>
+
+ <note>Extracting information from <em>JobHistory</em> logs is a one time
+ operation. This so-called <em>Gold Trace</em> can be reused to
+ generate traces with desired values of properties such as
+ <code>output-duration</code>, <code>concentration</code> etc.
+ </note>
+
+ <p><em>Rumen</em> provides 2 basic commands</p>
+ <ul>
+ <li><code>TraceBuilder</code></li>
+ <li><code>Folder</code></li>
+ </ul>
+
+ <p>Firstly, we need to generate the <em>Gold Trace</em>. Hence the first
+ step is to run <code>TraceBuilder</code> on a job-history folder.
+ The output of the <code>TraceBuilder</code> is a job-trace file (and an
+ optional cluster-topology file). In case we want to scale the output, we
+ can use the <code>Folder</code> utility to fold the current trace to the
+ desired length. The remaining part of this section explains these
+ utilities in detail.
+ </p>
+
+ <note>Examples in this section assume that certain libraries are present
+ in the java CLASSPATH. See <em>Section-3.2</em> for more details.
+ </note>
+ <!--
+ TraceBuilder command
+ -->
+ <section>
+ <title>Trace Builder</title>
+
+ <p><code>Command:</code></p>
+ <source>java org.apache.hadoop.tools.rumen.TraceBuilder [options] [jobtrace-output] [topology-output] [input]</source>
+
+ <p>This command invokes the <code>TraceBuilder</code> utility of
+ <em>Rumen</em>. It converts the JobHistory files into a series of JSON
+ objects and outputs them in the <code>[jobtrace-output]</code> file.
+ It also extracts the cluster layout (topology) and outputs it in the
+ <code>[topology-output]</code> file.
+ <code>[input]</code> represents a space separated list of JobHistory
+ files and folders.
+ </p>
+
+ <note>1) Input and output to <code>TraceBuilder</code> is expected to
+ be a fully qualified FileSystem path. So use '<em>file://</em>'
+ to specify files on the <code>local</code> FileSystem and
+ '<em>hdfs://</em>' to specify files on HDFS. Since input files or
+ folders are FileSystem paths, they can be globbed.
+ This can be useful while specifying multiple file paths using
+ regular expressions.
+ </note>
+ <note>
+ 2) TraceBuilder does not recursively scan the input folder for
+ job history files. Only the files that are directly placed under
+ the input folder will be considered for generating the trace.
+ </note>
+
+ <p>Cluster topology is used as follows :</p>
+ <ul>
+ <li>To reconstruct the splits and make sure that the
+ distances/latencies seen in the actual run are modeled correctly.
+ </li>
+ <li>To extrapolate splits information for tasks with missing splits
+ details or synthetically generated tasks.
+ </li>
+ </ul>
+
+ <p><code>Options :</code></p>
+ <table>
+ <tr>
+ <th> Parameter</th>
+ <th> Description</th>
+ <th> Notes </th>
+ </tr>
+ <tr>
+ <td><code>-demuxer</code></td>
+ <td>Used to read the jobhistory files. The default is
+ <code>DefaultInputDemuxer</code>.</td>
+ <td>Demuxer decides how the input file maps to jobhistory file(s).
+ Job history logs and job configuration files are typically small
+ files, and can be more effectively stored when embedded in some
+ container file format like SequenceFile or TFile. To support such
+ usage cases, one can specify a customized Demuxer class that can
+ extract individual job history logs and job configuration files
+ from the source files.
+ </td>
+ </tr>
+ </table>
+
+ <section>
+ <title>Example</title>
+ <source>java org.apache.hadoop.tools.rumen.TraceBuilder file:///home/user/job-trace.json file:///home/user/topology.output file:///home/user/logs/history/done</source>
+ <p></p>
+ <p>This will analyze all the jobs in
+ <code>/home/user/logs/history/done</code> stored on the
+ <code>local</code> FileSystem and output the jobtraces in
+ <code>/home/user/job-trace.json</code> along with topology
+ information in <code>/home/user/topology.output</code>.
+ </p>
+ </section>
+ <p></p><p></p><p></p><p></p><p></p><p></p>
+ </section>
+
+ <!--
+ Folder command
+ -->
+ <section>
+ <title>Folder</title>
+
+ <p><code>Command</code>:</p>
+ <source>java org.apache.hadoop.tools.rumen.Folder [options] [input] [output]</source>
+
+ <note>Input and output to <code>Folder</code> is expected to be a fully
+ qualified FileSystem path. So use '<em>file://</em>' to specify
+ files on the <code>local</code> FileSystem and '<em>hdfs://</em>' to
+ specify files on HDFS.
+ </note>
+
+ <p>This command invokes the <code>Folder</code> utility of
+ <em>Rumen</em>. Folding essentially means that the output duration of
+ the resulting trace is fixed and job timelines are adjusted
+ to respect the final output duration.
+ </p>
+
+ <p></p>
+ <p><code>Options :</code></p>
+ <table>
+ <tr>
+ <th> Parameter</th>
+ <th> Description</th>
+ <th> Notes </th>
+ </tr>
+ <tr>
+ <td><code>-input-cycle</code></td>
+ <td>Defines the basic unit of time for the folding operation. There is
+ no default value for <code>input-cycle</code>.
+ <strong>Input cycle must be provided</strong>.
+ </td>
+ <td>'<code>-input-cycle 10m</code>'
+ implies that the whole trace run will now be sliced at a 10min
+ interval. Basic operations will be done on the 10m chunks. Note
+ that <em>Rumen</em> understands various time units like
+ <em>m(min), h(hour), d(days) etc</em>.
+ </td>
+ </tr>
+ <tr>
+ <td><code>-output-duration</code></td>
+ <td>This parameter defines the final runtime of the trace.
+ Default value is <strong>1 hour</strong>.
+ </td>
+ <td>'<code>-output-duration 30m</code>'
+ implies that the resulting trace will have a max runtime of
+ 30mins. All the jobs in the input trace file will be folded and
+ scaled to fit this window.
+ </td>
+ </tr>
+ <tr>
+ <td><code>-concentration</code></td>
+ <td>Set the concentration of the resulting trace. Default value is
+ <strong>1</strong>.
+ </td>
+ <td>If the total runtime of the resulting trace is less than the total
+ runtime of the input trace, then the resulting trace would contain
+ a smaller number of jobs compared to the input trace. This
+ essentially means that the output is diluted. To increase the
+ density of jobs, set the concentration to a higher value.</td>
+ </tr>
+ <tr>
+ <td><code>-debug</code></td>
+ <td>Run the Folder in debug mode. By default it is set to
+ <strong>false</strong>.</td>
+ <td>In debug mode, the Folder will print additional statements for
+ debugging. Also the intermediate files generated in the scratch
+ directory will not be cleaned up.
+ </td>
+ </tr>
+ <tr>
+ <td><code>-seed</code></td>
+ <td>Initial seed to the Random Number Generator. By default, a Random
+ Number Generator is used to generate a seed and the seed value is
+ reported back to the user for future use.
+ </td>
+ <td>If an initial seed is passed, then the <code>Random Number
+ Generator</code> will generate the random numbers in the same
+ sequence, i.e. the sequence of random numbers remains the same if the
+ same seed is used. Folder uses Random Number Generator to decide
+ whether or not to emit the job.
+ </td>
+ </tr>
+ <tr>
+ <td><code>-temp-directory</code></td>
+ <td>Temporary directory for the Folder. By default the <strong>output
+ folder's parent directory</strong> is used as the scratch space.
+ </td>
+ <td>This is the scratch space used by Folder. All the
+ temporary files are cleaned up in the end unless the Folder is run
+ in <code>debug</code> mode.</td>
+ </tr>
+ <tr>
+ <td><code>-skew-buffer-length</code></td>
+ <td>Enables <em>Folder</em> to tolerate skewed jobs.
+ The default buffer length is <strong>0</strong>.</td>
+ <td>'<code>-skew-buffer-length 100</code>'
+ indicates that if the jobs appear out of order within a window
+ size of 100, then they will be emitted in-order by the folder.
+ If a job appears out-of-order outside this window, then the Folder
+ will bail out provided <code>-allow-missorting</code> is not set.
+ <em>Folder</em> reports the maximum skew size seen in the
+ input trace for future use.
+ </td>
+ </tr>
+ <tr>
+ <td><code>-allow-missorting</code></td>
+ <td>Enables <em>Folder</em> to tolerate out-of-order jobs. By default
+ mis-sorting is not allowed.
+ </td>
+ <td>If mis-sorting is allowed, then the <em>Folder</em> will ignore
+ out-of-order jobs that cannot be deskewed using a skew buffer of
+ size specified using <code>-skew-buffer-length</code>. If
+ mis-sorting is not allowed, then the Folder will bail out if the
+ skew buffer is incapable of tolerating the skew.
+ </td>
+ </tr>
+ </table>
+
+ <section>
+ <title>Examples</title>
+ <section>
+ <title>Folding an input trace with 10 hours of total runtime to
+ generate an output trace with 1 hour of total runtime</title>
+ <source>java org.apache.hadoop.tools.rumen.Folder -output-duration 1h -input-cycle 20m file:///home/user/job-trace.json file:///home/user/job-trace-1hr.json</source>
+ <p></p>
+ <p>If the folded jobs are out of order then the command
+ will bail out.
+ </p>
+ <p>
+
+ </p>
+ </section>
+
+ <section>
+ <title>Folding an input trace with 10 hours of total runtime to
+ generate an output trace with 1 hour of total runtime and
+ tolerate some skewness
+ </title>
+ <source>java org.apache.hadoop.tools.rumen.Folder -output-duration 1h -input-cycle 20m -allow-missorting -skew-buffer-length 100 file:///home/user/job-trace.json file:///home/user/job-trace-1hr.json</source>
+ <p></p>
+ <p>If the folded jobs are out of order, then at most
+ 100 jobs will be de-skewed. If the 101<sup>st</sup> job is
+ <em>out-of-order</em>, then the command will bail out.
+ </p>
+ </section>
+ <section>
+ <title>Folding an input trace with 10 hours of total runtime to
+ generate an output trace with 1 hour of total runtime in debug
+ mode
+ </title>
+ <source>java org.apache.hadoop.tools.rumen.Folder -output-duration 1h -input-cycle 20m -debug -temp-directory file:///tmp/debug file:///home/user/job-trace.json file:///home/user/job-trace-1hr.json</source>
+ <p></p>
+ <p>This will fold the 10hr job-trace file
+ <code>file:///home/user/job-trace.json</code> to finish within 1hr
+ and use <code>file:///tmp/debug</code> as the temporary directory.
+ The intermediate files in the temporary directory will not be cleaned
+ up.
+ </p>
+ </section>
+
+ <section>
+ <title>Folding an input trace with 10 hours of total runtime to
+ generate an output trace with 1 hour of total runtime with custom
+ concentration.
+ </title>
+ <source>java org.apache.hadoop.tools.rumen.Folder -output-duration 1h -input-cycle 20m -concentration 2 file:///home/user/job-trace.json file:///home/user/job-trace-1hr.json</source>
+ <p></p>
+ <p>This will fold the 10hr job-trace file
+ <code>file:///home/user/job-trace.json</code> to finish within 1hr
+ with a concentration of 2. Folding a 10hr trace into a 1hr trace
+ would normally retain only 10% of the jobs. With <em>concentration</em>
+ set to 2, 20% of the total input jobs will be retained.
+ </p>
+ </section>
+ </section>
+ </section>
+ <p></p><p></p><p></p>
+ </section>
+
+ <!--
+ Appendix [Resources i.e ppts, jiras, definition etc]
+ -->
+ <section>
+ <title>Appendix</title>
+
+ <section>
+ <title>Resources</title>
+ <p><a href="https://issues.apache.org/jira/browse/MAPREDUCE-751">MAPREDUCE-751</a> is the main JIRA that introduced <em>Rumen</em> to <em>MapReduce</em>.
+ Look at the MapReduce <a href="https://issues.apache.org/jira/browse/MAPREDUCE/component/12313617">rumen-component</a> for further details.</p>
+ </section>
+
+ <section>
+ <title>Dependencies</title>
+ <p><em>Rumen</em> expects certain library <em>JARs</em> to be present in
+ the <em>CLASSPATH</em>.
+ The required libraries are </p>
+ <ul>
+ <li><code>Hadoop MapReduce Tools</code> (<code>hadoop-mapred-tools-{hadoop-version}.jar</code>)</li>
+ <li><code>Hadoop Common</code> (<code>hadoop-common-{hadoop-version}.jar</code>)</li>
+ <li><code>Apache Commons Logging</code> (<code>commons-logging-1.1.1.jar</code>)</li>
+ <li><code>Apache Commons CLI</code> (<code>commons-cli-1.2.jar</code>)</li>
+ <li><code>Jackson Mapper</code> (<code>jackson-mapper-asl-1.4.2.jar</code>)</li>
+ <li><code>Jackson Core</code> (<code>jackson-core-asl-1.4.2.jar</code>)</li>
+ </ul>
+
+ <note>One simple way to run Rumen is to use '$HADOOP_HOME/bin/hadoop jar'
+ option to run it.
+ </note>
+ </section>
+ </section>
+</body>
+</document>
Modified: hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/site.xml
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/site.xml?rev=996420&r1=996419&r2=996420&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/site.xml (original)
+++ hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/site.xml Mon Sep 13 05:30:29 2010
@@ -43,6 +43,7 @@ See http://forrest.apache.org/docs/linki
<vaidya label="Vaidya" href="vaidya.html"/>
<archives label="Hadoop Archives" href="hadoop_archives.html"/>
<gridmix label="Gridmix" href="gridmix.html"/>
+ <Rumen label="Rumen" href="rumen.html"/>
</docs>
<docs label="Schedulers">
Modified: hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/ClusterStory.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/ClusterStory.java?rev=996420&r1=996419&r2=996420&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/ClusterStory.java (original)
+++ hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/ClusterStory.java Mon Sep 13 05:30:29 2010
@@ -54,13 +54,13 @@ public interface ClusterStory {
/**
* Get {@link MachineNode} by its host name.
*
- * @return The {@line MachineNode} with the same name. Or null if not found.
+ * @return The {@link MachineNode} with the same name. Or null if not found.
*/
public MachineNode getMachineByName(String name);
/**
* Get {@link RackNode} by its name.
- * @return The {@line RackNode} with the same name. Or null if not found.
+ * @return The {@link RackNode} with the same name. Or null if not found.
*/
public RackNode getRackByName(String name);
Modified: hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/DeskewedJobTraceReader.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/DeskewedJobTraceReader.java?rev=996420&r1=996419&r2=996420&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/DeskewedJobTraceReader.java (original)
+++ hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/DeskewedJobTraceReader.java Mon Sep 13 05:30:29 2010
@@ -72,7 +72,7 @@ public class DeskewedJobTraceReader impl
*
* @param reader
* the {@link JobTraceReader} that's being protected
- * @param skewBufferSize
+ * @param skewBufferLength
* [the number of late jobs that can preced a later out-of-order
* earlier job
* @throws IOException
Modified: hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/JobHistoryParserFactory.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/JobHistoryParserFactory.java?rev=996420&r1=996419&r2=996420&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/JobHistoryParserFactory.java (original)
+++ hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/JobHistoryParserFactory.java Mon Sep 13 05:30:29 2010
@@ -38,7 +38,7 @@ public class JobHistoryParserFactory {
throw new IOException("No suitable parser.");
}
- enum VersionDetector {
+ public enum VersionDetector {
Hadoop20() {
@Override
Modified: hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/Node.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/Node.java?rev=996420&r1=996419&r2=996420&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/Node.java (original)
+++ hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/Node.java Mon Sep 13 05:30:29 2010
@@ -24,7 +24,7 @@ import java.util.TreeSet;
/**
* {@link Node} represents a node in the cluster topology. A node can be a
- * {@MachineNode}, or a {@link RackNode}, etc.
+ * {@link MachineNode}, or a {@link RackNode}, etc.
*/
public class Node implements Comparable<Node> {
private static final SortedSet<Node> EMPTY_SET =
Modified: hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/TaskAttemptInfo.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/TaskAttemptInfo.java?rev=996420&r1=996419&r2=996420&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/TaskAttemptInfo.java (original)
+++ hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/TaskAttemptInfo.java Mon Sep 13 05:30:29 2010
@@ -17,7 +17,6 @@
*/
package org.apache.hadoop.tools.rumen;
-import org.apache.hadoop.mapred.TaskStatus;
import org.apache.hadoop.mapred.TaskStatus.State;
/**
@@ -38,7 +37,7 @@ public abstract class TaskAttemptInfo {
}
/**
- * Get the final {@link TaskStatus.State} of the task-attempt.
+ * Get the final {@link State} of the task-attempt.
*
* @return the final <code>State</code> of the task-attempt
*/
Added: hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/package-info.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/package-info.java?rev=996420&view=auto
==============================================================================
--- hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/package-info.java (added)
+++ hadoop/mapreduce/trunk/src/tools/org/apache/hadoop/tools/rumen/package-info.java Mon Sep 13 05:30:29 2010
@@ -0,0 +1,377 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** Rumen is a data extraction and analysis tool built for
+ * <a href="http://hadoop.apache.org/">Apache Hadoop</a>. Rumen mines job history
+ * logs to extract meaningful data and stores it into an easily-parsed format.
+ *
+ * The default output format of Rumen is <a href="http://www.json.org">JSON</a>.
+ * Rumen uses the <a href="http://jackson.codehaus.org/">Jackson</a> library to
+ * create JSON objects.
+ * <br><br>
+ *
+ * The following classes can be used to programmatically invoke Rumen:
+ * <ol>
+ * <li>
+ * {@link org.apache.hadoop.tools.rumen.JobConfigurationParser}<br>
+ * A parser to parse and filter out interesting properties from job
+ * configuration.
+ *
+ * <br><br>
+ * <i>Sample code</i>:
+ * <pre>
+ * <code>
+ * // An example to parse and filter out job name
+ *
+ * String conf_filename = .. // assume the job configuration filename here
+ *
+ * // construct a list of interesting properties
+ * List<String> interestedProperties = new ArrayList<String>();
+ * interestedProperties.add("mapreduce.job.name");
+ *
+ * JobConfigurationParser jcp =
+ * new JobConfigurationParser(interestedProperties);
+ *
+ * InputStream in = new FileInputStream(conf_filename);
+ * Properties parsedProperties = jcp.parse(in);
+ * </code>
+ * </pre>
+ * Some of the commonly used interesting properties are enumerated in
+ * {@link org.apache.hadoop.tools.rumen.JobConfPropertyNames}. <br><br>
+ *
+ * <b>Note:</b>
+ * A single instance of {@link org.apache.hadoop.tools.rumen.JobConfigurationParser}
+ * can be used to parse multiple job configuration files.
+ *
+ * </li>
+ * <li>
+ * {@link org.apache.hadoop.tools.rumen.JobHistoryParser} <br>
+ * A parser that parses job history files. It is an interface and actual
+ * implementations are defined as Enum in
+ * {@link org.apache.hadoop.tools.rumen.JobHistoryParserFactory}. Note that
+ * {@link org.apache.hadoop.tools.rumen.RewindableInputStream}<br>
+ * is a wrapper class around {@link java.io.InputStream} to make the input
+ * stream rewindable.
+ *
+ * <br>
+ * <i>Sample code</i>:
+ * <pre>
+ * <code>
+ * // An example to parse a current job history file i.e a job history
+ * // file for which the version is known
+ *
+ * String filename = .. // assume the job history filename here
+ *
+ * InputStream in = new FileInputStream(filename);
+ *
+ * HistoryEvent event = null;
+ *
+ * JobHistoryParser parser = new CurrentJHParser(in);
+ *
+ * event = parser.nextEvent();
+ * // process all the events
+ * while (event != null) {
+ * // ... process all event
+ * event = parser.nextEvent();
+ * }
+ *
+ * // close the parser and the underlying stream
+ * parser.close();
+ * </code>
+ * </pre>
+ *
+ * {@link org.apache.hadoop.tools.rumen.JobHistoryParserFactory} provides a
+ * {@link org.apache.hadoop.tools.rumen.JobHistoryParserFactory#getParser(org.apache.hadoop.tools.rumen.RewindableInputStream)}
+ * API to get a parser for parsing the job history file. Note that this
+ * API can be used if the job history version is unknown.<br><br>
+ * <i>Sample code</i>:
+ * <pre>
+ * <code>
+ * // An example to parse a job history for which the version is not
+ * // known i.e using JobHistoryParserFactory.getParser()
+ *
+ * String filename = .. // assume the job history filename here
+ *
+ * InputStream in = new FileInputStream(filename);
+ * RewindableInputStream ris = new RewindableInputStream(in);
+ *
+ * // JobHistoryParserFactory will check and return a parser that can
+ * // parse the file
+ * JobHistoryParser parser = JobHistoryParserFactory.getParser(ris);
+ *
+ * // now use the parser to parse the events
+ * HistoryEvent event = parser.nextEvent();
+ * while (event != null) {
+ * // ... process the event
+ * event = parser.nextEvent();
+ * }
+ *
+ * parser.close();
+ * </code>
+ * </pre>
+ * <b>Note:</b>
+ * Create one instance to parse a job history log and close it after use.
+ * </li>
+ * <li>
+ * {@link org.apache.hadoop.tools.rumen.TopologyBuilder}<br>
+ * Builds the cluster topology based on the job history events. Every
+ * job history file consists of events. Each event can be represented using
+ * {@link org.apache.hadoop.mapreduce.jobhistory.HistoryEvent}.
+ * These events can be passed to {@link org.apache.hadoop.tools.rumen.TopologyBuilder} using
+ * {@link org.apache.hadoop.tools.rumen.TopologyBuilder#process(org.apache.hadoop.mapreduce.jobhistory.HistoryEvent)}.
+ * A cluster topology can be represented using {@link org.apache.hadoop.tools.rumen.LoggedNetworkTopology}.
+ * Once all the job history events are processed, the cluster
+ * topology can be obtained using {@link org.apache.hadoop.tools.rumen.TopologyBuilder#build()}.
+ *
+ * <br><br>
+ * <i>Sample code</i>:
+ * <pre>
+ * <code>
+ * // Building topology for a job history file represented using
+ * // 'filename' and the corresponding configuration file represented
+ * // using 'conf_filename'
+ * String filename = .. // assume the job history filename here
+ * String conf_filename = .. // assume the job configuration filename here
+ *
+ * InputStream jobConfInputStream = new FileInputStream(conf_filename);
+ * InputStream jobHistoryInputStream = new FileInputStream(filename);
+ *
+ * TopologyBuilder tb = new TopologyBuilder();
+ *
+ * // construct a list of interesting properties
+ * List<String> interestingProperties = new ArrayList<String>();
+ * // add the interesting properties here
+ * interestingProperties.add("mapreduce.job.name");
+ *
+ * JobConfigurationParser jcp =
+ * new JobConfigurationParser(interestingProperties);
+ *
+ * // parse the configuration file
+ * tb.process(jcp.parse(jobConfInputStream));
+ *
+ * // read the job history file and pass it to the
+ * // TopologyBuilder.
+ * JobHistoryParser parser = new CurrentJHParser(jobHistoryInputStream);
+ * HistoryEvent e;
+ *
+ * // read and process all the job history events
+ * while ((e = parser.nextEvent()) != null) {
+ * tb.process(e);
+ * }
+ *
+ * LoggedNetworkTopology topology = tb.build();
+ * </code>
+ * </pre>
+ * </li>
+ * <li>
+ * {@link org.apache.hadoop.tools.rumen.JobBuilder}<br>
+ * Summarizes a job history file.
+ * {@link org.apache.hadoop.tools.rumen.TraceBuilder} provides
+ * {@link org.apache.hadoop.tools.rumen.TraceBuilder#extractJobID(String)}
+ * API for extracting job id from job history or job configuration files
+ * which can be used for instantiating {@link org.apache.hadoop.tools.rumen.JobBuilder}.
+ * {@link org.apache.hadoop.tools.rumen.JobBuilder} generates a
+ * {@link org.apache.hadoop.tools.rumen.LoggedJob} object via
+ * {@link org.apache.hadoop.tools.rumen.JobBuilder#build()}.
+ * See {@link org.apache.hadoop.tools.rumen.LoggedJob} for more details.
+ *
+ * <br><br>
+ * <i>Sample code</i>:
+ * <pre>
+ * <code>
+ * // An example to summarize a current job history file 'filename'
+ * // and the corresponding configuration file 'conf_filename'
+ *
+ * String filename = .. // assume the job history filename here
+ * String conf_filename = .. // assume the job configuration filename here
+ *
+ * InputStream jobConfInputStream = new FileInputStream(conf_filename);
+ * InputStream jobHistoryInputStream = new FileInputStream(filename);
+ *
+ * String jobID = TraceBuilder.extractJobID(filename);
+ * JobBuilder jb = new JobBuilder(jobID);
+ *
+ * // construct a list of interesting properties
+ * List<String> interestingProperties = new ArrayList<String>();
+ * // add the interesting properties here
+ * interestingProperties.add("mapreduce.job.name");
+ *
+ * JobConfigurationParser jcp =
+ * new JobConfigurationParser(interestingProperties);
+ *
+ * // parse the configuration file
+ * jb.process(jcp.parse(jobConfInputStream));
+ *
+ * // parse the job history file
+ * JobHistoryParser parser = new CurrentJHParser(jobHistoryInputStream);
+ * try {
+ * HistoryEvent e;
+ * // read and process all the job history events
+ * while ((e = parser.nextEvent()) != null) {
+ * jb.process(e);
+ * }
+ * } finally {
+ * parser.close();
+ * }
+ *
+ * LoggedJob job = jb.build();
+ * </code>
+ * </pre>
+ * <b>Note:</b>
+ * The order of parsing the job configuration file or job history file is
+ * not important. Create one instance to parse the history file and job
+ * configuration.
+ * </li>
+ * <li>
+ * {@link org.apache.hadoop.tools.rumen.DefaultOutputter}<br>
+ * Implements {@link org.apache.hadoop.tools.rumen.Outputter} and writes
+ * JSON object in text format to the output file.
+ * {@link org.apache.hadoop.tools.rumen.DefaultOutputter} can be
+ * initialized with the output filename.
+ *
+ * <br><br>
+ * <i>Sample code</i>:
+ * <pre>
+ * <code>
+ * // An example to summarize a current job history file represented by
+ * // 'filename' and the configuration filename represented using
+ * // 'conf_filename'. Also output the job summary to 'out.json' along
+ * // with the cluster topology to 'topology.json'.
+ *
+ * String filename = .. // assume the job history filename here
+ * String conf_filename = .. // assume the job configuration filename here
+ *
+ * Configuration conf = new Configuration();
+ * DefaultOutputter outputter = new DefaultOutputter();
+ * outputter.init("out.json", conf);
+ *
+ * InputStream jobConfInputStream = new FileInputStream(conf_filename);
+ * InputStream jobHistoryInputStream = new FileInputStream(filename);
+ *
+ * // extract the job-id from the filename
+ * String jobID = TraceBuilder.extractJobID(filename);
+ * JobBuilder jb = new JobBuilder(jobID);
+ * TopologyBuilder tb = new TopologyBuilder();
+ *
+ * // construct a list of interesting properties
+ * List<String> interestingProperties = new ArrayList<String>();
+ * // add the interesting properties here
+ * interestingProperties.add("mapreduce.job.name");
+ *
+ * JobConfigurationParser jcp =
+ * new JobConfigurationParser(interestingProperties);
+ *
+ * // parse the configuration file
+ * tb.process(jcp.parse(jobConfInputStream));
+ *
+ * // read the job history file and pass it to the
+ * // JobBuilder and TopologyBuilder.
+ * JobHistoryParser parser = new CurrentJHParser(jobHistoryInputStream);
+ * HistoryEvent e;
+ * while ((e = parser.nextEvent()) != null) {
+ * jb.process(e);
+ * tb.process(e);
+ * }
+ *
+ * LoggedJob j = jb.build();
+ *
+ * // serialize the job summary in json (text) format
+ * outputter.output(j);
+ *
+ * // close
+ * outputter.close();
+ *
+ * outputter.init("topology.json", conf);
+ *
+ * // get the cluster topology using TopologyBuilder
+ * LoggedNetworkTopology topology = tb.build();
+ *
+ * // serialize the cluster topology in json (text) format
+ * outputter.output(topology);
+ *
+ * // close
+ * outputter.close();
+ * </code>
+ * </pre>
+ * </li>
+ * <li>
+ * {@link org.apache.hadoop.tools.rumen.JobTraceReader}<br>
+ * A reader for reading {@link org.apache.hadoop.tools.rumen.LoggedJob} serialized using
+ * {@link org.apache.hadoop.tools.rumen.DefaultOutputter}. {@link org.apache.hadoop.tools.rumen.LoggedJob}
+ * provides various APIs for extracting job details. Following are the most
+ * commonly used ones
+ * <ul>
+ * <li>{@link org.apache.hadoop.tools.rumen.LoggedJob#getMapTasks()} : Get the map tasks</li>
+ * <li>{@link org.apache.hadoop.tools.rumen.LoggedJob#getReduceTasks()} : Get the reduce tasks</li>
+ * <li>{@link org.apache.hadoop.tools.rumen.LoggedJob#getOtherTasks()} : Get the setup/cleanup tasks</li>
+ * <li>{@link org.apache.hadoop.tools.rumen.LoggedJob#getOutcome()} : Get the job's outcome</li>
+ * <li>{@link org.apache.hadoop.tools.rumen.LoggedJob#getSubmitTime()} : Get the job's submit time</li>
+ * <li>{@link org.apache.hadoop.tools.rumen.LoggedJob#getFinishTime()} : Get the job's finish time</li>
+ * </ul>
+ *
+ * <br><br>
+ * <i>Sample code</i>:
+ * <pre>
+ * <code>
+ * // An example to read job summary from a trace file 'out.json'.
+ * JobTraceReader reader = new JobTraceReader("out.json");
+ * LoggedJob job = reader.getNext();
+ * while (job != null) {
+ * // .... process job level information
+ * for (LoggedTask task : job.getMapTasks()) {
+ * // process all the map tasks in the job
+ * for (LoggedTaskAttempt attempt : task.getAttempts()) {
+ * // process all the map task attempts in the job
+ * }
+ * }
+ *
+ * // get the next job
+ * job = reader.getNext();
+ * }
+ * reader.close();
+ * </code>
+ * </pre>
+ * </li>
+ * <li>
+ * {@link org.apache.hadoop.tools.rumen.ClusterTopologyReader}<br>
+ * A reader to read {@link org.apache.hadoop.tools.rumen.LoggedNetworkTopology} serialized using
+ * {@link org.apache.hadoop.tools.rumen.DefaultOutputter}. {@link org.apache.hadoop.tools.rumen.ClusterTopologyReader} can be
+ * initialized using the serialized topology filename.
+ * {@link org.apache.hadoop.tools.rumen.ClusterTopologyReader#get()} can
+ * be used to get the
+ * {@link org.apache.hadoop.tools.rumen.LoggedNetworkTopology}.
+ *
+ * <br><br>
+ * <i>Sample code</i>:
+ * <pre>
+ * <code>
+ * // An example to read the cluster topology from a topology output file
+ * // 'topology.json'
+ * ClusterTopologyReader reader = new ClusterTopologyReader("topology.json");
+ * LoggedNetworkTopology topology = reader.get();
+ * for (LoggedNetworkTopology t : topology.getChildren()) {
+ * // process the cluster topology
+ * }
+ * reader.close();
+ * </code>
+ * </pre>
+ * </li>
+ * </ol>
+ */
+
+package org.apache.hadoop.tools.rumen;
\ No newline at end of file