You are viewing a plain text version of this content. The canonical link for it is here.
Posted to mapreduce-commits@hadoop.apache.org by am...@apache.org on 2011/12/16 15:21:00 UTC
svn commit: r1215141 [1/4] - in
/hadoop/common/trunk/hadoop-mapreduce-project: ./ ivy/
src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/
src/docs/src/documentation/content/xdocs/
src/test/mapred/org/apache/hadoop/tools/rumen/ src/tools/org...
Author: amarrk
Date: Fri Dec 16 14:20:58 2011
New Revision: 1215141
URL: http://svn.apache.org/viewvc?rev=1215141&view=rev
Log:
MAPREDUCE-778. Rumen Anonymizer. (Amar Kamat and Chris Douglas via amarrk)
Added:
hadoop/common/trunk/hadoop-mapreduce-project/src/test/mapred/org/apache/hadoop/tools/rumen/TestRumenAnonymization.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/Anonymizer.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/anonymization/
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/anonymization/DataAnonymizer.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/anonymization/WordList.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/anonymization/WordListAnonymizerUtility.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/AnonymizableDataType.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/ClassName.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/DataType.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/DefaultAnonymizableDataType.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/DefaultDataType.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/FileName.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/JobName.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/JobProperties.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/NodeName.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/QueueName.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/UserName.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/util/
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/util/DefaultJobPropertiesParser.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/util/JobPropertyParser.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/datatypes/util/MapReduceJobPropertiesParser.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/serializers/
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/serializers/BlockingSerializer.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/serializers/DefaultAnonymizingRumenSerializer.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/serializers/DefaultRumenSerializer.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/serializers/ObjectStringSerializer.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/state/
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/state/State.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/state/StateDeserializer.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/state/StatePool.java
Modified:
hadoop/common/trunk/hadoop-mapreduce-project/CHANGES.txt
hadoop/common/trunk/hadoop-mapreduce-project/ivy.xml
hadoop/common/trunk/hadoop-mapreduce-project/ivy/libraries.properties
hadoop/common/trunk/hadoop-mapreduce-project/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GridmixJob.java
hadoop/common/trunk/hadoop-mapreduce-project/src/docs/src/documentation/content/xdocs/rumen.xml
hadoop/common/trunk/hadoop-mapreduce-project/src/test/mapred/org/apache/hadoop/tools/rumen/TestRumenFolder.java
hadoop/common/trunk/hadoop-mapreduce-project/src/test/mapred/org/apache/hadoop/tools/rumen/TestRumenJobTraces.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/Folder.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/HadoopLogsAnalyzer.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/JobBuilder.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/JsonObjectMapperWriter.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/LoggedJob.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/LoggedLocation.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/LoggedNetworkTopology.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/LoggedTask.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/LoggedTaskAttempt.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/ParsedHost.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/ZombieCluster.java
hadoop/common/trunk/hadoop-mapreduce-project/src/tools/org/apache/hadoop/tools/rumen/ZombieJob.java
Modified: hadoop/common/trunk/hadoop-mapreduce-project/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-mapreduce-project/CHANGES.txt?rev=1215141&r1=1215140&r2=1215141&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-mapreduce-project/CHANGES.txt (original)
+++ hadoop/common/trunk/hadoop-mapreduce-project/CHANGES.txt Fri Dec 16 14:20:58 2011
@@ -6,6 +6,7 @@ Trunk (unreleased changes)
MAPREDUCE-3545. Remove Avro RPC. (suresh)
NEW FEATURES
+ MAPREDUCE-778. Rumen Anonymizer. (Amar Kamat and Chris Douglas via amarrk)
MAPREDUCE-2669. Add new examples for Mean, Median, and Standard Deviation.
(Plamen Jeliazkov via shv)
Modified: hadoop/common/trunk/hadoop-mapreduce-project/ivy.xml
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-mapreduce-project/ivy.xml?rev=1215141&r1=1215140&r2=1215141&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-mapreduce-project/ivy.xml (original)
+++ hadoop/common/trunk/hadoop-mapreduce-project/ivy.xml Fri Dec 16 14:20:58 2011
@@ -139,6 +139,13 @@
<dependency org="org.vafer" name="jdeb" rev="${jdeb.version}" conf="package->master"/>
<dependency org="org.mortbay.jetty" name="jetty-servlet-tester" rev="${jetty.version}"
conf="test->default"/>
+
+ <!-- dependency for rumen anonymization -->
+ <dependency org="org.codehaus.jackson" name="jackson-core-asl" rev="${jackson.version}"
+ conf="compile->default"/>
+ <dependency org="org.codehaus.jackson" name="jackson-mapper-asl" rev="${jackson.version}"
+ conf="compile->default"/>
+
<!-- dependency addition for the fault injection -->
<dependency org="org.aspectj" name="aspectjrt" rev="${aspectj.version}"
conf="compile->default"/>
Modified: hadoop/common/trunk/hadoop-mapreduce-project/ivy/libraries.properties
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-mapreduce-project/ivy/libraries.properties?rev=1215141&r1=1215140&r2=1215141&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-mapreduce-project/ivy/libraries.properties (original)
+++ hadoop/common/trunk/hadoop-mapreduce-project/ivy/libraries.properties Fri Dec 16 14:20:58 2011
@@ -81,5 +81,6 @@ wagon-http.version=1.0-beta-2
xmlenc.version=0.52
xerces.version=1.4.4
+jackson.version=1.8.2
yarn.version=0.24.0-SNAPSHOT
hadoop-mapreduce.version=0.24.0-SNAPSHOT
Modified: hadoop/common/trunk/hadoop-mapreduce-project/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GridmixJob.java
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-mapreduce-project/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GridmixJob.java?rev=1215141&r1=1215140&r2=1215141&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-mapreduce-project/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GridmixJob.java (original)
+++ hadoop/common/trunk/hadoop-mapreduce-project/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GridmixJob.java Fri Dec 16 14:20:58 2011
@@ -26,8 +26,6 @@ import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Delayed;
import java.util.concurrent.TimeUnit;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
import java.security.PrivilegedExceptionAction;
import org.apache.hadoop.conf.Configuration;
@@ -49,6 +47,7 @@ import org.apache.hadoop.mapreduce.serve
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.tools.rumen.JobStory;
+import static org.apache.hadoop.tools.rumen.datatypes.util.MapReduceJobPropertiesParser.extractMaxHeapOpts;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -92,8 +91,6 @@ abstract class GridmixJob implements Cal
// configuration key to enable/disable task jvm options
static final String GRIDMIX_TASK_JVM_OPTIONS_ENABLE =
"gridmix.task.jvm-options.enable";
- private static final Pattern maxHeapPattern =
- Pattern.compile("-Xmx[0-9]+[kKmMgGtT]?+");
private static void setJobQueue(Job job, String queue) {
if (queue != null) {
@@ -225,18 +222,6 @@ abstract class GridmixJob implements Cal
}
}
}
-
- private static void extractMaxHeapOpts(String javaOptions,
- List<String> maxOpts, List<String> others) {
- for (String opt : javaOptions.split(" ")) {
- Matcher matcher = maxHeapPattern.matcher(opt);
- if (matcher.find()) {
- maxOpts.add(opt);
- } else {
- others.add(opt);
- }
- }
- }
// Scales the desired job-level configuration parameter. This API makes sure
// that the ratio of the job level configuration parameter to the cluster
Modified: hadoop/common/trunk/hadoop-mapreduce-project/src/docs/src/documentation/content/xdocs/rumen.xml
URL: http://svn.apache.org/viewvc/hadoop/common/trunk/hadoop-mapreduce-project/src/docs/src/documentation/content/xdocs/rumen.xml?rev=1215141&r1=1215140&r2=1215141&view=diff
==============================================================================
--- hadoop/common/trunk/hadoop-mapreduce-project/src/docs/src/documentation/content/xdocs/rumen.xml (original)
+++ hadoop/common/trunk/hadoop-mapreduce-project/src/docs/src/documentation/content/xdocs/rumen.xml Fri Dec 16 14:20:58 2011
@@ -73,6 +73,11 @@
computed for the total number of successful tasks for every attempt.
</li>
+ <li>Anonymized traces enable sharing of production traces of large
+ scale Hadoop deployments. Sharing of traces will foster
+ collaboration within the Hadoop community. It can also be used to
+ supplement interesting research findings.
+ </li>
</ul>
</section>
@@ -102,6 +107,11 @@
Increasing the trace runtime might involve adding some dummy jobs to
the resulting trace and scaling up the runtime of individual jobs.
</li>
+ <li><em>Anonymizer</em> :
+ A utility to anonymize Hadoop job and cluster topology traces by
+ masking certain sensitive fields but retaining important workload
+ characteristics.
+ </li>
</ul>
<p></p><p></p><p></p>
@@ -128,10 +138,11 @@
<code>output-duration</code>, <code>concentration</code> etc.
</note>
- <p><em>Rumen</em> provides 2 basic commands</p>
+ <p><em>Rumen</em> provides 3 basic commands</p>
<ul>
<li><code>TraceBuilder</code></li>
<li><code>Folder</code></li>
+ <li><code>Anonymizer</code></li>
</ul>
<p>Firstly, we need to generate the <em>Gold Trace</em>. Hence the first
@@ -139,8 +150,9 @@
The output of the <code>TraceBuilder</code> is a job-trace file (and an
optional cluster-topology file). In case we want to scale the output, we
can use the <code>Folder</code> utility to fold the current trace to the
- desired length. The remaining part of this section explains these
- utilities in detail.
+ desired length. For anonymizing the trace, use the
+ <code>Anonymizer</code> utility. The remaining part of this section
+ explains these utilities in detail.
</p>
<note>Examples in this section assumes that certain libraries are present
@@ -426,8 +438,156 @@
</p>
</section>
</section>
+ <p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p>
+ <p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p>
+ <p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p>
+ <p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p><p></p>
+ <p></p><p></p><p></p><p></p>
+
+ </section>
+
+ <!--
+ Anonymizer command
+ -->
+ <section>
+ <title>Anonymizer</title>
+
+ <p><code>Command:</code></p>
+ <source>java org.apache.hadoop.tools.rumen.Anonymizer [options] [-trace <jobtrace-input> <jobtrace-output>] [-topology <topology-input> <topology-output>]</source>
+
+ <p>This command invokes the <em>Anonymizer</em> utility of
+ <em>Rumen</em>. It anonymizes sensitive information from the
+ <code><jobtrace-input></code> file and outputs the anonymized
+ content into the <code><jobtrace-output></code>
+ file. It also anonymizes the cluster layout (topology) from the
+ <code><topology-input></code> and outputs it in
+ the <code><topology-output></code> file.
+ <code><jobtrace-input></code> represents the job trace file obtained
+ using <code>TraceBuilder</code> or <code>Folder</code>.
+ <code><topology-input></code> represents the cluster topology
+ file obtained using <code>TraceBuilder</code>.
+ </p>
+
+ <p><code>Options :</code></p>
+ <table>
+ <tr>
+ <th>Parameter</th>
+ <th>Description</th>
+ <th>Notes</th>
+ </tr>
+ <tr>
+ <td><code>-trace</code></td>
+ <td>Anonymizes job traces.</td>
+ <td>Anonymizes sensitive fields like user-name, job-name, queue-name,
+ host-names, job configuration parameters etc.</td>
+ </tr>
+ <tr>
+ <td><code>-topology</code></td>
+ <td>Anonymizes cluster topology.</td>
+ <td>Anonymizes rack-names and host-names.</td>
+ </tr>
+ </table>
+
+ <section id="anonymizerconf">
+ <title><em>Anonymizer</em> Configuration Parameters</title>
+ <p>The Rumen anonymizer can be configured using the following
+ configuration parameters:
+ </p>
+ <table>
+ <tr>
+ <th>Parameter</th>
+ <th>Description</th>
+ </tr>
+ <tr>
+ <td>
+ <code>rumen.data-types.classname.preserve</code>
+ </td>
+ <td>A comma separated list of prefixes that the <em>Anonymizer</em>
+ will not anonymize while processing classnames. If
+ <code>rumen.data-types.classname.preserve</code> is set to
+ <code>'org.apache,com.hadoop.'</code> then
+ classnames starting with <code>'org.apache'</code> or
+ <code>'com.hadoop.'</code> will not be anonymized.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>rumen.datatypes.jobproperties.parsers</code>
+ </td>
+ <td>A comma separated list of job properties parsers. These parsers
+ decide how the job configuration parameters
+ (i.e. <key,value> pairs) should be processed. Default is
+ <code>MapReduceJobPropertiesParser</code>. The default parser will
+ only parse framework-level MapReduce specific job configuration
+ properties. Users can add custom parsers by implementing the
+ <code>JobPropertyParser</code> interface. Rumen also provides an
+ all-pass (i.e. no filter) parser called
+ <code>DefaultJobPropertiesParser</code>.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>rumen.anonymization.states.dir</code>
+ </td>
+ <td>Set this to a location (on LocalFileSystem or HDFS) for enabling
+ state persistence and/or reload. This parameter is not set by
+ default. Reloading and persistence of states depend on the state
+ directory. Note that the state directory will contain the latest
+ as well as previous states.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>rumen.anonymization.states.persist</code>
+ </td>
+ <td>Set this to <code>'true'</code> to persist the current state.
+ Default value is <code>'false'</code>. Note that the states will
+ be persisted to the state manager's state directory
+ specified using the <code>rumen.anonymization.states.dir</code>
+ parameter.
+ </td>
+ </tr>
+ <tr>
+ <td>
+ <code>rumen.anonymization.states.reload</code>
+ </td>
+ <td>Set this to <code>'true'</code> to enable reuse of previously
+ persisted state. The default value is <code>'false'</code>. The
+ previously persisted state will be reloaded from the state
+ manager's state directory specified using the
+ <code>rumen.anonymization.states.dir</code> parameter. Note that
+ the <em>Anonymizer</em> will bail out if it fails to find any
+ previously persisted state in the state directory or if the state
+ directory is not set. If the user wishes to retain/reuse the
+ states across multiple invocations of the <em>Anonymizer</em>,
+ then the very first invocation of the <em>Anonymizer</em> should
+ have <code>rumen.anonymization.states.reload</code> set to
+ <code>'false'</code> and
+ <code>rumen.anonymization.states.persist</code> set to
+ <code>'true'</code>. Subsequent invocations of the
+ <em>Anonymizer</em> can then have
+ <code>rumen.anonymization.states.reload</code> set to
+ <code>'true'</code>.
+ </td>
+ </tr>
+ </table>
+ </section>
+
+ <section>
+ <title>Example</title>
+ <source>java org.apache.hadoop.tools.rumen.Anonymizer -trace file:///home/user/job-trace.json file:///home/user/job-trace-anonymized.json -topology file:///home/user/cluster-topology.json file:///home/user/cluster-topology-anonymized.json</source>
+ <p></p>
+ <p>This will anonymize the job details from
+ <code>file:///home/user/job-trace.json</code> and output it to
+ <code>file:///home/user/job-trace-anonymized.json</code>.
+ It will also anonymize the cluster topology layout from
+ <code>file:///home/user/cluster-topology.json</code> and output it to
+ <code>file:///home/user/cluster-topology-anonymized.json</code>.
+ Note that the <code>Anonymizer</code> also supports input and output
+ files on HDFS.
+ </p>
+ </section>
</section>
- <p></p><p></p><p></p>
</section>
<!--
@@ -452,8 +612,8 @@
<li><code>Hadoop Common</code> (<code>hadoop-common-{hadoop-version}.jar</code>)</li>
<li><code>Apache Commons Logging</code> (<code>commons-logging-1.1.1.jar</code>)</li>
<li><code>Apache Commons CLI</code> (<code>commons-cli-1.2.jar</code>)</li>
- <li><code>Jackson Mapper</code> (<code>jackson-mapper-asl-1.4.2.jar</code>)</li>
- <li><code>Jackson Core</code> (<code>jackson-core-asl-1.4.2.jar</code>)</li>
+ <li><code>Jackson Mapper</code> (<code>jackson-mapper-asl-1.8.2.jar</code>)</li>
+ <li><code>Jackson Core</code> (<code>jackson-core-asl-1.8.2.jar</code>)</li>
</ul>
<note>One simple way to run Rumen is to use '$HADOOP_PREFIX/bin/hadoop jar'