You are viewing a plain text version of this content. The canonical link for it is here.
Posted to mapreduce-commits@hadoop.apache.org by om...@apache.org on 2011/03/08 06:59:16 UTC
svn commit: r1079239 - in
/hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix:
CompressionEmulationUtil.java GenerateData.java Gridmix.java
RandomTextDataGenerator.java
Author: omalley
Date: Tue Mar 8 05:59:16 2011
New Revision: 1079239
URL: http://svn.apache.org/viewvc?rev=1079239&view=rev
Log:
commit 8362e614ab0b7a829c8cf73bff1b8e4d24d23444
Author: Amar Ramesh Kamat <am...@yahoo-inc.com>
Date: Sat Jan 8 11:17:19 2011 +0530
: Publish compression ratio in Gridmix (amarrk)
+++ b/YAHOO-CHANGES.txt
+ : Publish compression ratio in Gridmix. Patch is
+ available at (amarrk)
+
Modified:
hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/CompressionEmulationUtil.java
hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GenerateData.java
hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Gridmix.java
hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/RandomTextDataGenerator.java
Modified: hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/CompressionEmulationUtil.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/CompressionEmulationUtil.java?rev=1079239&r1=1079238&r2=1079239&view=diff
==============================================================================
--- hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/CompressionEmulationUtil.java (original)
+++ hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/CompressionEmulationUtil.java Tue Mar 8 05:59:16 2011
@@ -27,6 +27,7 @@ import org.apache.commons.logging.LogFac
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
@@ -39,6 +40,7 @@ import org.apache.hadoop.io.compress.Com
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Utils;
import org.apache.hadoop.mapred.gridmix.GenerateData.GenDataFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MRJobConfig;
@@ -46,6 +48,7 @@ import org.apache.hadoop.mapreduce.Mappe
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;
+import org.apache.hadoop.util.StringUtils;
/**
* This is a utility class for all the compression related modules.
@@ -78,13 +81,11 @@ class CompressionEmulationUtil {
protected void setup(Context context)
throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
- int size =
- conf.getInt(RandomTextDataGenerator.GRIDMIX_DATAGEN_RANDOMTEXT_LISTSIZE,
- 100);
+ int listSize =
+ RandomTextDataGenerator.getRandomTextDataGeneratorListSize(conf);
int wordSize =
- conf.getInt(RandomTextDataGenerator.GRIDMIX_DATAGEN_RANDOMTEXT_WORDSIZE,
- 10);
- rtg = new RandomTextDataGenerator(size, null, wordSize);
+ RandomTextDataGenerator.getRandomTextDataGeneratorWordSize(conf);
+ rtg = new RandomTextDataGenerator(listSize, wordSize);
}
/**
@@ -112,7 +113,7 @@ class CompressionEmulationUtil {
*/
static void configure(final Job job) throws IOException, InterruptedException,
ClassNotFoundException {
- LOG.info("Gridmix is configured to use compressed data.");
+ LOG.info("Gridmix is configured to generate compressed input data.");
// set the random text mapper
job.setMapperClass(RandomTextDataMapper.class);
job.setNumReduceTasks(0);
@@ -130,6 +131,60 @@ class CompressionEmulationUtil {
}
}
+ /** Publishes compression-related data statistics. The following statistics are
+ * published:
+ * <ul>
+ * <li>Total compressed input data size</li>
+ * <li>Number of compressed input data files</li>
+ * <li>Compression Ratio</li>
+ * <li>Text data dictionary size</li>
+ * <li>Random text word size</li>
+ * </ul>
+ */
+ static void publishCompressedDataStatistics(Path inputDir, Configuration conf,
+ long uncompressedDataSize)
+ throws IOException {
+ LOG.info("Generation of compressed data successful.");
+ FileSystem fs = inputDir.getFileSystem(conf);
+ CompressionCodecFactory compressionCodecs =
+ new CompressionCodecFactory(conf);
+
+ // iterate over compressed files and sum up the compressed file sizes
+ long compressedDataSize = 0;
+ int numCompressedFiles = 0;
+ // obtain input data file statuses
+ FileStatus[] outFileStatuses =
+ fs.listStatus(inputDir, new Utils.OutputFileUtils.OutputFilesFilter());
+ for (FileStatus status : outFileStatuses) {
+ // check if the input file is compressed
+ if (compressionCodecs != null) {
+ CompressionCodec codec = compressionCodecs.getCodec(status.getPath());
+ if (codec != null) {
+ ++numCompressedFiles;
+ compressedDataSize += status.getLen();
+ }
+ }
+ }
+
+ // publish the input data size
+ LOG.info("Total size of compressed input data (bytes) : "
+ + StringUtils.humanReadableInt(compressedDataSize));
+ LOG.info("Total number of compressed input data files : "
+ + numCompressedFiles);
+
+ // compute the compression ratio
+ double ratio = ((double)compressedDataSize) / uncompressedDataSize;
+
+ // publish the compression ratio
+ LOG.info("Input Data Compression Ratio : " + ratio);
+
+ // publish the random text data generator configuration parameters
+ LOG.info("Compressed data generator list size : "
+ + RandomTextDataGenerator.getRandomTextDataGeneratorListSize(conf));
+ LOG.info("Compressed data generator word size : "
+ + RandomTextDataGenerator.getRandomTextDataGeneratorWordSize(conf));
+ }
+
/**
* Enables/Disables compression emulation.
* @param conf Target configuration where the parameter
@@ -179,13 +234,15 @@ class CompressionEmulationUtil {
CompressionCodecFactory compressionCodecs =
new CompressionCodecFactory(conf);
CompressionCodec codec = compressionCodecs.getCodec(file);
- Decompressor decompressor = CodecPool.getDecompressor(codec);
if (codec != null) {
- CompressionInputStream in =
- codec.createInputStream(fs.open(file), decompressor);
- //TODO Seek doesnt work with compressed input stream.
- // Use SplittableCompressionCodec?
- return (InputStream)in;
+ Decompressor decompressor = CodecPool.getDecompressor(codec);
+ if (decompressor != null) {
+ CompressionInputStream in =
+ codec.createInputStream(fs.open(file), decompressor);
+ //TODO Seek doesnt work with compressed input stream.
+ // Use SplittableCompressionCodec?
+ return (InputStream)in;
+ }
}
}
FSDataInputStream in = fs.open(file);
Modified: hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GenerateData.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GenerateData.java?rev=1079239&r1=1079238&r2=1079239&view=diff
==============================================================================
--- hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GenerateData.java (original)
+++ hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GenerateData.java Tue Mar 8 05:59:16 2011
@@ -30,6 +30,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
@@ -41,6 +42,7 @@ import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Utils;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
@@ -52,6 +54,7 @@ import org.apache.hadoop.mapreduce.TaskA
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.util.StringUtils;
// TODO can replace with form of GridmixJob
class GenerateData extends GridmixJob {
@@ -94,6 +97,40 @@ class GenerateData extends GridmixJob {
FileOutputFormat.setOutputPath(job, outdir);
}
+ /**
+ * Publish the data statistics.
+ */
+ void publishDataStatistics(Path inputDir, long genBytes) throws IOException {
+ if (CompressionEmulationUtil
+ .isCompressionEmulationEnabled(job.getConfiguration())) {
+ CompressionEmulationUtil.publishCompressedDataStatistics(inputDir,
+ job.getConfiguration(), genBytes);
+ } else {
+ publishPlainDataStatistics(job.getConfiguration(), inputDir);
+ }
+ }
+
+ static void publishPlainDataStatistics(Configuration conf, Path inputDir)
+ throws IOException {
+ LOG.info("Input data generation successful.");
+ FileSystem fs = inputDir.getFileSystem(conf);
+
+ // obtain input data file statuses
+ FileStatus[] outFileStatuses =
+ fs.listStatus(inputDir, new Utils.OutputFileUtils.OutputFilesFilter());
+ long dataSize = 0;
+
+ for (FileStatus status : outFileStatuses) {
+ // add this file's length to the running total of generated input data
+ dataSize += status.getLen();
+ }
+
+ // publish the plain data statistics
+ LOG.info("Total size of input data : "
+ + StringUtils.humanReadableInt(dataSize));
+ LOG.info("Total number of input data files : " + outFileStatuses.length);
+ }
+
@Override
public Job call() throws IOException, InterruptedException,
ClassNotFoundException {
Modified: hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Gridmix.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Gridmix.java?rev=1079239&r1=1079238&r2=1079239&view=diff
==============================================================================
--- hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Gridmix.java (original)
+++ hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Gridmix.java Tue Mar 8 05:59:16 2011
@@ -111,7 +111,7 @@ public class Gridmix extends Configured
throws IOException, InterruptedException {
Path inputDir = new Path(ioPath, "input");
final Configuration conf = getConf();
- final GridmixJob genData = new GenerateData(conf, inputDir, genbytes);
+ final GenerateData genData = new GenerateData(conf, inputDir, genbytes);
LOG.info("Generating " + StringUtils.humanReadableInt(genbytes) +
" of test data...");
launchGridmixJob(genData);
@@ -124,6 +124,9 @@ public class Gridmix extends Configured
LOG.error("Couldnt change the file permissions " , e);
throw new IOException(e);
}
+
+ // publish the data statistics
+ genData.publishDataStatistics(inputDir, genbytes);
}
/**
Modified: hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/RandomTextDataGenerator.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/RandomTextDataGenerator.java?rev=1079239&r1=1079238&r2=1079239&view=diff
==============================================================================
--- hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/RandomTextDataGenerator.java (original)
+++ hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/RandomTextDataGenerator.java Tue Mar 8 05:59:16 2011
@@ -22,42 +22,64 @@ import java.util.List;
import java.util.Random;
import org.apache.commons.lang.RandomStringUtils;
+import org.apache.hadoop.conf.Configuration;
/**
* A random text generator. The words are simply sequences of alphabets.
*/
class RandomTextDataGenerator {
/**
- * Random words list size.
+ * Configuration key for random text data generator's list size.
*/
static final String GRIDMIX_DATAGEN_RANDOMTEXT_LISTSIZE =
"gridmix.datagenerator.randomtext.listsize";
/**
- * Random words size.
+ * Configuration key for random text data generator's word size.
*/
static final String GRIDMIX_DATAGEN_RANDOMTEXT_WORDSIZE =
"gridmix.datagenerator.randomtext.wordsize";
/**
+ * Default random text data generator's list size.
+ */
+ static final int DEFAULT_LIST_SIZE = 100;
+
+ /**
+ * Default random text data generator's word size.
+ */
+ static final int DEFAULT_WORD_SIZE = 10;
+
+ /**
+ * Default random text data generator's seed.
+ */
+ static final long DEFAULT_SEED = 0L;
+
+ /**
* A list of random words
*/
private String[] words;
private Random random;
/**
+ * Constructor for {@link RandomTextDataGenerator} with default seed.
+ * @param size the total number of words to consider.
+ * @param wordSize Size of each word
+ */
+ RandomTextDataGenerator(int size, int wordSize) {
+ this(size, DEFAULT_SEED , wordSize);
+ }
+
+ /**
* Constructor for {@link RandomTextDataGenerator}.
* @param size the total number of words to consider.
* @param seed Random number generator seed for repeatability
* @param wordSize Size of each word
*/
RandomTextDataGenerator(int size, Long seed, int wordSize) {
- if (seed == null) {
- random = new Random();
- } else {
- random = new Random(seed);
- }
+ random = new Random(seed);
words = new String[size];
+
//TODO change the default with the actual stats
//TODO do u need varied sized words?
for (int i = 0; i < size; ++i) {
@@ -67,6 +89,20 @@ class RandomTextDataGenerator {
}
/**
+ * Get the configured random text data generator list size.
+ */
+ static int getRandomTextDataGeneratorListSize(Configuration conf) {
+ return conf.getInt(GRIDMIX_DATAGEN_RANDOMTEXT_LISTSIZE, DEFAULT_LIST_SIZE);
+ }
+
+ /**
+ * Get the configured random text data generator word size.
+ */
+ static int getRandomTextDataGeneratorWordSize(Configuration conf) {
+ return conf.getInt(GRIDMIX_DATAGEN_RANDOMTEXT_WORDSIZE, DEFAULT_WORD_SIZE);
+ }
+
+ /**
* Returns a randomly selected word from a list of random words.
*/
String getRandomWord() {