You are viewing a plain text version of this content. The canonical link for it is here.
Posted to mapreduce-commits@hadoop.apache.org by om...@apache.org on 2011/03/08 06:59:16 UTC

svn commit: r1079239 - in /hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix: CompressionEmulationUtil.java GenerateData.java Gridmix.java RandomTextDataGenerator.java

Author: omalley
Date: Tue Mar  8 05:59:16 2011
New Revision: 1079239

URL: http://svn.apache.org/viewvc?rev=1079239&view=rev
Log:
commit 8362e614ab0b7a829c8cf73bff1b8e4d24d23444
Author: Amar Ramesh Kamat <am...@yahoo-inc.com>
Date:   Sat Jan 8 11:17:19 2011 +0530

     : Publish compression ratio in Gridmix (amarrk)
    
    +++ b/YAHOO-CHANGES.txt
    +  : Publish compression ratio in Gridmix. Patch is
    +  available at  (amarrk)
    +

Modified:
    hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/CompressionEmulationUtil.java
    hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GenerateData.java
    hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Gridmix.java
    hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/RandomTextDataGenerator.java

Modified: hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/CompressionEmulationUtil.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/CompressionEmulationUtil.java?rev=1079239&r1=1079238&r2=1079239&view=diff
==============================================================================
--- hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/CompressionEmulationUtil.java (original)
+++ hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/CompressionEmulationUtil.java Tue Mar  8 05:59:16 2011
@@ -27,6 +27,7 @@ import org.apache.commons.logging.LogFac
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.LongWritable;
@@ -39,6 +40,7 @@ import org.apache.hadoop.io.compress.Com
 import org.apache.hadoop.io.compress.Decompressor;
 import org.apache.hadoop.io.compress.GzipCodec;
 import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Utils;
 import org.apache.hadoop.mapred.gridmix.GenerateData.GenDataFormat;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.MRJobConfig;
@@ -46,6 +48,7 @@ import org.apache.hadoop.mapreduce.Mappe
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.util.ReflectionUtils;
+import org.apache.hadoop.util.StringUtils;
 
 /**
  * This is a utility class for all the compression related modules.
@@ -78,13 +81,11 @@ class CompressionEmulationUtil {
     protected void setup(Context context)
         throws IOException, InterruptedException {
       Configuration conf = context.getConfiguration();
-      int size = 
-        conf.getInt(RandomTextDataGenerator.GRIDMIX_DATAGEN_RANDOMTEXT_LISTSIZE,
-                    100);
+      int listSize = 
+        RandomTextDataGenerator.getRandomTextDataGeneratorListSize(conf);
       int wordSize = 
-        conf.getInt(RandomTextDataGenerator.GRIDMIX_DATAGEN_RANDOMTEXT_WORDSIZE,
-                    10);
-      rtg = new RandomTextDataGenerator(size, null, wordSize);
+        RandomTextDataGenerator.getRandomTextDataGeneratorWordSize(conf);
+      rtg = new RandomTextDataGenerator(listSize, wordSize);
     }
     
     /**
@@ -112,7 +113,7 @@ class CompressionEmulationUtil {
    */
   static void configure(final Job job) throws IOException, InterruptedException,
                                               ClassNotFoundException {
-    LOG.info("Gridmix is configured to use compressed data.");
+    LOG.info("Gridmix is configured to generate compressed input data.");
     // set the random text mapper
     job.setMapperClass(RandomTextDataMapper.class);
     job.setNumReduceTasks(0);
@@ -130,6 +131,60 @@ class CompressionEmulationUtil {
     }
   }
   
+  /** Publishes compression related data statistics. Following statistics are
+   * published
+   * <ul>
+   *   <li>Total compressed input data size</li>
+   *   <li>Number of compressed input data files</li>
+   *   <li>Compression Ratio</li>
+   *   <li>Text data dictionary size</li>
+   *   <li>Random text word size</li>
+   * </ul>
+   * @param uncompressedDataSize size of the data before compression; the ratio
+   *        is published as compressed size / uncompressedDataSize
+   */
+  static void publishCompressedDataStatistics(Path inputDir, Configuration conf,
+                                              long uncompressedDataSize) 
+  throws IOException {
+    LOG.info("Generation of compressed data successful.");
+    FileSystem fs = inputDir.getFileSystem(conf);
+    CompressionCodecFactory compressionCodecs = 
+      new CompressionCodecFactory(conf);
+
+    // iterate over compressed files and sum up the compressed file sizes
+    long compressedDataSize = 0;
+    int numCompressedFiles = 0;
+    // obtain input data file statuses
+    FileStatus[] outFileStatuses = 
+      fs.listStatus(inputDir, new Utils.OutputFileUtils.OutputFilesFilter());
+    for (FileStatus status : outFileStatuses) {
+      // a file counts as compressed when a codec is resolved for its path
+      CompressionCodec codec = compressionCodecs.getCodec(status.getPath());
+      if (codec != null) {
+        ++numCompressedFiles;
+        compressedDataSize += status.getLen();
+      }
+    }
+
+    // publish the input data size
+    LOG.info("Total size of compressed input data (bytes) : " 
+             + StringUtils.humanReadableInt(compressedDataSize));
+    LOG.info("Total number of compressed input data files : " 
+             + numCompressedFiles);
+
+    // compute the compression ratio
+    double ratio = ((double)compressedDataSize) / uncompressedDataSize;
+
+    // publish the compression ratio
+    LOG.info("Input Data Compression Ratio : " + ratio);
+
+    // publish the random text data generator configuration parameters
+    LOG.info("Compressed data generator list size : " 
+        + RandomTextDataGenerator.getRandomTextDataGeneratorListSize(conf));
+    LOG.info("Compressed data generator word size : " 
+        + RandomTextDataGenerator.getRandomTextDataGeneratorWordSize(conf));
+  }
+  
   /**
    * Enables/Disables compression emulation.
    * @param conf Target configuration where the parameter 
@@ -179,13 +234,15 @@ class CompressionEmulationUtil {
       CompressionCodecFactory compressionCodecs = 
         new CompressionCodecFactory(conf);
       CompressionCodec codec = compressionCodecs.getCodec(file);
-      Decompressor decompressor = CodecPool.getDecompressor(codec);
       if (codec != null) {
-        CompressionInputStream in = 
-          codec.createInputStream(fs.open(file), decompressor);
-        //TODO Seek doesnt work with compressed input stream. 
-        //     Use SplittableCompressionCodec?
-        return (InputStream)in;
+        Decompressor decompressor = CodecPool.getDecompressor(codec);
+        if (decompressor != null) {
+          CompressionInputStream in = 
+            codec.createInputStream(fs.open(file), decompressor);
+          //TODO Seek doesnt work with compressed input stream. 
+          //     Use SplittableCompressionCodec?
+          return (InputStream)in;
+        }
       }
     }
     FSDataInputStream in = fs.open(file);

Modified: hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GenerateData.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GenerateData.java?rev=1079239&r1=1079238&r2=1079239&view=diff
==============================================================================
--- hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GenerateData.java (original)
+++ hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/GenerateData.java Tue Mar  8 05:59:16 2011
@@ -30,6 +30,7 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.permission.FsPermission;
@@ -41,6 +42,7 @@ import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapred.ClusterStatus;
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Utils;
 import org.apache.hadoop.mapreduce.InputFormat;
 import org.apache.hadoop.mapreduce.InputSplit;
 import org.apache.hadoop.mapreduce.Job;
@@ -52,6 +54,7 @@ import org.apache.hadoop.mapreduce.TaskA
 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.util.StringUtils;
 
 // TODO can replace with form of GridmixJob
 class GenerateData extends GridmixJob {
@@ -94,6 +97,40 @@ class GenerateData extends GridmixJob {
     FileOutputFormat.setOutputPath(job, outdir);
   }
 
+  /**
+   * Publish the data statistics.
+   */
+  void publishDataStatistics(Path inputDir, long genBytes) throws IOException {
+    if (CompressionEmulationUtil
+        .isCompressionEmulationEnabled(job.getConfiguration())) {
+      CompressionEmulationUtil.publishCompressedDataStatistics(inputDir, 
+                                 job.getConfiguration(), genBytes);
+    } else {
+      publishPlainDataStatistics(job.getConfiguration(), inputDir);
+    }
+  }
+  
+  /** Logs the total size and the number of the generated input data files. */
+  static void publishPlainDataStatistics(Configuration conf, Path inputDir) 
+  throws IOException {
+    LOG.info("Input data generation successful.");
+    // list the generated files found under the input directory
+    FileSystem fs = inputDir.getFileSystem(conf);
+    FileStatus[] outFileStatuses = 
+      fs.listStatus(inputDir, new Utils.OutputFileUtils.OutputFilesFilter());
+
+    // add up the length of every listed file
+    long dataSize = 0;
+    for (int i = 0; i < outFileStatuses.length; ++i) {
+      dataSize += outFileStatuses[i].getLen();
+    }
+
+    // publish the plain data statistics
+    LOG.info("Total size of input data : " 
+             + StringUtils.humanReadableInt(dataSize));
+    LOG.info("Total number of input data files : " + outFileStatuses.length);
+  }
+  
   @Override
   public Job call() throws IOException, InterruptedException,
                            ClassNotFoundException {

Modified: hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Gridmix.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Gridmix.java?rev=1079239&r1=1079238&r2=1079239&view=diff
==============================================================================
--- hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Gridmix.java (original)
+++ hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/Gridmix.java Tue Mar  8 05:59:16 2011
@@ -111,7 +111,7 @@ public class Gridmix extends Configured 
       throws IOException, InterruptedException {
     Path inputDir = new Path(ioPath, "input");
     final Configuration conf = getConf();
-    final GridmixJob genData = new GenerateData(conf, inputDir, genbytes);
+    final GenerateData genData = new GenerateData(conf, inputDir, genbytes);
     LOG.info("Generating " + StringUtils.humanReadableInt(genbytes) +
         " of test data...");
     launchGridmixJob(genData);
@@ -124,6 +124,9 @@ public class Gridmix extends Configured 
       LOG.error("Couldnt change the file permissions " , e);
       throw new IOException(e);
     }
+    
+    // publish the data statistics
+    genData.publishDataStatistics(inputDir, genbytes);
   }
 
   /**

Modified: hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/RandomTextDataGenerator.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/RandomTextDataGenerator.java?rev=1079239&r1=1079238&r2=1079239&view=diff
==============================================================================
--- hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/RandomTextDataGenerator.java (original)
+++ hadoop/mapreduce/branches/yahoo-merge/src/contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/RandomTextDataGenerator.java Tue Mar  8 05:59:16 2011
@@ -22,42 +22,64 @@ import java.util.List;
 import java.util.Random;
 
 import org.apache.commons.lang.RandomStringUtils;
+import org.apache.hadoop.conf.Configuration;
 
 /**
  * A random text generator. The words are simply sequences of alphabets.
  */
 class RandomTextDataGenerator {
   /**
-   * Random words list size.
+   * Configuration key for random text data generator's list size.
    */
   static final String GRIDMIX_DATAGEN_RANDOMTEXT_LISTSIZE = 
     "gridmix.datagenerator.randomtext.listsize";
   
   /**
-   * Random words size.
+   * Configuration key for random text data generator's word size.
    */
   static final String GRIDMIX_DATAGEN_RANDOMTEXT_WORDSIZE = 
     "gridmix.datagenerator.randomtext.wordsize";
   
   /**
+   * Default random text data generator's list size.
+   */
+  static final int DEFAULT_LIST_SIZE = 100;
+  
+  /**
+   * Default random text data generator's word size.
+   */
+  static final int DEFAULT_WORD_SIZE = 10;
+  
+  /**
+   * Default random text data generator's seed.
+   */
+  static final long DEFAULT_SEED = 0L;
+  
+  /**
    * A list of random words
    */
   private String[] words;
   private Random random;
   
   /**
+   * Builds a {@link RandomTextDataGenerator} seeded with {@link #DEFAULT_SEED}.
+   * @param size the total number of words to consider.
+   * @param wordSize Size of each word
+   */
+  RandomTextDataGenerator(int size, int wordSize) {
+    this(size, DEFAULT_SEED , wordSize);
+  }
+  
+  /**
    * Constructor for {@link RandomTextDataGenerator}.
    * @param size the total number of words to consider.
    * @param seed Random number generator seed for repeatability
    * @param wordSize Size of each word
    */
   RandomTextDataGenerator(int size, Long seed, int wordSize) {
-    if (seed == null) {
-      random = new Random();
-    } else {
-      random = new Random(seed);
-    }
+    random = new Random(seed);
     words = new String[size];
+    
     //TODO change the default with the actual stats
     //TODO do u need varied sized words?
     for (int i = 0; i < size; ++i) {
@@ -67,6 +89,20 @@ class RandomTextDataGenerator {
   }
   
   /**
+   * Returns the list size set in conf, or {@link #DEFAULT_LIST_SIZE} if unset.
+   */
+  static int getRandomTextDataGeneratorListSize(Configuration conf) {
+    return conf.getInt(GRIDMIX_DATAGEN_RANDOMTEXT_LISTSIZE, DEFAULT_LIST_SIZE);
+  }
+  
+  /**
+   * Returns the word size set in conf, or {@link #DEFAULT_WORD_SIZE} if unset.
+   */
+  static int getRandomTextDataGeneratorWordSize(Configuration conf) {
+    return conf.getInt(GRIDMIX_DATAGEN_RANDOMTEXT_WORDSIZE, DEFAULT_WORD_SIZE);
+  }
+  
+  /**
    * Returns a randomly selected word from a list of random words.
    */
   String getRandomWord() {