Posted to mapreduce-commits@hadoop.apache.org by ra...@apache.org on 2011/05/23 16:22:36 UTC

svn commit: r1126499 [2/2] - in /hadoop/mapreduce/trunk/src: contrib/gridmix/src/java/org/apache/hadoop/mapred/gridmix/ contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/ docs/src/documentation/content/xdocs/

Added: hadoop/mapreduce/trunk/src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestDistCacheEmulation.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestDistCacheEmulation.java?rev=1126499&view=auto
==============================================================================
--- hadoop/mapreduce/trunk/src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestDistCacheEmulation.java (added)
+++ hadoop/mapreduce/trunk/src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestDistCacheEmulation.java Mon May 23 14:22:36 2011
@@ -0,0 +1,513 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred.gridmix;
+
+import static org.junit.Assert.*;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.fs.permission.FsAction;
+import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.MRJobConfig;
+import org.apache.hadoop.mapreduce.MapContext;
+import org.apache.hadoop.mapreduce.MapReduceTestUtil;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.mapreduce.task.MapContextImpl;
+import org.apache.hadoop.security.UserGroupInformation;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Validate emulation of distributed cache load in gridmix simulated jobs.
+ */
+public class TestDistCacheEmulation {
+
+  private DistributedCacheEmulator dce = null;
+
+  @BeforeClass
+  public static void init() throws IOException {
+    GridmixTestUtils.initCluster();
+  }
+
+  @AfterClass
+  public static void shutDown() throws IOException {
+    GridmixTestUtils.shutdownCluster();
+  }
+
+  /**
+   * Validate the dist cache files generated by the GenerateDistCacheData job.
+   * @param jobConf configuration of the GenerateDistCacheData job
+   * @param sortedFileSizes array of sorted distributed cache file sizes
+   * @throws IOException
+   * @throws FileNotFoundException
+   */
+  private void validateDistCacheData(JobConf jobConf, long[] sortedFileSizes)
+      throws FileNotFoundException, IOException {
+    Path distCachePath = dce.getDistributedCacheDir();
+    String filesListFile =
+        jobConf.get(GenerateDistCacheData.GRIDMIX_DISTCACHE_FILE_LIST);
+    FileSystem fs = FileSystem.get(jobConf);
+
+    // Validate that the Distributed Cache files list file exists directly
+    // under the distributed cache directory.
+    Path listFile = new Path(filesListFile);
+    assertTrue("Path of Distributed Cache files list file is wrong.",
+        distCachePath.equals(listFile.getParent().makeQualified(fs)));
+
+    // Delete the dist cache files list file
+    assertTrue("Failed to delete distributed Cache files list file " + listFile,
+               fs.delete(listFile));
+
+    List<Long> fileSizes = new ArrayList<Long>();
+    for (long size : sortedFileSizes) {
+      fileSizes.add(size);
+    }
+    // validate dist cache files after deleting the 'files list file'
+    validateDistCacheFiles(fileSizes, distCachePath);
+  }
+
+  /**
+   * Validate private/public distributed cache files.
+   * @param filesSizesExpected list of sizes of expected dist cache files
+   * @param distCacheDir the distributed cache dir to be validated
+   * @throws IOException 
+   * @throws FileNotFoundException 
+   */
+  private void validateDistCacheFiles(List<Long> filesSizesExpected,
+      Path distCacheDir) throws FileNotFoundException, IOException {
+    RemoteIterator<LocatedFileStatus> iter =
+        GridmixTestUtils.dfs.listFiles(distCacheDir, false);
+    int numFiles = filesSizesExpected.size();
+    for (int i = 0; i < numFiles; i++) {
+      assertTrue("Missing distributed cache files.", iter.hasNext());
+      LocatedFileStatus stat = iter.next();
+      assertTrue("File size of distributed cache file "
+          + stat.getPath().toUri().getPath() + " is wrong.",
+          filesSizesExpected.remove(stat.getLen()));
+
+      FsPermission perm = stat.getPermission();
+      assertEquals("Wrong permissions for distributed cache file "
+          + stat.getPath().toUri().getPath(),
+          new FsPermission((short)0644), perm);
+    }
+    assertFalse("Number of files under distributed cache dir is wrong.",
+        iter.hasNext());
+  }
+
+  /**
+   * Configures 5 HDFS-based dist cache files and 1 local-FS-based dist cache
+   * file in the given Configuration object <code>conf</code>.
+   * @param conf configuration where dist cache config properties are to be set
+   * @param useOldProperties <code>true</code> if old config properties are to
+   *                         be set
+   * @return array of sorted HDFS-based distributed cache file sizes
+   * @throws IOException
+   */
+  private long[] configureDummyDistCacheFiles(Configuration conf,
+      boolean useOldProperties) throws IOException {
+    String user = UserGroupInformation.getCurrentUser().getShortUserName();
+    conf.set(MRJobConfig.USER_NAME, user);
+    // Set some dummy dist cache files in gridmix configuration so that they go
+    // into the configuration of JobStory objects.
+    String[] distCacheFiles = {"hdfs:///tmp/file1.txt",
+                               "/tmp/" + user + "/.staging/job_1/file2.txt",
+                               "hdfs:///user/user1/file3.txt",
+                               "/home/user2/file4.txt",
+                               "subdir1/file5.txt",
+                               "subdir2/file6.gz"};
+    String[] fileSizes = {"400", "2500", "700", "1200", "1500", "500"};
+
+    String[] visibilities = {"true", "false", "false", "true", "true", "false"};
+    String[] timeStamps = {"1234", "2345", "34567", "5434", "125", "134"};
+    if (useOldProperties) {
+      conf.setStrings("mapred.cache.files", distCacheFiles);
+      conf.setStrings("mapred.cache.files.filesizes", fileSizes);
+      conf.setStrings("mapred.cache.files.visibilities", visibilities);
+      conf.setStrings("mapred.cache.files.timestamps", timeStamps);
+    } else {
+      conf.setStrings(MRJobConfig.CACHE_FILES, distCacheFiles);
+      conf.setStrings(MRJobConfig.CACHE_FILES_SIZES, fileSizes);
+      conf.setStrings(MRJobConfig.CACHE_FILE_VISIBILITIES, visibilities);
+      conf.setStrings(MRJobConfig.CACHE_FILE_TIMESTAMPS, timeStamps);
+    }
+    // The local-FS-based dist cache file whose path contains <user>/.staging
+    // is not created on HDFS, so its size (2500) is not added to
+    // sortedFileSizes. The remaining HDFS-based file sizes are kept in
+    // descending order, matching the order in which setupGenerateDistCacheData
+    // writes them to the sequence file.
+    long[] sortedFileSizes = new long[] {1500, 1200, 700, 500, 400};
+    return sortedFileSizes;
+  }
+
+  /**
+   * Runs setupGenerateDistCacheData() on a new DistributedCacheEmulator and
+   * returns the jobConf. Fills the array <code>sortedFileSizes</code> that
+   * can be used for validation, and validates the exit code returned by
+   * setupGenerateDistCacheData().
+   * @param generate true if -generate option is specified
+   * @param sortedFileSizes sorted HDFS-based distributed cache file sizes
+   * @throws IOException
+   * @throws InterruptedException
+   */
+  private JobConf runSetupGenerateDistCacheData(boolean generate,
+      long[] sortedFileSizes) throws IOException, InterruptedException {
+    Configuration conf = new Configuration();
+    long[] fileSizes = configureDummyDistCacheFiles(conf, false);
+    System.arraycopy(fileSizes, 0, sortedFileSizes, 0, fileSizes.length);
+
+    // Job stories of all 3 jobs will have the same dist cache files in their
+    // configurations.
+    final int numJobs = 3;
+    DebugJobProducer jobProducer = new DebugJobProducer(numJobs, conf);
+
+    JobConf jobConf =
+        GridmixTestUtils.mrCluster.createJobConf(new JobConf(conf));
+    Path ioPath = new Path("testSetupGenerateDistCacheData")
+                    .makeQualified(GridmixTestUtils.dfs);
+    FileSystem fs = FileSystem.get(jobConf);
+    if (fs.exists(ioPath)) {
+      fs.delete(ioPath, true);
+    }
+    FileSystem.mkdirs(fs, ioPath, new FsPermission((short)0777));
+
+    dce = createDistributedCacheEmulator(jobConf, ioPath, generate);
+    int exitCode = dce.setupGenerateDistCacheData(jobProducer);
+    int expectedExitCode = generate ? 0 : dce.MISSING_DIST_CACHE_FILES_ERROR;
+    assertEquals("setupGenerateDistCacheData failed.",
+                 expectedExitCode, exitCode);
+
+    // reset the dist cache config properties
+    resetDistCacheConfigProperties(jobConf);
+    return jobConf;
+  }
+
+  /**
+   * Reset the config properties related to Distributed Cache in the given
+   * job configuration <code>jobConf</code>.
+   * @param jobConf job configuration
+   */
+  private void resetDistCacheConfigProperties(JobConf jobConf) {
+    // reset current/latest property names
+    jobConf.setStrings(MRJobConfig.CACHE_FILES, "");
+    jobConf.setStrings(MRJobConfig.CACHE_FILES_SIZES, "");
+    jobConf.setStrings(MRJobConfig.CACHE_FILE_TIMESTAMPS, "");
+    jobConf.setStrings(MRJobConfig.CACHE_FILE_VISIBILITIES, "");
+    // reset old property names
+    jobConf.setStrings("mapred.cache.files", "");
+    jobConf.setStrings("mapred.cache.files.filesizes", "");
+    jobConf.setStrings("mapred.cache.files.visibilities", "");
+    jobConf.setStrings("mapred.cache.files.timestamps", "");
+  }
+
+  /**
+   * Validate that the GenerateDistCacheData job creates dist cache files
+   * properly.
+   * @throws Exception
+   */
+  @Test
+  public void testGenerateDistCacheData() throws Exception {
+    long[] sortedFileSizes = new long[5];
+    JobConf jobConf =
+        runSetupGenerateDistCacheData(true, sortedFileSizes);
+    GridmixJob gridmixJob = new GenerateDistCacheData(jobConf);
+    Job job = gridmixJob.call();
+    assertEquals("Number of reduce tasks in GenerateDistCacheData is not 0.",
+        0, job.getNumReduceTasks());
+    assertTrue("GenerateDistCacheData job failed.",
+        job.waitForCompletion(false));
+    validateDistCacheData(jobConf, sortedFileSizes);
+  }
+
+  /**
+   * Validate setupGenerateDistCacheData by validating
+   * <ul>
+   * <li> permissions of the distributed cache directories and</li>
+   * <li> content of the generated sequence file. This includes validation of
+   *      dist cache file paths and their file sizes.</li>
+   * </ul>
+   */
+  private void validateSetupGenDC(JobConf jobConf, long[] sortedFileSizes)
+      throws IOException, InterruptedException {
+    // build things needed for validation
+    long sumOfFileSizes = 0;
+    for (int i = 0; i < sortedFileSizes.length; i++) {
+      sumOfFileSizes += sortedFileSizes[i];
+    }
+
+    FileSystem fs = FileSystem.get(jobConf);
+    assertEquals("Number of distributed cache files to be generated is wrong.",
+        sortedFileSizes.length,
+        jobConf.getInt(GenerateDistCacheData.GRIDMIX_DISTCACHE_FILE_COUNT, -1));
+    assertEquals("Total size of dist cache files to be generated is wrong.",
+        sumOfFileSizes, jobConf.getLong(
+        GenerateDistCacheData.GRIDMIX_DISTCACHE_BYTE_COUNT, -1));
+    Path filesListFile = new Path(jobConf.get(
+        GenerateDistCacheData.GRIDMIX_DISTCACHE_FILE_LIST));
+    FileStatus stat = fs.getFileStatus(filesListFile);
+    assertEquals("Wrong permissions of dist Cache files list file "
+        + filesListFile, new FsPermission((short)0644), stat.getPermission());
+
+    InputSplit split =
+        new FileSplit(filesListFile, 0, stat.getLen(), (String[])null);
+    TaskAttemptContext taskContext =
+        MapReduceTestUtil.createDummyMapTaskAttemptContext(jobConf);
+    RecordReader<LongWritable, BytesWritable> reader =
+      new GenerateDistCacheData.GenDCDataFormat().createRecordReader(
+      split, taskContext);
+    MapContext<LongWritable, BytesWritable, NullWritable, BytesWritable>
+        mapContext = new MapContextImpl<LongWritable, BytesWritable,
+        NullWritable, BytesWritable>(jobConf, taskContext.getTaskAttemptID(),
+        reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
+    reader.initialize(split, mapContext);
+
+    // start validating setupGenerateDistCacheData
+    doValidateSetupGenDC(reader, fs, sortedFileSizes);
+  }
+
+  /**
+   * Validate setupGenerateDistCacheData by validating
+   * <ul>
+   * <li> permissions of the distributed cache directory and</li>
+   * <li> content of the generated sequence file. This includes validation of
+   *      dist cache file paths and their file sizes.</li>
+   * </ul>
+   */
+  private void doValidateSetupGenDC(RecordReader<LongWritable, BytesWritable>
+      reader, FileSystem fs, long[] sortedFileSizes)
+      throws IOException, InterruptedException {
+
+    // Validate permissions of dist cache directory
+    Path distCacheDir = dce.getDistributedCacheDir();
+    assertEquals("Wrong permissions for distributed cache dir " + distCacheDir,
+        fs.getFileStatus(distCacheDir).getPermission()
+        .getOtherAction().and(FsAction.EXECUTE), FsAction.EXECUTE);
+
+    // Validate the content of the sequence file generated by
+    // dce.setupGenerateDistCacheData().
+    LongWritable key = new LongWritable();
+    BytesWritable val = new BytesWritable();
+    for (int i = 0; i < sortedFileSizes.length; i++) {
+      assertTrue("Number of files written to the sequence file by "
+          + "setupGenerateDistCacheData is less than the expected.",
+          reader.nextKeyValue());
+      key = reader.getCurrentKey();
+      val = reader.getCurrentValue();
+      long fileSize = key.get();
+      String file = new String(val.getBytes(), 0, val.getLength());
+
+      // Dist Cache files should be sorted based on file size.
+      assertEquals("Dist cache file size is wrong.",
+          sortedFileSizes[i], fileSize);
+
+      // Validate the dist cache file path: the parent directory of each
+      // public dist cache file should be the distributed cache directory.
+      Path parent = new Path(file).getParent().makeQualified(fs);
+      assertTrue("Public dist cache file path is wrong.",
+          distCacheDir.equals(parent));
+    }
+  }
+
+  /**
+   *  Test if DistributedCacheEmulator's setup of GenerateDistCacheData is
+   *  working as expected.
+   * @throws IOException
+   * @throws InterruptedException
+   */
+  @Test
+  public void testSetupGenerateDistCacheData()
+      throws IOException, InterruptedException {
+    long[] sortedFileSizes = new long[5];
+    JobConf jobConf = runSetupGenerateDistCacheData(true, sortedFileSizes);
+    validateSetupGenDC(jobConf, sortedFileSizes);
+
+    // Verify that the correct exit code is returned when the -generate option
+    // is missing and the distributed cache files do not exist in the expected
+    // path.
+    runSetupGenerateDistCacheData(false, sortedFileSizes);
+  }
+
+  /**
+   * Create a DistributedCacheEmulator object and initialize it by calling
+   * init() on it with a dummy trace. Also configure the pseudo-local FS.
+   */
+  private DistributedCacheEmulator createDistributedCacheEmulator(
+      Configuration conf, Path ioPath, boolean generate) throws IOException {
+    DistributedCacheEmulator dce =
+        new DistributedCacheEmulator(conf, ioPath);
+    JobCreator jobCreator = JobCreator.getPolicy(conf, JobCreator.LOADJOB);
+    jobCreator.setDistCacheEmulator(dce);
+    dce.init("dummytrace", jobCreator, generate);
+    return dce;
+  }
+
+  /**
+   *  Test the configuration property for disabling/enabling emulation of
+   *  distributed cache load.
+   */
+  @Test
+  public void testDistCacheEmulationConfigurability() throws IOException {
+    Configuration conf = new Configuration();
+    JobConf jobConf = GridmixTestUtils.mrCluster.createJobConf(
+        new JobConf(conf));
+    Path ioPath = new Path("testDistCacheEmulationConfigurability")
+        .makeQualified(GridmixTestUtils.dfs);
+    FileSystem fs = FileSystem.get(jobConf);
+    FileSystem.mkdirs(fs, ioPath, new FsPermission((short)0777));
+
+    // default config
+    dce = createDistributedCacheEmulator(jobConf, ioPath, false);
+    assertTrue("Default configuration of "
+        + DistributedCacheEmulator.GRIDMIX_EMULATE_DISTRIBUTEDCACHE
+        + " is wrong.", dce.shouldEmulateDistCacheLoad());
+
+    // config property set to false
+    jobConf.setBoolean(
+        DistributedCacheEmulator.GRIDMIX_EMULATE_DISTRIBUTEDCACHE, false);
+    dce = createDistributedCacheEmulator(jobConf, ioPath, false);
+    assertFalse("Disabling of emulation of distributed cache load by setting "
+        + DistributedCacheEmulator.GRIDMIX_EMULATE_DISTRIBUTEDCACHE
+        + " to false is not working.", dce.shouldEmulateDistCacheLoad());
+  }
+
+  /**
+   * Verify that DistributedCacheEmulator can configure distributed cache files
+   * for a simulated job when the job conf from the trace has no dist cache
+   * files.
+   * @param conf configuration for the simulated job to be run
+   * @param jobConf job configuration of original cluster's job, obtained from
+   *                trace
+   * @throws IOException
+   */
+  private void validateJobConfWithOutDCFiles(Configuration conf,
+      JobConf jobConf) throws IOException {
+    // Validate if Gridmix can configure dist cache files properly if there are
+    // no HDFS-based dist cache files and localFS-based dist cache files in
+    // trace for a job.
+    dce.configureDistCacheFiles(conf, jobConf);
+    assertNull("Distributed cache files configured by GridMix is wrong.",
+               conf.get(MRJobConfig.CACHE_FILES));
+    assertNull("Distributed cache files configured by Gridmix through -files "
+               + "option is wrong.", conf.get("tmpfiles"));
+  }
+
+  /**
+   * Verify that DistributedCacheEmulator can configure distributed cache files
+   * for a simulated job when the job conf from the trace has HDFS-based and
+   * local-FS-based dist cache files.
+   * <br>Also validate if Gridmix can handle/read deprecated config properties
+   * like mapred.cache.files.filesizes and mapred.cache.files.visibilities from
+   * trace file.
+   * @param conf configuration for the simulated job to be run
+   * @param jobConf job configuration of original cluster's job, obtained from
+   *                trace
+   * @throws IOException
+   */
+  private void validateJobConfWithDCFiles(Configuration conf,
+      JobConf jobConf) throws IOException {
+    long[] sortedFileSizes = configureDummyDistCacheFiles(jobConf, true);
+
+    // Validate if Gridmix can handle deprecated config properties like
+    // mapred.cache.files.filesizes and mapred.cache.files.visibilities.
+    // 1 local FS based dist cache file and 5 HDFS based dist cache files. So
+    // total expected dist cache files count is 6.
+    assertEquals("Gridmix is not able to extract dist cache file sizes.",
+                 6, jobConf.getStrings(MRJobConfig.CACHE_FILES_SIZES).length);
+    assertEquals("Gridmix is not able to extract dist cache file visibilities.",
+                 6, jobConf.getStrings(MRJobConfig.CACHE_FILE_VISIBILITIES).length);
+
+    dce.configureDistCacheFiles(conf, jobConf);
+
+    assertEquals("Configuring of HDFS-based dist cache files by gridmix is "
+                 + "wrong.", sortedFileSizes.length,
+                 conf.getStrings(MRJobConfig.CACHE_FILES).length);
+    assertEquals("Configuring of local-FS-based dist cache files by gridmix is "
+                 + "wrong.", 1, conf.getStrings("tmpfiles").length);
+  }
+
+  /**
+   * Verify that configureDistCacheFiles() works fine when distributed cache
+   * files are set but visibilities are not set. This handles history traces
+   * of older hadoop versions, which have no notion of private/public
+   * Distributed Caches.
+   * @throws IOException
+   */
+  private void validateWithOutVisibilities() throws IOException {
+    // configuration for the simulated job
+    Configuration conf = new Configuration();
+    JobConf jobConf = new JobConf();
+    String user = "user1";
+    jobConf.setUser(user);
+    String[] files = {"/tmp/hdfs1.txt", "/tmp/"+ user + "/.staging/file1"};
+    jobConf.setStrings(MRJobConfig.CACHE_FILES, files);
+    jobConf.setStrings(MRJobConfig.CACHE_FILES_SIZES, "12,200");
+    jobConf.setStrings(MRJobConfig.CACHE_FILE_TIMESTAMPS, "56789,98345");
+    dce.configureDistCacheFiles(conf, jobConf);
+    assertEquals("Configuring of HDFS-based dist cache files by gridmix is "
+                 + "wrong.", files.length,
+                 conf.getStrings(MRJobConfig.CACHE_FILES).length);
+    assertNull("Configuring of local-FS-based dist cache files by gridmix is "
+               + "wrong.", conf.get("tmpfiles"));
+  }
+
+  /**
+   * Test if Gridmix can configure config properties related to Distributed
+   * Cache properly. Also verify if Gridmix can handle deprecated config
+   * properties related to Distributed Cache.
+   * @throws IOException
+   */
+  @Test
+  public void testDistCacheFilesConfiguration() throws IOException {
+    Configuration conf = new Configuration();
+    JobConf jobConf = GridmixTestUtils.mrCluster.createJobConf(
+                        new JobConf(conf));
+    Path ioPath = new Path("testDistCacheEmulationConfigurability")
+                    .makeQualified(GridmixTestUtils.dfs);
+    FileSystem fs = FileSystem.get(jobConf);
+    FileSystem.mkdirs(fs, ioPath, new FsPermission((short)0777));
+
+    // default config
+    dce = createDistributedCacheEmulator(jobConf, ioPath, false);
+    assertTrue("Default configuration of "
+               + DistributedCacheEmulator.GRIDMIX_EMULATE_DISTRIBUTEDCACHE
+               + " is wrong.", dce.shouldEmulateDistCacheLoad());
+
+    // Validate that DistributedCacheEmulator can properly handle a JobStory
+    // without Distributed Cache files.
+    validateJobConfWithOutDCFiles(conf, jobConf);
+
+    // Validate if Gridmix can configure dist cache files properly if there are
+    // HDFS-based dist cache files and localFS-based dist cache files in trace
+    // for a job. Set old config properties and validate.
+    validateJobConfWithDCFiles(conf, jobConf);
+    
+    // Use new JobConf as JobStory conf and check if configureDistCacheFiles()
+    // doesn't throw NPE when there are dist cache files set but visibilities
+    // are not set.
+    validateWithOutVisibilities();
+  }
+}

Modified: hadoop/mapreduce/trunk/src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestGridmixSubmission.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestGridmixSubmission.java?rev=1126499&r1=1126498&r2=1126499&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestGridmixSubmission.java (original)
+++ hadoop/mapreduce/trunk/src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestGridmixSubmission.java Mon May 23 14:22:36 2011
@@ -111,7 +111,7 @@ public class TestGridmixSubmission {
         GridmixTestUtils.mrCluster.createJobConf());
       for (Job job : succeeded) {
         final String jobname = job.getJobName();
-        if ("GRIDMIX_GENDATA".equals(jobname)) {
+        if (GenerateData.JOB_NAME.equals(jobname)) {
           if (!job.getConfiguration().getBoolean(
             GridmixJob.GRIDMIX_USE_QUEUE_IN_TRACE, true)) {
             assertEquals(" Improper queue for " + job.getJobName(),

Added: hadoop/mapreduce/trunk/src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestPseudoLocalFs.java
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestPseudoLocalFs.java?rev=1126499&view=auto
==============================================================================
--- hadoop/mapreduce/trunk/src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestPseudoLocalFs.java (added)
+++ hadoop/mapreduce/trunk/src/contrib/gridmix/src/test/org/apache/hadoop/mapred/gridmix/TestPseudoLocalFs.java Mon May 23 14:22:36 2011
@@ -0,0 +1,233 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.mapred.gridmix;
+
+import static org.junit.Assert.*;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.junit.Test;
+
+/**
+ * Test the basic functionality of PseudoLocalFs
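+ * A file path on PseudoLocalFs encodes the file's size as a numeric suffix
+ * after the final '.' in its name (e.g. a name ending in <code>.1234</code>
+ * denotes a 1234-byte file); the tests below exercise this convention.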
+ */
+public class TestPseudoLocalFs {
+
+  /**
+   * Test if a file on PseudoLocalFs of a specific size can be opened and read.
+   * Validate the size of the data read.
+   * Test the read methods of {@link PseudoLocalFs.RandomInputStream}.
+   * @throws Exception
+   */
+  @Test
+  public void testPseudoLocalFsFileSize() throws Exception {
+    long fileSize = 10000;
+    Path path = PseudoLocalFs.generateFilePath("myPseudoFile", fileSize);
+    PseudoLocalFs pfs = new PseudoLocalFs();
+    pfs.create(path);
+
+    // Read 1 byte at a time and validate file size.
+    InputStream in = pfs.open(path, 0);
+    long totalSize = 0;
+
+    while (in.read() >= 0) {
+      ++totalSize;
+    }
+    in.close();
+    assertEquals("File size mismatch with read().", fileSize, totalSize);
+
+    // Read data from PseudoLocalFs-based file into buffer to
+    // validate read(byte[]) and file size.
+    in = pfs.open(path, 0);
+    totalSize = 0;
+    byte[] b = new byte[1024];
+    int bytesRead = in.read(b);
+    while (bytesRead >= 0) {
+      totalSize += bytesRead;
+      bytesRead = in.read(b);
+    }
+    in.close();
+    assertEquals("File size mismatch with read(byte[]).", fileSize, totalSize);
+  }
+
+  /**
+   * Validate if file status is obtained for correctly formed file paths on
+   * PseudoLocalFs and also verify if appropriate exception is thrown for
+   * invalid file paths.
+   * @param pfs Pseudo Local File System
+   * @param path file path for which getFileStatus() is to be called
+   * @param shouldSucceed <code>true</code> if getFileStatus() should succeed
+   * @throws IOException
+   */
+  private void validateGetFileStatus(FileSystem pfs, Path path,
+      boolean shouldSucceed) throws IOException {
+    boolean expectedExceptionSeen = false;
+    FileStatus stat = null;
+    try {
+      stat = pfs.getFileStatus(path);
+    } catch(FileNotFoundException e) {
+      expectedExceptionSeen = true;
+    }
+    if (shouldSucceed) {
+      assertFalse("getFileStatus() has thrown Exception for valid file name "
+                  + path, expectedExceptionSeen);
+      assertNotNull("Missing file status for a valid file.", stat);
+
+      // validate fileSize
+      String[] parts = path.toUri().getPath().split("\\.");
+      long expectedFileSize = Long.valueOf(parts[parts.length - 1]);
+      assertEquals("Invalid file size.", expectedFileSize, stat.getLen());
+    } else {
+      assertTrue("getFileStatus() did not throw Exception for invalid file "
+                 + " name " + path, expectedExceptionSeen);
+    }
+  }
+
+  /**
+   * Validate if file creation succeeds for correctly formed file paths on
+   * PseudoLocalFs and also verify if appropriate exception is thrown for
+   * invalid file paths.
+   * @param pfs Pseudo Local File System
+   * @param path file path for which create() is to be called
+   * @param shouldSucceed <code>true</code> if create() should succeed
+   * @throws IOException
+   */
+  private void validateCreate(FileSystem pfs, Path path,
+      boolean shouldSucceed) throws IOException {
+    boolean expectedExceptionSeen = false;
+    try {
+      pfs.create(path);
+    } catch(IOException e) {
+      expectedExceptionSeen = true;
+    }
+    if (shouldSucceed) {
+      assertFalse("create() has thrown Exception for valid file name "
+                  + path, expectedExceptionSeen);
+    } else {
+      assertTrue("create() did not throw Exception for invalid file name "
+                 + path, expectedExceptionSeen);
+    }
+  }
+
+  /**
+   * Validate if opening of file succeeds for correctly formed file paths on
+   * PseudoLocalFs and also verify if appropriate exception is thrown for
+   * invalid file paths.
+   * @param pfs Pseudo Local File System
+   * @param path file path for which open() is to be called
+   * @param shouldSucceed <code>true</code> if open() should succeed
+   * @throws IOException
+   */
+  private void validateOpen(FileSystem pfs, Path path,
+      boolean shouldSucceed) throws IOException {
+    boolean expectedExceptionSeen = false;
+    try {
+      pfs.open(path);
+    } catch(IOException e) {
+      expectedExceptionSeen = true;
+    }
+    if (shouldSucceed) {
+      assertFalse("open() has thrown Exception for valid file name "
+                  + path, expectedExceptionSeen);
+    } else {
+      assertTrue("open() did not throw Exception for invalid file name "
+                 + path, expectedExceptionSeen);
+    }
+  }
+
+  /**
+   * Validate if exists() returns <code>true</code> for correctly formed file
+   * paths on PseudoLocalFs and returns <code>false</code> for improperly
+   * formed file paths.
+   * @param pfs Pseudo Local File System
+   * @param path file path for which exists() is to be called
+   * @param shouldSucceed expected return value of exists(&lt;path&gt;)
+   * @throws IOException
+   */
+  private void validateExists(FileSystem pfs, Path path,
+      boolean shouldSucceed) throws IOException {
+    boolean ret = pfs.exists(path);
+    if (shouldSucceed) {
+      assertTrue("exists() returned false for valid file name " + path, ret);
+    } else {
+      assertFalse("exists() returned true for invalid file name " + path, ret);
+    }
+  }
+
+  /**
+   * Test Pseudo Local File System methods like getFileStatus(), create(),
+   * open() and exists() for both valid and invalid file paths.
+   * @throws IOException
+   */
+  @Test
+  public void testPseudoLocalFsFileNames() throws IOException {
+    PseudoLocalFs pfs = new PseudoLocalFs();
+    Configuration conf = new Configuration();
+    conf.setClass("fs.pseudo.impl", PseudoLocalFs.class, FileSystem.class);
+
+    Path path = new Path("pseudo:///myPsedoFile.1234");
+    FileSystem testFs = path.getFileSystem(conf);
+    assertEquals("Failed to obtain a pseudo local file system object from path",
+                 pfs.getUri().getScheme(), testFs.getUri().getScheme());
+
+    // Validate PseudoLocalFs operations on a URI of some other file system
+    path = new Path("file:///myPseudoFile.12345");
+    validateGetFileStatus(pfs, path, false);
+    validateCreate(pfs, path, false);
+    validateOpen(pfs, path, false);
+    validateExists(pfs, path, false);
+
+    path = new Path("pseudo:///myPsedoFile");//.<fileSize> missing
+    validateGetFileStatus(pfs, path, false);
+    validateCreate(pfs, path, false);
+    validateOpen(pfs, path, false);
+    validateExists(pfs, path, false);
+
+    // The suffix after the final '.' is not a number.
+    path = new Path("pseudo:///myPseudoFile.txt");
+    validateGetFileStatus(pfs, path, false);
+    validateCreate(pfs, path, false);
+    validateOpen(pfs, path, false);
+    validateExists(pfs, path, false);
+
+    // Generate a valid file name (relative path) and validate operations on it
+    long fileSize = 231456;
+    path = PseudoLocalFs.generateFilePath("my.Pseudo.File", fileSize);
+    // Validate the above generateFilePath()
+    assertEquals("generateFilePath() failed.", fileSize,
+                 pfs.validateFileNameFormat(path));
+
+    validateGetFileStatus(pfs, path, true);
+    validateCreate(pfs, path, true);
+    validateOpen(pfs, path, true);
+    validateExists(pfs, path, true);
+
+    // Validate operations on a valid, fully qualified path
+    path = new Path("myPseudoFile.1237");
+    path = path.makeQualified(pfs);
+    validateGetFileStatus(pfs, path, true);
+    validateCreate(pfs, path, true);
+    validateOpen(pfs, path, true);
+    validateExists(pfs, path, true);
+  }
+}

Modified: hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/gridmix.xml
URL: http://svn.apache.org/viewvc/hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/gridmix.xml?rev=1126499&r1=1126498&r2=1126499&view=diff
==============================================================================
--- hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/gridmix.xml (original)
+++ hadoop/mapreduce/trunk/src/docs/src/documentation/content/xdocs/gridmix.xml Mon May 23 14:22:36 2011
@@ -78,17 +78,25 @@ org.apache.hadoop.mapred.gridmix.Gridmix
 	<code>-Dgridmix.output.directory=foo</code> as given above should
 	be used <em>before</em> other GridMix parameters.
       </note>
-      <p>The <code>-generate</code> option is used to generate input data
-      for the synthetic jobs. It accepts size suffixes, e.g.
-      <code>100g</code> will generate 100 * 2<sup>30</sup> bytes.</p>
-      <p>The <code>-users</code> option is used to point to a users-list
-      file (see <a href="#usersqueues">Emulating Users and Queues</a>).</p>
-      <p>The <code>&lt;iopath&gt;</code> parameter is the destination
-      directory for generated output and/or the directory from which input
-      data will be read. Note that this can either be on the local file-system
+      <p>The <code>&lt;iopath&gt;</code> parameter is the working directory for
+      GridMix. Note that this can either be on the local file-system
       or on HDFS, but it is highly recommended that it be the same as that for
       the original job mix so that GridMix puts the same load on the local
       file-system and HDFS respectively.</p>
+      <p>The <code>-generate</code> option is used to generate input data and
+      Distributed Cache files for the synthetic jobs. It accepts standard size
+      suffixes, e.g. <code>100g</code> will generate
+      100 * 2<sup>30</sup> bytes as input data.
+      <code>&lt;iopath&gt;/input</code> is the destination directory for
+      generated input data and/or the directory from which input data will be
+      read. HDFS-based Distributed Cache files are generated under the
+      distributed cache directory <code>&lt;iopath&gt;/distributedCache</code>.
+      If some of the needed Distributed Cache files already exist in the
+      distributed cache directory, then only the missing ones are generated
+      when the <code>-generate</code> option is specified.</p>
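+      <p>For example, a run that generates 100 * 2<sup>30</sup> bytes of input
+      data along with the needed Distributed Cache files might look as follows
+      (the iopath and trace names here are illustrative placeholders):</p>
+<source>
+hadoop jar &lt;gridmix-jar&gt; org.apache.hadoop.mapred.gridmix.Gridmix \
+  -generate 100g /gridmix/io trace.json.gz
+</source>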
+      <p>The <code>-users</code> option is used to point to a users-list
+      file (see <a href="#usersqueues">Emulating Users and Queues</a>).</p>
       <p>The <code>&lt;trace&gt;</code> parameter is a path to a job trace
       generated by Rumen. This trace can be compressed (it must be readable
       using one of the compression codecs supported by the cluster) or
@@ -508,6 +516,30 @@ hadoop jar &lt;gridmix-jar&gt; org.apach
       The groupnames will be ignored by Gridmix.
       </p>
     </section>
+
+  <section id="distributedcacheload">
+  <title>Emulation of Distributed Cache Load</title>
+    <p>Gridmix emulates Distributed Cache load by default for jobs of type
+    LOADJOB. This is done by precreating the needed Distributed Cache files for
+    all the simulated jobs as part of a separate MapReduce job.</p>
+    <p>Emulation of Distributed Cache load in gridmix simulated jobs can be
+    disabled by setting the property
+    <code>gridmix.distributed-cache-emulation.enable</code> to
+    <code>false</code>.
+    Note that generation of Distributed Cache data by gridmix is driven by the
+    <code>-generate</code> option and is independent of this configuration
+    property.</p>
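+    <p>As with other GridMix configuration properties, this can be set on the
+    command line with <code>-D</code> <em>before</em> other GridMix parameters;
+    a sketch (the iopath and trace here are placeholders):</p>
+<source>
+hadoop jar &lt;gridmix-jar&gt; org.apache.hadoop.mapred.gridmix.Gridmix \
+  -Dgridmix.distributed-cache-emulation.enable=false \
+  &lt;iopath&gt; &lt;trace&gt;
+</source>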
+    <p>Both generation of Distributed Cache files and emulation of
+    Distributed Cache load are disabled if:</p>
+    <ul>
+    <li>the input trace comes from the standard input stream instead of a
+    file, or</li>
+    <li>the <code>&lt;iopath&gt;</code> specified is on the local file-system,
+    or</li>
+    <li>any ancestor directory of the distributed cache directory
+    <code>&lt;iopath&gt;/distributedCache</code> (including the distributed
+    cache directory itself) lacks execute permission for others (see the
+    example after this list).</li>
+    </ul>
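+    <p>For example, execute permission for others can be granted on the
+    distributed cache directory and its ancestors with a command along these
+    lines (the <code>/gridmix/io</code> path is an illustrative
+    placeholder):</p>
+<source>
+hadoop fs -chmod o+x /gridmix /gridmix/io /gridmix/io/distributedCache
+</source>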
+  </section>
+
     <section id="assumptions">
       <title>Simplifying Assumptions</title>
       <p>GridMix will be developed in stages, incorporating feedback and