Posted to mapreduce-commits@hadoop.apache.org by su...@apache.org on 2012/10/22 22:43:30 UTC

svn commit: r1401071 [4/7] - in /hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project: ./ conf/ hadoop-mapreduce-client/hadoop-mapreduce-client-app/src/main/java/org/apache/hadoop/mapred/ hadoop-mapreduce-client/hadoop-mapreduce-client-app/...

Modified: hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/WordStandardDeviation.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/WordStandardDeviation.java?rev=1401071&r1=1401070&r2=1401071&view=diff
==============================================================================
--- hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/WordStandardDeviation.java (original)
+++ hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/main/java/org/apache/hadoop/examples/WordStandardDeviation.java Mon Oct 22 20:43:16 2012
@@ -1,210 +1,210 @@
-package org.apache.hadoop.examples;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.StringTokenizer;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
-
-public class WordStandardDeviation extends Configured implements Tool {
-
-  private double stddev = 0;
-
-  private final static Text LENGTH = new Text("length");
-  private final static Text SQUARE = new Text("square");
-  private final static Text COUNT = new Text("count");
-  private final static LongWritable ONE = new LongWritable(1);
-
-  /**
-   * Maps words from a line of text into 3 key-value pairs; one key-value pair for
-   * counting the word, one for counting its length, and one for counting the
-   * square of its length.
-   */
-  public static class WordStandardDeviationMapper extends
-      Mapper<Object, Text, Text, LongWritable> {
-
-    private LongWritable wordLen = new LongWritable();
-    private LongWritable wordLenSq = new LongWritable();
-
-    /**
-     * Emits 3 key-value pairs for counting the word, its length, and the
-     * square of its length. Outputs are (Text, LongWritable).
-     * 
-     * @param value
-     *          This will be a line of text coming in from our input file.
-     */
-    public void map(Object key, Text value, Context context)
-        throws IOException, InterruptedException {
-      StringTokenizer itr = new StringTokenizer(value.toString());
-      while (itr.hasMoreTokens()) {
-        String string = itr.nextToken();
-
-        this.wordLen.set(string.length());
-
-        // the square of an integer is an integer...
-        this.wordLenSq.set((long) Math.pow(string.length(), 2.0));
-
-        context.write(LENGTH, this.wordLen);
-        context.write(SQUARE, this.wordLenSq);
-        context.write(COUNT, ONE);
-      }
-    }
-  }
-
-  /**
-   * Performs integer summation of all the values for each key.
-   */
-  public static class WordStandardDeviationReducer extends
-      Reducer<Text, LongWritable, Text, LongWritable> {
-
-    private LongWritable val = new LongWritable();
-
-    /**
-     * Sums all the individual values within the iterator and writes them to the
-     * same key.
-     * 
-     * @param key
-     *          This will be one of three constants: LENGTH, COUNT, or
-     *          SQUARE.
-     * @param values
-     *          This will be an iterator of all the values associated with that
-     *          key.
-     */
-    public void reduce(Text key, Iterable<LongWritable> values, Context context)
-        throws IOException, InterruptedException {
-
-      int sum = 0;
-      for (LongWritable value : values) {
-        sum += value.get();
-      }
-      val.set(sum);
-      context.write(key, val);
-    }
-  }
-
-  /**
-   * Reads the output file and parses the summation of lengths, the word count,
-   * and the lengths squared, to perform a quick calculation of the standard
-   * deviation.
-   * 
-   * @param path
-   *          The path to find the output file in. Set in main to the output
-   *          directory.
-   * @throws IOException
-   *           If the output directory cannot be accessed.
-   */
-  private double readAndCalcStdDev(Path path, Configuration conf)
-      throws IOException {
-    FileSystem fs = FileSystem.get(conf);
-    Path file = new Path(path, "part-r-00000");
-
-    if (!fs.exists(file))
-      throw new IOException("Output not found!");
-
-    double stddev = 0;
-    BufferedReader br = null;
-    try {
-      br = new BufferedReader(new InputStreamReader(fs.open(file)));
-      long count = 0;
-      long length = 0;
-      long square = 0;
-      String line;
-      while ((line = br.readLine()) != null) {
-        StringTokenizer st = new StringTokenizer(line);
-
-        // grab type
-        String type = st.nextToken();
-
-        // differentiate
-        if (type.equals(COUNT.toString())) {
-          String countLit = st.nextToken();
-          count = Long.parseLong(countLit);
-        } else if (type.equals(LENGTH.toString())) {
-          String lengthLit = st.nextToken();
-          length = Long.parseLong(lengthLit);
-        } else if (type.equals(SQUARE.toString())) {
-          String squareLit = st.nextToken();
-          square = Long.parseLong(squareLit);
-        }
-      }
-      // average = total sum / number of elements;
-      double mean = (((double) length) / ((double) count));
-      // standard deviation = sqrt((sum(lengths ^ 2)/count) - (mean ^ 2))
-      mean = Math.pow(mean, 2.0);
-      double term = (((double) square / ((double) count)));
-      stddev = Math.sqrt((term - mean));
-      System.out.println("The standard deviation is: " + stddev);
-    } finally {
-      br.close();
-    }
-    return stddev;
-  }
-
-  public static void main(String[] args) throws Exception {
-    ToolRunner.run(new Configuration(), new WordStandardDeviation(),
-        args);
-  }
-
-  @Override
-  public int run(String[] args) throws Exception {
-    if (args.length != 2) {
-      System.err.println("Usage: wordstddev <in> <out>");
-      return 0;
-    }
-
-    Configuration conf = getConf();
-
-    @SuppressWarnings("deprecation")
-    Job job = new Job(conf, "word stddev");
-    job.setJarByClass(WordStandardDeviation.class);
-    job.setMapperClass(WordStandardDeviationMapper.class);
-    job.setCombinerClass(WordStandardDeviationReducer.class);
-    job.setReducerClass(WordStandardDeviationReducer.class);
-    job.setOutputKeyClass(Text.class);
-    job.setOutputValueClass(LongWritable.class);
-    FileInputFormat.addInputPath(job, new Path(args[0]));
-    Path outputpath = new Path(args[1]);
-    FileOutputFormat.setOutputPath(job, outputpath);
-    boolean result = job.waitForCompletion(true);
-
-    // read output and calculate standard deviation
-    stddev = readAndCalcStdDev(outputpath, conf);
-
-    return (result ? 0 : 1);
-  }
-
-  public double getStandardDeviation() {
-    return stddev;
-  }
+package org.apache.hadoop.examples;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.StringTokenizer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+public class WordStandardDeviation extends Configured implements Tool {
+
+  private double stddev = 0;
+
+  private final static Text LENGTH = new Text("length");
+  private final static Text SQUARE = new Text("square");
+  private final static Text COUNT = new Text("count");
+  private final static LongWritable ONE = new LongWritable(1);
+
+  /**
+   * Maps words from a line of text into 3 key-value pairs; one key-value pair for
+   * counting the word, one for counting its length, and one for counting the
+   * square of its length.
+   */
+  public static class WordStandardDeviationMapper extends
+      Mapper<Object, Text, Text, LongWritable> {
+
+    private LongWritable wordLen = new LongWritable();
+    private LongWritable wordLenSq = new LongWritable();
+
+    /**
+     * Emits 3 key-value pairs for counting the word, its length, and the
+     * square of its length. Outputs are (Text, LongWritable).
+     * 
+     * @param value
+     *          This will be a line of text coming in from our input file.
+     */
+    public void map(Object key, Text value, Context context)
+        throws IOException, InterruptedException {
+      StringTokenizer itr = new StringTokenizer(value.toString());
+      while (itr.hasMoreTokens()) {
+        String string = itr.nextToken();
+
+        this.wordLen.set(string.length());
+
+        // the square of an integer is an integer...
+        this.wordLenSq.set((long) Math.pow(string.length(), 2.0));
+
+        context.write(LENGTH, this.wordLen);
+        context.write(SQUARE, this.wordLenSq);
+        context.write(COUNT, ONE);
+      }
+    }
+  }
+
+  /**
+   * Performs integer summation of all the values for each key.
+   */
+  public static class WordStandardDeviationReducer extends
+      Reducer<Text, LongWritable, Text, LongWritable> {
+
+    private LongWritable val = new LongWritable();
+
+    /**
+     * Sums all the individual values within the iterator and writes them to the
+     * same key.
+     * 
+     * @param key
+     *          This will be one of three constants: LENGTH, COUNT, or
+     *          SQUARE.
+     * @param values
+     *          This will be an iterator of all the values associated with that
+     *          key.
+     */
+    public void reduce(Text key, Iterable<LongWritable> values, Context context)
+        throws IOException, InterruptedException {
+
+      int sum = 0;
+      for (LongWritable value : values) {
+        sum += value.get();
+      }
+      val.set(sum);
+      context.write(key, val);
+    }
+  }
+
+  /**
+   * Reads the output file and parses the summation of lengths, the word count,
+   * and the lengths squared, to perform a quick calculation of the standard
+   * deviation.
+   * 
+   * @param path
+   *          The path to find the output file in. Set in main to the output
+   *          directory.
+   * @throws IOException
+   *           If the output directory cannot be accessed.
+   */
+  private double readAndCalcStdDev(Path path, Configuration conf)
+      throws IOException {
+    FileSystem fs = FileSystem.get(conf);
+    Path file = new Path(path, "part-r-00000");
+
+    if (!fs.exists(file))
+      throw new IOException("Output not found!");
+
+    double stddev = 0;
+    BufferedReader br = null;
+    try {
+      br = new BufferedReader(new InputStreamReader(fs.open(file)));
+      long count = 0;
+      long length = 0;
+      long square = 0;
+      String line;
+      while ((line = br.readLine()) != null) {
+        StringTokenizer st = new StringTokenizer(line);
+
+        // grab type
+        String type = st.nextToken();
+
+        // differentiate
+        if (type.equals(COUNT.toString())) {
+          String countLit = st.nextToken();
+          count = Long.parseLong(countLit);
+        } else if (type.equals(LENGTH.toString())) {
+          String lengthLit = st.nextToken();
+          length = Long.parseLong(lengthLit);
+        } else if (type.equals(SQUARE.toString())) {
+          String squareLit = st.nextToken();
+          square = Long.parseLong(squareLit);
+        }
+      }
+      // average = total sum / number of elements;
+      double mean = (((double) length) / ((double) count));
+      // standard deviation = sqrt((sum(lengths ^ 2)/count) - (mean ^ 2))
+      mean = Math.pow(mean, 2.0);
+      double term = (((double) square / ((double) count)));
+      stddev = Math.sqrt((term - mean));
+      System.out.println("The standard deviation is: " + stddev);
+    } finally {
+      br.close();
+    }
+    return stddev;
+  }
+
+  public static void main(String[] args) throws Exception {
+    ToolRunner.run(new Configuration(), new WordStandardDeviation(),
+        args);
+  }
+
+  @Override
+  public int run(String[] args) throws Exception {
+    if (args.length != 2) {
+      System.err.println("Usage: wordstddev <in> <out>");
+      return 0;
+    }
+
+    Configuration conf = getConf();
+
+    @SuppressWarnings("deprecation")
+    Job job = new Job(conf, "word stddev");
+    job.setJarByClass(WordStandardDeviation.class);
+    job.setMapperClass(WordStandardDeviationMapper.class);
+    job.setCombinerClass(WordStandardDeviationReducer.class);
+    job.setReducerClass(WordStandardDeviationReducer.class);
+    job.setOutputKeyClass(Text.class);
+    job.setOutputValueClass(LongWritable.class);
+    FileInputFormat.addInputPath(job, new Path(args[0]));
+    Path outputpath = new Path(args[1]);
+    FileOutputFormat.setOutputPath(job, outputpath);
+    boolean result = job.waitForCompletion(true);
+
+    // read output and calculate standard deviation
+    stddev = readAndCalcStdDev(outputpath, conf);
+
+    return (result ? 0 : 1);
+  }
+
+  public double getStandardDeviation() {
+    return stddev;
+  }
 }
\ No newline at end of file
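
For reference, the readAndCalcStdDev() method in the file above derives the population
standard deviation from the three totals the job writes out, using the identity
Var(X) = E[X^2] - (E[X])^2. A minimal standalone sketch of the same arithmetic follows;
the class name and the sample values are illustrative assumptions, not output from the
commit or from a real run.

    // Population standard deviation from aggregate counters, mirroring the diff above.
    public final class StdDevFromSums {

      static double stdDev(long count, long sumOfLengths, long sumOfSquaredLengths) {
        double mean = (double) sumOfLengths / count;                   // E[len]
        double meanOfSquares = (double) sumOfSquaredLengths / count;   // E[len^2]
        return Math.sqrt(meanOfSquares - mean * mean);                 // sqrt(variance)
      }

      public static void main(String[] args) {
        // Hypothetical words "hadoop", "map", "reduce": lengths 6, 3, 6.
        System.out.println(stdDev(3, 15, 81));  // prints ~1.4142
      }
    }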

Modified: hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/test/java/org/apache/hadoop/examples/TestWordStats.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/test/java/org/apache/hadoop/examples/TestWordStats.java?rev=1401071&r1=1401070&r2=1401071&view=diff
==============================================================================
--- hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/test/java/org/apache/hadoop/examples/TestWordStats.java (original)
+++ hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/hadoop-mapreduce-examples/src/test/java/org/apache/hadoop/examples/TestWordStats.java Mon Oct 22 20:43:16 2012
@@ -1,272 +1,272 @@
-package org.apache.hadoop.examples;
-
-import static org.junit.Assert.assertEquals;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.StringTokenizer;
-import java.util.TreeMap;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.util.ToolRunner;
-import org.junit.Before;
-import org.junit.Test;
-
-public class TestWordStats {
-
-  private final static String INPUT = "src/test/java/org/apache/hadoop/examples/pi/math";
-  private final static String MEAN_OUTPUT = "build/data/mean_output";
-  private final static String MEDIAN_OUTPUT = "build/data/median_output";
-  private final static String STDDEV_OUTPUT = "build/data/stddev_output";
-
-  /**
-   * Modified internal test class that is designed to read all the files in the
-   * input directory, and find the standard deviation between all of the word
-   * lengths.
-   */
-  public static class WordStdDevReader {
-    private long wordsRead = 0;
-    private long wordLengthsRead = 0;
-    private long wordLengthsReadSquared = 0;
-
-    public WordStdDevReader() {
-    }
-
-    public double read(String path) throws IOException {
-      FileSystem fs = FileSystem.get(new Configuration());
-      FileStatus[] files = fs.listStatus(new Path(path));
-
-      for (FileStatus fileStat : files) {
-        if (!fileStat.isFile())
-          continue;
-
-        BufferedReader br = null;
-
-        try {
-          br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath())));
-
-          String line;
-          while ((line = br.readLine()) != null) {
-            StringTokenizer st = new StringTokenizer(line);
-            String word;
-            while (st.hasMoreTokens()) {
-              word = st.nextToken();
-              this.wordsRead++;
-              this.wordLengthsRead += word.length();
-              this.wordLengthsReadSquared += (long) Math.pow(word.length(), 2.0);
-            }
-          }
-
-        } catch (IOException e) {
-          System.out.println("Output could not be read!");
-          throw e;
-        } finally {
-          br.close();
-        }
-      }
-
-      double mean = (((double) this.wordLengthsRead) / ((double) this.wordsRead));
-      mean = Math.pow(mean, 2.0);
-      double term = (((double) this.wordLengthsReadSquared / ((double) this.wordsRead)));
-      double stddev = Math.sqrt((term - mean));
-      return stddev;
-    }
-
-  }
-
-  /**
-   * Modified internal test class that is designed to read all the files in the
-   * input directory, and find the median length of all the words.
-   */
-  public static class WordMedianReader {
-    private long wordsRead = 0;
-    private TreeMap<Integer, Integer> map = new TreeMap<Integer, Integer>();
-
-    public WordMedianReader() {
-    }
-
-    public double read(String path) throws IOException {
-      FileSystem fs = FileSystem.get(new Configuration());
-      FileStatus[] files = fs.listStatus(new Path(path));
-
-      int num = 0;
-
-      for (FileStatus fileStat : files) {
-        if (!fileStat.isFile())
-          continue;
-
-        BufferedReader br = null;
-
-        try {
-          br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath())));
-
-          String line;
-          while ((line = br.readLine()) != null) {
-            StringTokenizer st = new StringTokenizer(line);
-            String word;
-            while (st.hasMoreTokens()) {
-              word = st.nextToken();
-              this.wordsRead++;
-              if (this.map.get(word.length()) == null) {
-                this.map.put(word.length(), 1);
-              } else {
-                int count = this.map.get(word.length());
-                this.map.put(word.length(), count + 1);
-              }
-            }
-          }
-        } catch (IOException e) {
-          System.out.println("Output could not be read!");
-          throw e;
-        } finally {
-          br.close();
-        }
-      }
-
-      int medianIndex1 = (int) Math.ceil((this.wordsRead / 2.0));
-      int medianIndex2 = (int) Math.floor((this.wordsRead / 2.0));
-
-      for (Integer key : this.map.navigableKeySet()) {
-        int prevNum = num;
-        num += this.map.get(key);
-
-        if (medianIndex2 >= prevNum && medianIndex1 <= num) {
-          return key;
-        } else if (medianIndex2 >= prevNum && medianIndex1 < num) {
-          Integer nextCurrLen = this.map.navigableKeySet().iterator().next();
-          double median = (key + nextCurrLen) / 2.0;
-          return median;
-        }
-      }
-      return -1;
-    }
-
-  }
-
-  /**
-   * Modified internal test class that is designed to read all the files in the
-   * input directory, and find the mean length of all the words.
-   */
-  public static class WordMeanReader {
-    private long wordsRead = 0;
-    private long wordLengthsRead = 0;
-
-    public WordMeanReader() {
-    }
-
-    public double read(String path) throws IOException {
-      FileSystem fs = FileSystem.get(new Configuration());
-      FileStatus[] files = fs.listStatus(new Path(path));
-
-      for (FileStatus fileStat : files) {
-        if (!fileStat.isFile())
-          continue;
-
-        BufferedReader br = null;
-
-        try {
-          br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath())));
-
-          String line;
-          while ((line = br.readLine()) != null) {
-            StringTokenizer st = new StringTokenizer(line);
-            String word;
-            while (st.hasMoreTokens()) {
-              word = st.nextToken();
-              this.wordsRead++;
-              this.wordLengthsRead += word.length();
-            }
-          }
-        } catch (IOException e) {
-          System.out.println("Output could not be read!");
-          throw e;
-        } finally {
-          br.close();
-        }
-      }
-
-      double mean = (((double) this.wordLengthsRead) / ((double) this.wordsRead));
-      return mean;
-    }
-
-  }
-
-  /**
-   * Internal class designed to delete the output directory. Meant solely for
-   * use before and after the test is run; this is so next iterations of the
-   * test do not encounter a "file already exists" error.
-   * 
-   * @param dir
-   *          The directory to delete.
-   * @return Returns whether the deletion was successful or not.
-   */
-  public static boolean deleteDir(File dir) {
-    if (dir.isDirectory()) {
-      String[] children = dir.list();
-      for (int i = 0; i < children.length; i++) {
-        boolean success = deleteDir(new File(dir, children[i]));
-        if (!success) {
-          System.out.println("Could not delete directory after test!");
-          return false;
-        }
-      }
-    }
-
-    // The directory is now empty so delete it
-    return dir.delete();
-  }
-
-  @Before public void setup() throws Exception {
-    deleteDir(new File(MEAN_OUTPUT));
-    deleteDir(new File(MEDIAN_OUTPUT));
-    deleteDir(new File(STDDEV_OUTPUT));
-  }
-
-  @Test public void testGetTheMean() throws Exception {
-    String args[] = new String[2];
-    args[0] = INPUT;
-    args[1] = MEAN_OUTPUT;
-
-    WordMean wm = new WordMean();
-    ToolRunner.run(new Configuration(), wm, args);
-    double mean = wm.getMean();
-
-    // outputs MUST match
-    WordMeanReader wr = new WordMeanReader();
-    assertEquals(mean, wr.read(INPUT), 0.0);
-  }
-
-  @Test public void testGetTheMedian() throws Exception {
-    String args[] = new String[2];
-    args[0] = INPUT;
-    args[1] = MEDIAN_OUTPUT;
-
-    WordMedian wm = new WordMedian();
-    ToolRunner.run(new Configuration(), wm, args);
-    double median = wm.getMedian();
-
-    // outputs MUST match
-    WordMedianReader wr = new WordMedianReader();
-    assertEquals(median, wr.read(INPUT), 0.0);
-  }
-
-  @Test public void testGetTheStandardDeviation() throws Exception {
-    String args[] = new String[2];
-    args[0] = INPUT;
-    args[1] = STDDEV_OUTPUT;
-
-    WordStandardDeviation wsd = new WordStandardDeviation();
-    ToolRunner.run(new Configuration(), wsd, args);
-    double stddev = wsd.getStandardDeviation();
-
-    // outputs MUST match
-    WordStdDevReader wr = new WordStdDevReader();
-    assertEquals(stddev, wr.read(INPUT), 0.0);
-  }
-
-}
+package org.apache.hadoop.examples;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.StringTokenizer;
+import java.util.TreeMap;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.util.ToolRunner;
+import org.junit.Before;
+import org.junit.Test;
+
+public class TestWordStats {
+
+  private final static String INPUT = "src/test/java/org/apache/hadoop/examples/pi/math";
+  private final static String MEAN_OUTPUT = "build/data/mean_output";
+  private final static String MEDIAN_OUTPUT = "build/data/median_output";
+  private final static String STDDEV_OUTPUT = "build/data/stddev_output";
+
+  /**
+   * Modified internal test class that is designed to read all the files in the
+   * input directory, and find the standard deviation between all of the word
+   * lengths.
+   */
+  public static class WordStdDevReader {
+    private long wordsRead = 0;
+    private long wordLengthsRead = 0;
+    private long wordLengthsReadSquared = 0;
+
+    public WordStdDevReader() {
+    }
+
+    public double read(String path) throws IOException {
+      FileSystem fs = FileSystem.get(new Configuration());
+      FileStatus[] files = fs.listStatus(new Path(path));
+
+      for (FileStatus fileStat : files) {
+        if (!fileStat.isFile())
+          continue;
+
+        BufferedReader br = null;
+
+        try {
+          br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath())));
+
+          String line;
+          while ((line = br.readLine()) != null) {
+            StringTokenizer st = new StringTokenizer(line);
+            String word;
+            while (st.hasMoreTokens()) {
+              word = st.nextToken();
+              this.wordsRead++;
+              this.wordLengthsRead += word.length();
+              this.wordLengthsReadSquared += (long) Math.pow(word.length(), 2.0);
+            }
+          }
+
+        } catch (IOException e) {
+          System.out.println("Output could not be read!");
+          throw e;
+        } finally {
+          br.close();
+        }
+      }
+
+      double mean = (((double) this.wordLengthsRead) / ((double) this.wordsRead));
+      mean = Math.pow(mean, 2.0);
+      double term = (((double) this.wordLengthsReadSquared / ((double) this.wordsRead)));
+      double stddev = Math.sqrt((term - mean));
+      return stddev;
+    }
+
+  }
+
+  /**
+   * Modified internal test class that is designed to read all the files in the
+   * input directory, and find the median length of all the words.
+   */
+  public static class WordMedianReader {
+    private long wordsRead = 0;
+    private TreeMap<Integer, Integer> map = new TreeMap<Integer, Integer>();
+
+    public WordMedianReader() {
+    }
+
+    public double read(String path) throws IOException {
+      FileSystem fs = FileSystem.get(new Configuration());
+      FileStatus[] files = fs.listStatus(new Path(path));
+
+      int num = 0;
+
+      for (FileStatus fileStat : files) {
+        if (!fileStat.isFile())
+          continue;
+
+        BufferedReader br = null;
+
+        try {
+          br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath())));
+
+          String line;
+          while ((line = br.readLine()) != null) {
+            StringTokenizer st = new StringTokenizer(line);
+            String word;
+            while (st.hasMoreTokens()) {
+              word = st.nextToken();
+              this.wordsRead++;
+              if (this.map.get(word.length()) == null) {
+                this.map.put(word.length(), 1);
+              } else {
+                int count = this.map.get(word.length());
+                this.map.put(word.length(), count + 1);
+              }
+            }
+          }
+        } catch (IOException e) {
+          System.out.println("Output could not be read!");
+          throw e;
+        } finally {
+          br.close();
+        }
+      }
+
+      int medianIndex1 = (int) Math.ceil((this.wordsRead / 2.0));
+      int medianIndex2 = (int) Math.floor((this.wordsRead / 2.0));
+
+      for (Integer key : this.map.navigableKeySet()) {
+        int prevNum = num;
+        num += this.map.get(key);
+
+        if (medianIndex2 >= prevNum && medianIndex1 <= num) {
+          return key;
+        } else if (medianIndex2 >= prevNum && medianIndex1 < num) {
+          Integer nextCurrLen = this.map.navigableKeySet().iterator().next();
+          double median = (key + nextCurrLen) / 2.0;
+          return median;
+        }
+      }
+      return -1;
+    }
+
+  }
+
+  /**
+   * Modified internal test class that is designed to read all the files in the
+   * input directory, and find the mean length of all the words.
+   */
+  public static class WordMeanReader {
+    private long wordsRead = 0;
+    private long wordLengthsRead = 0;
+
+    public WordMeanReader() {
+    }
+
+    public double read(String path) throws IOException {
+      FileSystem fs = FileSystem.get(new Configuration());
+      FileStatus[] files = fs.listStatus(new Path(path));
+
+      for (FileStatus fileStat : files) {
+        if (!fileStat.isFile())
+          continue;
+
+        BufferedReader br = null;
+
+        try {
+          br = new BufferedReader(new InputStreamReader(fs.open(fileStat.getPath())));
+
+          String line;
+          while ((line = br.readLine()) != null) {
+            StringTokenizer st = new StringTokenizer(line);
+            String word;
+            while (st.hasMoreTokens()) {
+              word = st.nextToken();
+              this.wordsRead++;
+              this.wordLengthsRead += word.length();
+            }
+          }
+        } catch (IOException e) {
+          System.out.println("Output could not be read!");
+          throw e;
+        } finally {
+          br.close();
+        }
+      }
+
+      double mean = (((double) this.wordLengthsRead) / ((double) this.wordsRead));
+      return mean;
+    }
+
+  }
+
+  /**
+   * Internal class designed to delete the output directory. Meant solely for
+   * use before and after the test is run; this is so next iterations of the
+   * test do not encounter a "file already exists" error.
+   * 
+   * @param dir
+   *          The directory to delete.
+   * @return Returns whether the deletion was successful or not.
+   */
+  public static boolean deleteDir(File dir) {
+    if (dir.isDirectory()) {
+      String[] children = dir.list();
+      for (int i = 0; i < children.length; i++) {
+        boolean success = deleteDir(new File(dir, children[i]));
+        if (!success) {
+          System.out.println("Could not delete directory after test!");
+          return false;
+        }
+      }
+    }
+
+    // The directory is now empty so delete it
+    return dir.delete();
+  }
+
+  @Before public void setup() throws Exception {
+    deleteDir(new File(MEAN_OUTPUT));
+    deleteDir(new File(MEDIAN_OUTPUT));
+    deleteDir(new File(STDDEV_OUTPUT));
+  }
+
+  @Test public void testGetTheMean() throws Exception {
+    String args[] = new String[2];
+    args[0] = INPUT;
+    args[1] = MEAN_OUTPUT;
+
+    WordMean wm = new WordMean();
+    ToolRunner.run(new Configuration(), wm, args);
+    double mean = wm.getMean();
+
+    // outputs MUST match
+    WordMeanReader wr = new WordMeanReader();
+    assertEquals(mean, wr.read(INPUT), 0.0);
+  }
+
+  @Test public void testGetTheMedian() throws Exception {
+    String args[] = new String[2];
+    args[0] = INPUT;
+    args[1] = MEDIAN_OUTPUT;
+
+    WordMedian wm = new WordMedian();
+    ToolRunner.run(new Configuration(), wm, args);
+    double median = wm.getMedian();
+
+    // outputs MUST match
+    WordMedianReader wr = new WordMedianReader();
+    assertEquals(median, wr.read(INPUT), 0.0);
+  }
+
+  @Test public void testGetTheStandardDeviation() throws Exception {
+    String args[] = new String[2];
+    args[0] = INPUT;
+    args[1] = STDDEV_OUTPUT;
+
+    WordStandardDeviation wsd = new WordStandardDeviation();
+    ToolRunner.run(new Configuration(), wsd, args);
+    double stddev = wsd.getStandardDeviation();
+
+    // outputs MUST match
+    WordStdDevReader wr = new WordStdDevReader();
+    assertEquals(stddev, wr.read(INPUT), 0.0);
+  }
+
+}
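
The WordMedianReader class above recovers the median word length from a sorted
length -> frequency map (a TreeMap) by accumulating counts until the running total
passes the middle rank(s) of the word list. A minimal standalone sketch of that idea
follows; the class name and the sample histogram are illustrative assumptions, not
taken from the commit.

    import java.util.Map;
    import java.util.TreeMap;

    public final class MedianFromHistogram {

      // Median of a multiset given as value -> frequency; the TreeMap keeps keys sorted.
      // Assumes the histogram is non-empty.
      static double median(TreeMap<Integer, Integer> freq) {
        long total = 0;
        for (int f : freq.values()) {
          total += f;
        }
        long lowerRank = (total + 1) / 2;  // 1-based rank of the lower middle element
        long upperRank = (total + 2) / 2;  // equals lowerRank when total is odd
        long seen = 0;
        Integer lowerVal = null;
        Integer upperVal = null;
        for (Map.Entry<Integer, Integer> e : freq.entrySet()) {
          seen += e.getValue();
          if (lowerVal == null && seen >= lowerRank) {
            lowerVal = e.getKey();
          }
          if (seen >= upperRank) {
            upperVal = e.getKey();
            break;
          }
        }
        return (lowerVal + upperVal) / 2.0;
      }

      public static void main(String[] args) {
        TreeMap<Integer, Integer> lengths = new TreeMap<Integer, Integer>();
        lengths.put(3, 2);  // two words of length 3
        lengths.put(6, 2);  // two words of length 6
        System.out.println(median(lengths));  // prints 4.5
      }
    }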

Propchange: hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/c++/
------------------------------------------------------------------------------
  Merged /hadoop/common/trunk/hadoop-mapreduce-project/src/c++:r1397381-1401062

Propchange: hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/
------------------------------------------------------------------------------
  Merged /hadoop/common/trunk/hadoop-mapreduce-project/src/contrib:r1397381-1401062

Propchange: hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/block_forensics/
------------------------------------------------------------------------------
  Merged /hadoop/common/trunk/hadoop-mapreduce-project/src/contrib/block_forensics:r1397381-1401062

Propchange: hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/build-contrib.xml
------------------------------------------------------------------------------
  Merged /hadoop/common/trunk/hadoop-mapreduce-project/src/contrib/build-contrib.xml:r1397381-1401062

Propchange: hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/build.xml
------------------------------------------------------------------------------
  Merged /hadoop/common/trunk/hadoop-mapreduce-project/src/contrib/build.xml:r1397381-1401062

Propchange: hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/data_join/
------------------------------------------------------------------------------
  Merged /hadoop/common/trunk/hadoop-mapreduce-project/src/contrib/data_join:r1397381-1401062

Propchange: hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/eclipse-plugin/
------------------------------------------------------------------------------
  Merged /hadoop/common/trunk/hadoop-mapreduce-project/src/contrib/eclipse-plugin:r1397381-1401062

Propchange: hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/
------------------------------------------------------------------------------
  Merged /hadoop/common/trunk/hadoop-mapreduce-project/src/contrib/index:r1397381-1401062

Modified: hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/sample/data.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/sample/data.txt?rev=1401071&r1=1401070&r2=1401071&view=diff
==============================================================================
--- hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/sample/data.txt (original)
+++ hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/sample/data.txt Mon Oct 22 20:43:16 2012
@@ -1,10 +1,10 @@
-0 ins apache dot org
-1 ins apache
-2 ins apache
-3 ins apache
-4 ins apache
-5 ins apache
-6 ins apache
-7 ins apache
-8 ins apache
-9 ins apache
+0 ins apache dot org
+1 ins apache
+2 ins apache
+3 ins apache
+4 ins apache
+5 ins apache
+6 ins apache
+7 ins apache
+8 ins apache
+9 ins apache

Modified: hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/sample/data2.txt
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/sample/data2.txt?rev=1401071&r1=1401070&r2=1401071&view=diff
==============================================================================
--- hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/sample/data2.txt (original)
+++ hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/sample/data2.txt Mon Oct 22 20:43:16 2012
@@ -1,10 +1,10 @@
-0 del
-1 upd hadoop
-2 del
-3 upd hadoop
-4 del
-5 upd hadoop
-6 del
-7 upd hadoop
-8 del
-9 upd hadoop
+0 del
+1 upd hadoop
+2 del
+3 upd hadoop
+4 del
+5 upd hadoop
+6 del
+7 upd hadoop
+8 del
+9 upd hadoop

Modified: hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/HashingDistributionPolicy.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/HashingDistributionPolicy.java?rev=1401071&r1=1401070&r2=1401071&view=diff
==============================================================================
--- hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/HashingDistributionPolicy.java (original)
+++ hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/HashingDistributionPolicy.java Mon Oct 22 20:43:16 2012
@@ -1,56 +1,56 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.contrib.index.example;
-
-import org.apache.hadoop.contrib.index.mapred.DocumentID;
-import org.apache.hadoop.contrib.index.mapred.IDistributionPolicy;
-import org.apache.hadoop.contrib.index.mapred.Shard;
-
-/**
- * Choose a shard for each insert or delete based on document id hashing. Do
- * NOT use this distribution policy when the number of shards changes.
- */
-public class HashingDistributionPolicy implements IDistributionPolicy {
-
-  private int numShards;
-
-  /* (non-Javadoc)
-   * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#init(org.apache.hadoop.contrib.index.mapred.Shard[])
-   */
-  public void init(Shard[] shards) {
-    numShards = shards.length;
-  }
-
-  /* (non-Javadoc)
-   * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#chooseShardForInsert(org.apache.hadoop.contrib.index.mapred.DocumentID)
-   */
-  public int chooseShardForInsert(DocumentID key) {
-    int hashCode = key.hashCode();
-    return hashCode >= 0 ? hashCode % numShards : (-hashCode) % numShards;
-  }
-
-  /* (non-Javadoc)
-   * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#chooseShardForDelete(org.apache.hadoop.contrib.index.mapred.DocumentID)
-   */
-  public int chooseShardForDelete(DocumentID key) {
-    int hashCode = key.hashCode();
-    return hashCode >= 0 ? hashCode % numShards : (-hashCode) % numShards;
-  }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.example;
+
+import org.apache.hadoop.contrib.index.mapred.DocumentID;
+import org.apache.hadoop.contrib.index.mapred.IDistributionPolicy;
+import org.apache.hadoop.contrib.index.mapred.Shard;
+
+/**
+ * Choose a shard for each insert or delete based on document id hashing. Do
+ * NOT use this distribution policy when the number of shards changes.
+ */
+public class HashingDistributionPolicy implements IDistributionPolicy {
+
+  private int numShards;
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#init(org.apache.hadoop.contrib.index.mapred.Shard[])
+   */
+  public void init(Shard[] shards) {
+    numShards = shards.length;
+  }
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#chooseShardForInsert(org.apache.hadoop.contrib.index.mapred.DocumentID)
+   */
+  public int chooseShardForInsert(DocumentID key) {
+    int hashCode = key.hashCode();
+    return hashCode >= 0 ? hashCode % numShards : (-hashCode) % numShards;
+  }
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#chooseShardForDelete(org.apache.hadoop.contrib.index.mapred.DocumentID)
+   */
+  public int chooseShardForDelete(DocumentID key) {
+    int hashCode = key.hashCode();
+    return hashCode >= 0 ? hashCode % numShards : (-hashCode) % numShards;
+  }
+
+}
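
HashingDistributionPolicy above maps a document to a shard by taking the document id's
hash code modulo the shard count, flipping the sign of negative hashes first. For
reference, a widely used variant that yields a non-negative index for every int input
(including Integer.MIN_VALUE, whose negation overflows back to itself) is the double
modulo shown below; the method and parameter names are illustrative assumptions, not
part of the commit.

    // Map an arbitrary int hash code to a shard index in [0, numShards).
    static int chooseShard(int hashCode, int numShards) {
      return (hashCode % numShards + numShards) % numShards;
    }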

Modified: hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/IdentityLocalAnalysis.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/IdentityLocalAnalysis.java?rev=1401071&r1=1401070&r2=1401071&view=diff
==============================================================================
--- hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/IdentityLocalAnalysis.java (original)
+++ hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/IdentityLocalAnalysis.java Mon Oct 22 20:43:16 2012
@@ -1,57 +1,57 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.contrib.index.example;
-
-import java.io.IOException;
-
-import org.apache.hadoop.contrib.index.mapred.DocumentAndOp;
-import org.apache.hadoop.contrib.index.mapred.DocumentID;
-import org.apache.hadoop.contrib.index.mapred.ILocalAnalysis;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.OutputCollector;
-import org.apache.hadoop.mapred.Reporter;
-
-/**
- * Identity local analysis maps inputs directly into outputs.
- */
-public class IdentityLocalAnalysis implements
-    ILocalAnalysis<DocumentID, DocumentAndOp> {
-
-  /* (non-Javadoc)
-   * @see org.apache.hadoop.mapred.Mapper#map(java.lang.Object, java.lang.Object, org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter)
-   */
-  public void map(DocumentID key, DocumentAndOp value,
-      OutputCollector<DocumentID, DocumentAndOp> output, Reporter reporter)
-      throws IOException {
-    output.collect(key, value);
-  }
-
-  /* (non-Javadoc)
-   * @see org.apache.hadoop.mapred.JobConfigurable#configure(org.apache.hadoop.mapred.JobConf)
-   */
-  public void configure(JobConf job) {
-  }
-
-  /* (non-Javadoc)
-   * @see org.apache.hadoop.io.Closeable#close()
-   */
-  public void close() throws IOException {
-  }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.example;
+
+import java.io.IOException;
+
+import org.apache.hadoop.contrib.index.mapred.DocumentAndOp;
+import org.apache.hadoop.contrib.index.mapred.DocumentID;
+import org.apache.hadoop.contrib.index.mapred.ILocalAnalysis;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+
+/**
+ * Identity local analysis maps inputs directly into outputs.
+ */
+public class IdentityLocalAnalysis implements
+    ILocalAnalysis<DocumentID, DocumentAndOp> {
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.mapred.Mapper#map(java.lang.Object, java.lang.Object, org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter)
+   */
+  public void map(DocumentID key, DocumentAndOp value,
+      OutputCollector<DocumentID, DocumentAndOp> output, Reporter reporter)
+      throws IOException {
+    output.collect(key, value);
+  }
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.mapred.JobConfigurable#configure(org.apache.hadoop.mapred.JobConf)
+   */
+  public void configure(JobConf job) {
+  }
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.io.Closeable#close()
+   */
+  public void close() throws IOException {
+  }
+
+}

Modified: hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocInputFormat.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocInputFormat.java?rev=1401071&r1=1401070&r2=1401071&view=diff
==============================================================================
--- hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocInputFormat.java (original)
+++ hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocInputFormat.java Mon Oct 22 20:43:16 2012
@@ -1,46 +1,46 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.contrib.index.example;
-
-import java.io.IOException;
-
-import org.apache.hadoop.contrib.index.mapred.DocumentID;
-import org.apache.hadoop.mapred.FileInputFormat;
-import org.apache.hadoop.mapred.FileSplit;
-import org.apache.hadoop.mapred.InputSplit;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.RecordReader;
-import org.apache.hadoop.mapred.Reporter;
-
-/**
- * An InputFormat for LineDoc for plain text files where each line is a doc.
- */
-public class LineDocInputFormat extends
-    FileInputFormat<DocumentID, LineDocTextAndOp> {
-
-  /* (non-Javadoc)
-   * @see org.apache.hadoop.mapred.FileInputFormat#getRecordReader(org.apache.hadoop.mapred.InputSplit, org.apache.hadoop.mapred.JobConf, org.apache.hadoop.mapred.Reporter)
-   */
-  public RecordReader<DocumentID, LineDocTextAndOp> getRecordReader(
-      InputSplit split, JobConf job, Reporter reporter) throws IOException {
-    reporter.setStatus(split.toString());
-    return new LineDocRecordReader(job, (FileSplit) split);
-  }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.example;
+
+import java.io.IOException;
+
+import org.apache.hadoop.contrib.index.mapred.DocumentID;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hadoop.mapred.Reporter;
+
+/**
+ * An InputFormat for LineDoc for plain text files where each line is a doc.
+ */
+public class LineDocInputFormat extends
+    FileInputFormat<DocumentID, LineDocTextAndOp> {
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.mapred.FileInputFormat#getRecordReader(org.apache.hadoop.mapred.InputSplit, org.apache.hadoop.mapred.JobConf, org.apache.hadoop.mapred.Reporter)
+   */
+  public RecordReader<DocumentID, LineDocTextAndOp> getRecordReader(
+      InputSplit split, JobConf job, Reporter reporter) throws IOException {
+    reporter.setStatus(split.toString());
+    return new LineDocRecordReader(job, (FileSplit) split);
+  }
+
+}

Modified: hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocLocalAnalysis.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocLocalAnalysis.java?rev=1401071&r1=1401070&r2=1401071&view=diff
==============================================================================
--- hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocLocalAnalysis.java (original)
+++ hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocLocalAnalysis.java Mon Oct 22 20:43:16 2012
@@ -1,80 +1,80 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.contrib.index.example;
-
-import java.io.IOException;
-
-import org.apache.hadoop.contrib.index.mapred.DocumentAndOp;
-import org.apache.hadoop.contrib.index.mapred.DocumentID;
-import org.apache.hadoop.contrib.index.mapred.ILocalAnalysis;
-import org.apache.hadoop.mapred.JobConf;
-import org.apache.hadoop.mapred.OutputCollector;
-import org.apache.hadoop.mapred.Reporter;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.index.Term;
-
-/**
- * Convert LineDocTextAndOp to DocumentAndOp as required by ILocalAnalysis.
- */
-public class LineDocLocalAnalysis implements
-    ILocalAnalysis<DocumentID, LineDocTextAndOp> {
-
-  private static String docidFieldName = "id";
-  private static String contentFieldName = "content";
-
-  /* (non-Javadoc)
-   * @see org.apache.hadoop.mapred.Mapper#map(java.lang.Object, java.lang.Object, org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter)
-   */
-  public void map(DocumentID key, LineDocTextAndOp value,
-      OutputCollector<DocumentID, DocumentAndOp> output, Reporter reporter)
-      throws IOException {
-
-    DocumentAndOp.Op op = value.getOp();
-    Document doc = null;
-    Term term = null;
-
-    if (op == DocumentAndOp.Op.INSERT || op == DocumentAndOp.Op.UPDATE) {
-      doc = new Document();
-      doc.add(new Field(docidFieldName, key.getText().toString(),
-          Field.Store.YES, Field.Index.UN_TOKENIZED));
-      doc.add(new Field(contentFieldName, value.getText().toString(),
-          Field.Store.NO, Field.Index.TOKENIZED));
-    }
-
-    if (op == DocumentAndOp.Op.DELETE || op == DocumentAndOp.Op.UPDATE) {
-      term = new Term(docidFieldName, key.getText().toString());
-    }
-
-    output.collect(key, new DocumentAndOp(op, doc, term));
-  }
-
-  /* (non-Javadoc)
-   * @see org.apache.hadoop.mapred.JobConfigurable#configure(org.apache.hadoop.mapred.JobConf)
-   */
-  public void configure(JobConf job) {
-  }
-
-  /* (non-Javadoc)
-   * @see org.apache.hadoop.io.Closeable#close()
-   */
-  public void close() throws IOException {
-  }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.example;
+
+import java.io.IOException;
+
+import org.apache.hadoop.contrib.index.mapred.DocumentAndOp;
+import org.apache.hadoop.contrib.index.mapred.DocumentID;
+import org.apache.hadoop.contrib.index.mapred.ILocalAnalysis;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.Term;
+
+/**
+ * Convert LineDocTextAndOp to DocumentAndOp as required by ILocalAnalysis.
+ */
+public class LineDocLocalAnalysis implements
+    ILocalAnalysis<DocumentID, LineDocTextAndOp> {
+
+  private static String docidFieldName = "id";
+  private static String contentFieldName = "content";
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.mapred.Mapper#map(java.lang.Object, java.lang.Object, org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter)
+   */
+  public void map(DocumentID key, LineDocTextAndOp value,
+      OutputCollector<DocumentID, DocumentAndOp> output, Reporter reporter)
+      throws IOException {
+
+    DocumentAndOp.Op op = value.getOp();
+    Document doc = null;
+    Term term = null;
+
+    if (op == DocumentAndOp.Op.INSERT || op == DocumentAndOp.Op.UPDATE) {
+      doc = new Document();
+      doc.add(new Field(docidFieldName, key.getText().toString(),
+          Field.Store.YES, Field.Index.UN_TOKENIZED));
+      doc.add(new Field(contentFieldName, value.getText().toString(),
+          Field.Store.NO, Field.Index.TOKENIZED));
+    }
+
+    if (op == DocumentAndOp.Op.DELETE || op == DocumentAndOp.Op.UPDATE) {
+      term = new Term(docidFieldName, key.getText().toString());
+    }
+
+    output.collect(key, new DocumentAndOp(op, doc, term));
+  }
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.mapred.JobConfigurable#configure(org.apache.hadoop.mapred.JobConf)
+   */
+  public void configure(JobConf job) {
+  }
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.io.Closeable#close()
+   */
+  public void close() throws IOException {
+  }
+
+}
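
Note for readers following the contrib/index code above: the local analysis turns each (DocumentID, LineDocTextAndOp) record into a Lucene Document for inserts and updates, and into a Term on the "id" field for deletes and updates. The stand-alone sketch below restates that mapping with plain strings, using the same Lucene 2.x Field API as the class above; the class name, method names and sample values are illustrative only and are not part of this commit.

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.Term;

/** Illustrative sketch only; mirrors LineDocLocalAnalysis.map() without the MapReduce plumbing. */
public class LocalAnalysisSketch {
  enum Op { INSERT, DELETE, UPDATE }

  static Document buildDocument(String docId, String content) {
    Document doc = new Document();
    // "id" is stored but not tokenized, so a delete/update Term can match it exactly.
    doc.add(new Field("id", docId, Field.Store.YES, Field.Index.UN_TOKENIZED));
    // "content" is tokenized for search but not stored.
    doc.add(new Field("content", content, Field.Store.NO, Field.Index.TOKENIZED));
    return doc;
  }

  static void analyze(Op op, String docId, String content) {
    Document doc = (op == Op.INSERT || op == Op.UPDATE) ? buildDocument(docId, content) : null;
    Term term = (op == Op.DELETE || op == Op.UPDATE) ? new Term("id", docId) : null;
    System.out.println(op + ": doc=" + doc + ", deleteTerm=" + term);
  }

  public static void main(String[] args) {
    analyze(Op.UPDATE, "doc42", "the quick brown fox");  // an update emits both a Document and a Term
  }
}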

Modified: hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocRecordReader.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocRecordReader.java?rev=1401071&r1=1401070&r2=1401071&view=diff
==============================================================================
--- hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocRecordReader.java (original)
+++ hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocRecordReader.java Mon Oct 22 20:43:16 2012
@@ -1,231 +1,231 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.contrib.index.example;
-
-import java.io.BufferedInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.contrib.index.mapred.DocumentAndOp;
-import org.apache.hadoop.contrib.index.mapred.DocumentID;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapred.FileSplit;
-import org.apache.hadoop.mapred.RecordReader;
-
-/**
- * A simple RecordReader for LineDoc: plain text files in which each line is one
- * doc. Each line has the form documentID<SPACE>op<SPACE>content<EOF>, where op
- * can be "i", "ins" or "insert" for insert; "d", "del" or "delete" for delete;
- * or "u", "upd" or "update" for update.
- */
-public class LineDocRecordReader implements
-    RecordReader<DocumentID, LineDocTextAndOp> {
-  private static final char SPACE = ' ';
-  private static final char EOL = '\n';
-
-  private long start;
-  private long pos;
-  private long end;
-  private BufferedInputStream in;
-  private ByteArrayOutputStream buffer = new ByteArrayOutputStream(256);
-
-  /**
-   * Provide a bridge to get the bytes from the ByteArrayOutputStream without
-   * creating a new byte array.
-   */
-  private static class TextStuffer extends OutputStream {
-    public Text target;
-
-    public void write(int b) {
-      throw new UnsupportedOperationException("write(byte) not supported");
-    }
-
-    public void write(byte[] data, int offset, int len) throws IOException {
-      target.set(data, offset, len);
-    }
-  }
-
-  private TextStuffer bridge = new TextStuffer();
-
-  /**
-   * Constructor
-   * @param job
-   * @param split  
-   * @throws IOException
-   */
-  public LineDocRecordReader(Configuration job, FileSplit split)
-      throws IOException {
-    long start = split.getStart();
-    long end = start + split.getLength();
-    final Path file = split.getPath();
-
-    // open the file and seek to the start of the split
-    FileSystem fs = file.getFileSystem(job);
-    FSDataInputStream fileIn = fs.open(split.getPath());
-    InputStream in = fileIn;
-    boolean skipFirstLine = false;
-    if (start != 0) {
-      skipFirstLine = true; // wait till BufferedInputStream to skip
-      --start;
-      fileIn.seek(start);
-    }
-
-    this.in = new BufferedInputStream(in);
-    if (skipFirstLine) { // skip first line and re-establish "start".
-      start += LineDocRecordReader.readData(this.in, null, EOL);
-    }
-    this.start = start;
-    this.pos = start;
-    this.end = end;
-  }
-
-  /* (non-Javadoc)
-   * @see org.apache.hadoop.mapred.RecordReader#close()
-   */
-  public void close() throws IOException {
-    in.close();
-  }
-
-  /* (non-Javadoc)
-   * @see org.apache.hadoop.mapred.RecordReader#createKey()
-   */
-  public DocumentID createKey() {
-    return new DocumentID();
-  }
-
-  /* (non-Javadoc)
-   * @see org.apache.hadoop.mapred.RecordReader#createValue()
-   */
-  public LineDocTextAndOp createValue() {
-    return new LineDocTextAndOp();
-  }
-
-  /* (non-Javadoc)
-   * @see org.apache.hadoop.mapred.RecordReader#getPos()
-   */
-  public long getPos() throws IOException {
-    return pos;
-  }
-
-  /* (non-Javadoc)
-   * @see org.apache.hadoop.mapred.RecordReader#getProgress()
-   */
-  public float getProgress() throws IOException {
-    if (start == end) {
-      return 0.0f;
-    } else {
-      return Math.min(1.0f, (pos - start) / (float) (end - start));
-    }
-  }
-
-  /* (non-Javadoc)
-   * @see org.apache.hadoop.mapred.RecordReader#next(java.lang.Object, java.lang.Object)
-   */
-  public synchronized boolean next(DocumentID key, LineDocTextAndOp value)
-      throws IOException {
-    if (pos >= end) {
-      return false;
-    }
-
-    // key is the document id: the bytes up to the first space
-    if (!readInto(key.getText(), SPACE)) {
-      return false;
-    }
-
-    // read operation: i/d/u, or ins/del/upd, or insert/delete/update
-    Text opText = new Text();
-    if (!readInto(opText, SPACE)) {
-      return false;
-    }
-    String opStr = opText.toString();
-    DocumentAndOp.Op op;
-    if (opStr.equals("i") || opStr.equals("ins") || opStr.equals("insert")) {
-      op = DocumentAndOp.Op.INSERT;
-    } else if (opStr.equals("d") || opStr.equals("del")
-        || opStr.equals("delete")) {
-      op = DocumentAndOp.Op.DELETE;
-    } else if (opStr.equals("u") || opStr.equals("upd")
-        || opStr.equals("update")) {
-      op = DocumentAndOp.Op.UPDATE;
-    } else {
-      // default is insert
-      op = DocumentAndOp.Op.INSERT;
-    }
-    value.setOp(op);
-
-    if (op == DocumentAndOp.Op.DELETE) {
-      return true;
-    } else {
-      // read rest of the line
-      return readInto(value.getText(), EOL);
-    }
-  }
-
-  private boolean readInto(Text text, char delimiter) throws IOException {
-    buffer.reset();
-    long bytesRead = readData(in, buffer, delimiter);
-    if (bytesRead == 0) {
-      return false;
-    }
-    pos += bytesRead;
-    bridge.target = text;
-    buffer.writeTo(bridge);
-    return true;
-  }
-
-  private static long readData(InputStream in, OutputStream out, char delimiter)
-      throws IOException {
-    long bytes = 0;
-    while (true) {
-
-      int b = in.read();
-      if (b == -1) {
-        break;
-      }
-      bytes += 1;
-
-      byte c = (byte) b;
-      if (c == EOL || c == delimiter) {
-        break;
-      }
-
-      if (c == '\r') {
-        in.mark(1);
-        byte nextC = (byte) in.read();
-        if (nextC != EOL || c == delimiter) {
-          in.reset();
-        } else {
-          bytes += 1;
-        }
-        break;
-      }
-
-      if (out != null) {
-        out.write(c);
-      }
-    }
-    return bytes;
-  }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.example;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.contrib.index.mapred.DocumentAndOp;
+import org.apache.hadoop.contrib.index.mapred.DocumentID;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.RecordReader;
+
+/**
+ * A simple RecordReader for LineDoc: plain text files in which each line is one
+ * doc. Each line has the form documentID<SPACE>op<SPACE>content<EOF>, where op
+ * can be "i", "ins" or "insert" for insert; "d", "del" or "delete" for delete;
+ * or "u", "upd" or "update" for update.
+ */
+public class LineDocRecordReader implements
+    RecordReader<DocumentID, LineDocTextAndOp> {
+  private static final char SPACE = ' ';
+  private static final char EOL = '\n';
+
+  private long start;
+  private long pos;
+  private long end;
+  private BufferedInputStream in;
+  private ByteArrayOutputStream buffer = new ByteArrayOutputStream(256);
+
+  /**
+   * Provide a bridge to get the bytes from the ByteArrayOutputStream without
+   * creating a new byte array.
+   */
+  private static class TextStuffer extends OutputStream {
+    public Text target;
+
+    public void write(int b) {
+      throw new UnsupportedOperationException("write(byte) not supported");
+    }
+
+    public void write(byte[] data, int offset, int len) throws IOException {
+      target.set(data, offset, len);
+    }
+  }
+
+  private TextStuffer bridge = new TextStuffer();
+
+  /**
+   * Constructor
+   * @param job
+   * @param split  
+   * @throws IOException
+   */
+  public LineDocRecordReader(Configuration job, FileSplit split)
+      throws IOException {
+    long start = split.getStart();
+    long end = start + split.getLength();
+    final Path file = split.getPath();
+
+    // open the file and seek to the start of the split
+    FileSystem fs = file.getFileSystem(job);
+    FSDataInputStream fileIn = fs.open(split.getPath());
+    InputStream in = fileIn;
+    boolean skipFirstLine = false;
+    if (start != 0) {
+      skipFirstLine = true; // wait till BufferedInputStream to skip
+      --start;
+      fileIn.seek(start);
+    }
+
+    this.in = new BufferedInputStream(in);
+    if (skipFirstLine) { // skip first line and re-establish "start".
+      start += LineDocRecordReader.readData(this.in, null, EOL);
+    }
+    this.start = start;
+    this.pos = start;
+    this.end = end;
+  }
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.mapred.RecordReader#close()
+   */
+  public void close() throws IOException {
+    in.close();
+  }
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.mapred.RecordReader#createKey()
+   */
+  public DocumentID createKey() {
+    return new DocumentID();
+  }
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.mapred.RecordReader#createValue()
+   */
+  public LineDocTextAndOp createValue() {
+    return new LineDocTextAndOp();
+  }
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.mapred.RecordReader#getPos()
+   */
+  public long getPos() throws IOException {
+    return pos;
+  }
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.mapred.RecordReader#getProgress()
+   */
+  public float getProgress() throws IOException {
+    if (start == end) {
+      return 0.0f;
+    } else {
+      return Math.min(1.0f, (pos - start) / (float) (end - start));
+    }
+  }
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.mapred.RecordReader#next(java.lang.Object, java.lang.Object)
+   */
+  public synchronized boolean next(DocumentID key, LineDocTextAndOp value)
+      throws IOException {
+    if (pos >= end) {
+      return false;
+    }
+
+    // key is the document id: the bytes up to the first space
+    if (!readInto(key.getText(), SPACE)) {
+      return false;
+    }
+
+    // read operation: i/d/u, or ins/del/upd, or insert/delete/update
+    Text opText = new Text();
+    if (!readInto(opText, SPACE)) {
+      return false;
+    }
+    String opStr = opText.toString();
+    DocumentAndOp.Op op;
+    if (opStr.equals("i") || opStr.equals("ins") || opStr.equals("insert")) {
+      op = DocumentAndOp.Op.INSERT;
+    } else if (opStr.equals("d") || opStr.equals("del")
+        || opStr.equals("delete")) {
+      op = DocumentAndOp.Op.DELETE;
+    } else if (opStr.equals("u") || opStr.equals("upd")
+        || opStr.equals("update")) {
+      op = DocumentAndOp.Op.UPDATE;
+    } else {
+      // default is insert
+      op = DocumentAndOp.Op.INSERT;
+    }
+    value.setOp(op);
+
+    if (op == DocumentAndOp.Op.DELETE) {
+      return true;
+    } else {
+      // read rest of the line
+      return readInto(value.getText(), EOL);
+    }
+  }
+
+  private boolean readInto(Text text, char delimiter) throws IOException {
+    buffer.reset();
+    long bytesRead = readData(in, buffer, delimiter);
+    if (bytesRead == 0) {
+      return false;
+    }
+    pos += bytesRead;
+    bridge.target = text;
+    buffer.writeTo(bridge);
+    return true;
+  }
+
+  private static long readData(InputStream in, OutputStream out, char delimiter)
+      throws IOException {
+    long bytes = 0;
+    while (true) {
+
+      int b = in.read();
+      if (b == -1) {
+        break;
+      }
+      bytes += 1;
+
+      byte c = (byte) b;
+      if (c == EOL || c == delimiter) {
+        break;
+      }
+
+      if (c == '\r') {
+        in.mark(1);
+        byte nextC = (byte) in.read();
+        if (nextC != EOL || c == delimiter) {
+          in.reset();
+        } else {
+          bytes += 1;
+        }
+        break;
+      }
+
+      if (out != null) {
+        out.write(c);
+      }
+    }
+    return bytes;
+  }
+}
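
The record reader above accepts lines of the form documentID<SPACE>op<SPACE>content. As a quick way to sanity-check an input file outside of a MapReduce job, here is a minimal parser for the same format; the sample line and the class name are made up for illustration and are not part of this change.

/** Illustrative only: parses one line of the documentID<SPACE>op<SPACE>content format
    accepted by LineDocRecordReader, without the split/offset bookkeeping above. */
public class LineDocFormatDemo {
  public static void main(String[] args) {
    String line = "doc42 upd the quick brown fox";   // hypothetical sample record
    int first = line.indexOf(' ');
    int second = line.indexOf(' ', first + 1);
    String id = line.substring(0, first);
    String opStr = (second == -1) ? line.substring(first + 1) : line.substring(first + 1, second);
    String content = (second == -1) ? "" : line.substring(second + 1);

    String op;
    if (opStr.equals("i") || opStr.equals("ins") || opStr.equals("insert")) {
      op = "INSERT";
    } else if (opStr.equals("d") || opStr.equals("del") || opStr.equals("delete")) {
      op = "DELETE";
    } else if (opStr.equals("u") || opStr.equals("upd") || opStr.equals("update")) {
      op = "UPDATE";
    } else {
      op = "INSERT";  // same default as the reader above
    }
    System.out.println(id + " -> " + op + (op.equals("DELETE") ? "" : ": " + content));
  }
}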

Modified: hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocTextAndOp.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocTextAndOp.java?rev=1401071&r1=1401070&r2=1401071&view=diff
==============================================================================
--- hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocTextAndOp.java (original)
+++ hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocTextAndOp.java Mon Oct 22 20:43:16 2012
@@ -1,92 +1,92 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.contrib.index.example;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-
-import org.apache.hadoop.contrib.index.mapred.DocumentAndOp;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-
-/**
- * This class represents an operation. The operation can be an insert, a delete
- * or an update. If the operation is an insert or an update, a (new) document,
- * which is in the form of text, is specified.
- */
-public class LineDocTextAndOp implements Writable {
-  private DocumentAndOp.Op op;
-  private Text doc;
-
-  /**
-   * Constructor
-   */
-  public LineDocTextAndOp() {
-    doc = new Text();
-  }
-
-  /**
-   * Set the type of the operation.
-   * @param op  the type of the operation
-   */
-  public void setOp(DocumentAndOp.Op op) {
-    this.op = op;
-  }
-
-  /**
-   * Get the type of the operation.
-   * @return the type of the operation
-   */
-  public DocumentAndOp.Op getOp() {
-    return op;
-  }
-
-  /**
-   * Get the text that represents a document.
-   * @return the text that represents a document
-   */
-  public Text getText() {
-    return doc;
-  }
-
-  /* (non-Javadoc)
-   * @see java.lang.Object#toString()
-   */
-  public String toString() {
-    return this.getClass().getName() + "[op=" + op + ", text=" + doc + "]";
-  }
-
-  /* (non-Javadoc)
-   * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
-   */
-  public void write(DataOutput out) throws IOException {
-    throw new IOException(this.getClass().getName()
-        + ".write should never be called");
-  }
-
-  /* (non-Javadoc)
-   * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
-   */
-  public void readFields(DataInput in) throws IOException {
-    throw new IOException(this.getClass().getName()
-        + ".readFields should never be called");
-  }
-
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.example;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.contrib.index.mapred.DocumentAndOp;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * This class represents an operation. The operation can be an insert, a delete
+ * or an update. If the operation is an insert or an update, a (new) document,
+ * which is in the form of text, is specified.
+ */
+public class LineDocTextAndOp implements Writable {
+  private DocumentAndOp.Op op;
+  private Text doc;
+
+  /**
+   * Constructor
+   */
+  public LineDocTextAndOp() {
+    doc = new Text();
+  }
+
+  /**
+   * Set the type of the operation.
+   * @param op  the type of the operation
+   */
+  public void setOp(DocumentAndOp.Op op) {
+    this.op = op;
+  }
+
+  /**
+   * Get the type of the operation.
+   * @return the type of the operation
+   */
+  public DocumentAndOp.Op getOp() {
+    return op;
+  }
+
+  /**
+   * Get the text that represents a document.
+   * @return the text that represents a document
+   */
+  public Text getText() {
+    return doc;
+  }
+
+  /* (non-Javadoc)
+   * @see java.lang.Object#toString()
+   */
+  public String toString() {
+    return this.getClass().getName() + "[op=" + op + ", text=" + doc + "]";
+  }
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
+   */
+  public void write(DataOutput out) throws IOException {
+    throw new IOException(this.getClass().getName()
+        + ".write should never be called");
+  }
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
+   */
+  public void readFields(DataInput in) throws IOException {
+    throw new IOException(this.getClass().getName()
+        + ".readFields should never be called");
+  }
+
+}
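
LineDocTextAndOp only travels in memory between LineDocRecordReader and the local analysis, which is why write() and readFields() above deliberately throw. A short sketch of how an instance is populated, using the same calls LineDocRecordReader.next() makes; the demo class name is hypothetical and assumes the contrib and hadoop-common jars are on the classpath.

import org.apache.hadoop.contrib.index.example.LineDocTextAndOp;
import org.apache.hadoop.contrib.index.mapred.DocumentAndOp;

public class LineDocTextAndOpDemo {
  public static void main(String[] args) {
    LineDocTextAndOp value = new LineDocTextAndOp();
    value.setOp(DocumentAndOp.Op.UPDATE);          // operation parsed from the input line
    value.getText().set("the quick brown fox");    // remainder of the line after the op token
    System.out.println(value);                     // toString() prints both the op and the text
  }
}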

Modified: hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/RoundRobinDistributionPolicy.java
URL: http://svn.apache.org/viewvc/hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/RoundRobinDistributionPolicy.java?rev=1401071&r1=1401070&r2=1401071&view=diff
==============================================================================
--- hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/RoundRobinDistributionPolicy.java (original)
+++ hadoop/common/branches/branch-trunk-win/hadoop-mapreduce-project/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/RoundRobinDistributionPolicy.java Mon Oct 22 20:43:16 2012
@@ -1,58 +1,58 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.hadoop.contrib.index.example;
-
-import org.apache.hadoop.contrib.index.mapred.DocumentID;
-import org.apache.hadoop.contrib.index.mapred.IDistributionPolicy;
-import org.apache.hadoop.contrib.index.mapred.Shard;
-
-/**
- * Choose a shard for each insert in a round-robin fashion. Choose all the
- * shards for each delete because we don't know where it is stored.
- */
-public class RoundRobinDistributionPolicy implements IDistributionPolicy {
-
-  private int numShards;
-  private int rr; // round-robin implementation
-
-  /* (non-Javadoc)
-   * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#init(org.apache.hadoop.contrib.index.mapred.Shard[])
-   */
-  public void init(Shard[] shards) {
-    numShards = shards.length;
-    rr = 0;
-  }
-
-  /* (non-Javadoc)
-   * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#chooseShardForInsert(org.apache.hadoop.contrib.index.mapred.DocumentID)
-   */
-  public int chooseShardForInsert(DocumentID key) {
-    int chosen = rr;
-    rr = (rr + 1) % numShards;
-    return chosen;
-  }
-
-  /* (non-Javadoc)
-   * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#chooseShardForDelete(org.apache.hadoop.contrib.index.mapred.DocumentID)
-   */
-  public int chooseShardForDelete(DocumentID key) {
-    // -1 represents all the shards
-    return -1;
-  }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.example;
+
+import org.apache.hadoop.contrib.index.mapred.DocumentID;
+import org.apache.hadoop.contrib.index.mapred.IDistributionPolicy;
+import org.apache.hadoop.contrib.index.mapred.Shard;
+
+/**
+ * Choose a shard for each insert in a round-robin fashion. Choose all the
+ * shards for each delete because we don't know where it is stored.
+ */
+public class RoundRobinDistributionPolicy implements IDistributionPolicy {
+
+  private int numShards;
+  private int rr; // round-robin implementation
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#init(org.apache.hadoop.contrib.index.mapred.Shard[])
+   */
+  public void init(Shard[] shards) {
+    numShards = shards.length;
+    rr = 0;
+  }
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#chooseShardForInsert(org.apache.hadoop.contrib.index.mapred.DocumentID)
+   */
+  public int chooseShardForInsert(DocumentID key) {
+    int chosen = rr;
+    rr = (rr + 1) % numShards;
+    return chosen;
+  }
+
+  /* (non-Javadoc)
+   * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#chooseShardForDelete(org.apache.hadoop.contrib.index.mapred.DocumentID)
+   */
+  public int chooseShardForDelete(DocumentID key) {
+    // -1 represents all the shards
+    return -1;
+  }
+}
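
The policy above assigns inserts to shards 0..numShards-1 in rotation and sends every delete to all shards (signalled by the return value -1). A tiny self-contained illustration of that arithmetic follows, with a hypothetical shard count of 3; it needs no Hadoop types and is not part of this commit.

public class RoundRobinDemo {
  public static void main(String[] args) {
    int numShards = 3;  // hypothetical; the real shard count comes from init(Shard[])
    int rr = 0;
    for (int doc = 0; doc < 7; doc++) {
      int chosen = rr;            // same two lines as chooseShardForInsert()
      rr = (rr + 1) % numShards;
      System.out.println("insert doc" + doc + " -> shard " + chosen);
    }
    // Prints shards 0,1,2,0,1,2,0 for doc0..doc6.
    System.out.println("delete -> -1 (every shard, since the owning shard is unknown)");
  }
}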