Posted to commits@mahout.apache.org by ro...@apache.org on 2010/01/06 03:46:23 UTC

svn commit: r896311 [4/4] - in /lucene/mahout/trunk: core/src/main/java/org/apache/mahout/classifier/ core/src/main/java/org/apache/mahout/classifier/bayes/algorithm/ core/src/main/java/org/apache/mahout/classifier/bayes/common/ core/src/main/java/org/...

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorDriver.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorDriver.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorDriver.java Wed Jan  6 02:46:22 2010
@@ -17,6 +17,11 @@
 
 package org.apache.mahout.classifier.bayes;
 
+import java.io.File;
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
 import org.apache.commons.cli2.CommandLine;
 import org.apache.commons.cli2.Group;
 import org.apache.commons.cli2.Option;
@@ -41,58 +46,72 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.IOException;
-import java.io.File;
-import java.util.HashSet;
-import java.util.Set;
-
 /**
  * Create and run the Wikipedia Dataset Creator.
  */
-public class WikipediaDatasetCreatorDriver {
-  private static final Logger log = LoggerFactory.getLogger(WikipediaDatasetCreatorDriver.class);
-
-  private WikipediaDatasetCreatorDriver() {
-  }
-
+public final class WikipediaDatasetCreatorDriver {
+  private static final Logger log = LoggerFactory
+      .getLogger(WikipediaDatasetCreatorDriver.class);
+  
+  private WikipediaDatasetCreatorDriver() { }
+  
   /**
    * Takes in two arguments:
    * <ol>
-   * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li>
+   * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents
+   * live</li>
    * <li>The output {@link org.apache.hadoop.fs.Path} where to write the
    * classifier as a {@link org.apache.hadoop.io.SequenceFile}</li>
    * </ol>
-   *
-   * @param args The args
+   * 
+   * @param args
+   *          The args
    */
   public static void main(String[] args) throws IOException {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
-
-    Option dirInputPathOpt = obuilder.withLongName("input").withRequired(true).withArgument(
-            abuilder.withName("input").withMinimum(1).withMaximum(1).create()).
-            withDescription("The input directory path").withShortName("i").create();
-
-    Option dirOutputPathOpt = obuilder.withLongName("output").withRequired(true).withArgument(
-            abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
-            withDescription("The output directory Path").withShortName("o").create();
-
-    Option categoriesOpt = obuilder.withLongName("categories").withRequired(true).withArgument(
-            abuilder.withName("categories").withMinimum(1).withMaximum(1).create()).
-            withDescription("Location of the categories file.  One entry per line.  Will be used to make a string match in Wikipedia Category field").withShortName("c").create();
-
-    Option exactMatchOpt = obuilder.withLongName("exactMatch").
-            withDescription("If set, then the category name must exactly match the entry in the categories file. Default is false").withShortName("e").create();
-    Option analyzerOpt = obuilder.withLongName("analyzer").withRequired(false).withArgument(
-            abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create()).
-            withDescription("The analyzer to use, must have a no argument constructor").withShortName("a").create();
-    Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();
-
-    Group group = gbuilder.withName("Options").withOption(categoriesOpt).withOption(dirInputPathOpt).withOption(dirOutputPathOpt)
-            .withOption(exactMatchOpt).withOption(analyzerOpt)
-            .withOption(helpOpt).create();
-
+    
+    Option dirInputPathOpt = obuilder.withLongName("input").withRequired(true)
+        .withArgument(
+          abuilder.withName("input").withMinimum(1).withMaximum(1).create())
+        .withDescription("The input directory path").withShortName("i")
+        .create();
+    
+    Option dirOutputPathOpt = obuilder.withLongName("output")
+        .withRequired(true).withArgument(
+          abuilder.withName("output").withMinimum(1).withMaximum(1).create())
+        .withDescription("The output directory Path").withShortName("o")
+        .create();
+    
+    Option categoriesOpt = obuilder
+        .withLongName("categories")
+        .withRequired(true)
+        .withArgument(
+          abuilder.withName("categories").withMinimum(1).withMaximum(1)
+              .create())
+        .withDescription(
+          "Location of the categories file.  One entry per line. "
+              + "Will be used to make a string match in Wikipedia Category field")
+        .withShortName("c").create();
+    
+    Option exactMatchOpt = obuilder.withLongName("exactMatch").withDescription(
+      "If set, then the category name must exactly match the "
+          + "entry in the categories file. Default is false")
+        .withShortName("e").create();
+    Option analyzerOpt = obuilder.withLongName("analyzer").withRequired(false)
+        .withArgument(
+          abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create())
+        .withDescription(
+          "The analyzer to use, must have a no argument constructor")
+        .withShortName("a").create();
+    Option helpOpt = obuilder.withLongName("help").withDescription(
+      "Print out help").withShortName("h").create();
+    
+    Group group = gbuilder.withName("Options").withOption(categoriesOpt)
+        .withOption(dirInputPathOpt).withOption(dirOutputPathOpt).withOption(
+          exactMatchOpt).withOption(analyzerOpt).withOption(helpOpt).create();
+    
     Parser parser = new Parser();
     parser.setGroup(group);
     try {
@@ -101,7 +120,7 @@
         CommandLineUtil.printHelp(group);
         return;
       }
-
+      
       String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
       String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);
       String catFile = (String) cmdLine.getValue(categoriesOpt);
@@ -109,11 +128,12 @@
       if (cmdLine.hasOption(analyzerOpt)) {
         String className = cmdLine.getValue(analyzerOpt).toString();
         analyzerClass = (Class<? extends Analyzer>) Class.forName(className);
-        //try instantiating it, b/c there isn't any point in setting it if
-        //you can't instantiate it
+        // try instantiating it, b/c there isn't any point in setting it if
+        // you can't instantiate it
         analyzerClass.newInstance();
       }
-      runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), analyzerClass);
+      runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt),
+        analyzerClass);
     } catch (OptionException e) {
       log.error("Exception", e);
       CommandLineUtil.printHelp(group);
@@ -125,21 +145,30 @@
       log.error("Exception: Couldn't instantiate the class", e);
     }
   }
-
+  
   /**
    * Run the job
-   *
-   * @param input          the input pathname String
-   * @param output         the output pathname String
-   * @param catFile        the file containing the Wikipedia categories
-   * @param exactMatchOnly if true, then the Wikipedia category must match exactly instead of simply containing the category string
+   * 
+   * @param input
+   *          the input pathname String
+   * @param output
+   *          the output pathname String
+   * @param catFile
+   *          the file containing the Wikipedia categories
+   * @param exactMatchOnly
+   *          if true, then the Wikipedia category must match exactly instead of
+   *          simply containing the category string
    */
-  public static void runJob(String input, String output, String catFile,
-                            boolean exactMatchOnly, Class<? extends Analyzer> analyzerClass) throws IOException {
+  public static void runJob(String input,
+                            String output,
+                            String catFile,
+                            boolean exactMatchOnly,
+                            Class<? extends Analyzer> analyzerClass) throws IOException {
     JobClient client = new JobClient();
     JobConf conf = new JobConf(WikipediaDatasetCreatorDriver.class);
     if (log.isInfoEnabled()) {
-      log.info("Input: " + input + " Out: " + output + " Categories: " + catFile);
+      log.info("Input: " + input + " Out: " + output + " Categories: "
+               + catFile);
     }
     conf.set("key.value.separator.in.input.line", " ");
     conf.set("xmlinput.start", "<text xml:space=\"preserve\">");
@@ -154,29 +183,33 @@
     conf.setMapperClass(WikipediaDatasetCreatorMapper.class);
     conf.setNumMapTasks(100);
     conf.setInputFormat(XmlInputFormat.class);
-    //conf.setCombinerClass(WikipediaDatasetCreatorReducer.class);
+    // conf.setCombinerClass(WikipediaDatasetCreatorReducer.class);
     conf.setReducerClass(WikipediaDatasetCreatorReducer.class);
     conf.setOutputFormat(WikipediaDatasetCreatorOutputFormat.class);
-    conf.set("io.serializations",
-            "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
-    // Dont ever forget this. People should keep track of how hadoop conf parameters and make or break a piece of code
-
+    conf
+        .set(
+          "io.serializations",
+          "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
+    // Don't ever forget this. People should keep track of how Hadoop conf
+    // parameters can make or break a piece of code
+    
     FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
     if (dfs.exists(outPath)) {
       dfs.delete(outPath, true);
     }
-
+    
     Set<String> categories = new HashSet<String>();
     for (String line : new FileLineIterable(new File(catFile))) {
       categories.add(line.trim().toLowerCase());
     }
-
-    DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));
-
+    
+    DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(
+        conf, GenericsUtil.getClass(categories));
+    
     String categoriesStr = setStringifier.toString(categories);
-
+    
     conf.set("wikipedia.categories", categoriesStr);
-
+    
     client.setConf(conf);
     JobClient.runJob(conf);
   }
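
The reworked driver above keeps the same entry points, so the intended call pattern is unchanged. As a hedged, hypothetical sketch of how runJob() is meant to be called: the paths and categories file name below are placeholders rather than values from this commit, and WikipediaAnalyzer is simply the analyzer the mapper falls back to when no --analyzer option is given.

import java.io.IOException;

import org.apache.mahout.analysis.WikipediaAnalyzer;
import org.apache.mahout.classifier.bayes.WikipediaDatasetCreatorDriver;

public class WikipediaDatasetCreatorExample {
  public static void main(String[] args) throws IOException {
    // Equivalent to: --input wikipedia/chunks --output wikipedia/dataset
    //                --categories categories.txt (substring category matching)
    WikipediaDatasetCreatorDriver.runJob(
        "wikipedia/chunks",        // input directory of Wikipedia XML chunks (hypothetical path)
        "wikipedia/dataset",       // output directory for the labelled documents (hypothetical path)
        "categories.txt",          // categories file, one entry per line (hypothetical path)
        false,                     // exactMatchOnly: false means substring matching
        WikipediaAnalyzer.class);  // analyzer the mapper uses by default
  }
}

Running main() with the equivalent command-line flags goes through the same code path after the commons-cli2 parsing shown above.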

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java Wed Jan  6 02:46:22 2010
@@ -17,10 +17,16 @@
 
 package org.apache.mahout.classifier.bayes;
 
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Pattern;
+
 import org.apache.commons.lang.StringEscapeUtils;
 import org.apache.hadoop.io.DefaultStringifier;
-import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.MapReduceBase;
 import org.apache.hadoop.mapred.Mapper;
@@ -31,64 +37,72 @@
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.mahout.analysis.WikipediaAnalyzer;
-import org.slf4j.LoggerFactory;
 import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.regex.Pattern;
-
+/**
+ * Maps over the Wikipedia XML format and outputs all documents that have a
+ * category listed in the input category file
+ * 
+ */
 public class WikipediaDatasetCreatorMapper extends MapReduceBase implements
-    Mapper<LongWritable, Text, Text, Text> {
-
-  private static final Logger log = LoggerFactory.getLogger(WikipediaDatasetCreatorMapper.class);
-  private static final Pattern SPACE_NON_ALPHA_PATTERN = Pattern.compile("[\\s\\W]");
-  private static final Pattern OPEN_TEXT_TAG_PATTERN = Pattern.compile("<text xml:space=\"preserve\">");
-  private static final Pattern CLOSE_TEXT_TAG_PATTERN = Pattern.compile("</text>");
-
-  private Set<String> inputCategories = null;
-  private boolean exactMatchOnly = false;
+    Mapper<LongWritable,Text,Text,Text> {
+  
+  private static final Logger log = LoggerFactory
+      .getLogger(WikipediaDatasetCreatorMapper.class);
+  private static final Pattern SPACE_NON_ALPHA_PATTERN = Pattern
+      .compile("[\\s\\W]");
+  private static final Pattern OPEN_TEXT_TAG_PATTERN = Pattern
+      .compile("<text xml:space=\"preserve\">");
+  private static final Pattern CLOSE_TEXT_TAG_PATTERN = Pattern
+      .compile("</text>");
+  
+  private Set<String> inputCategories;
+  private boolean exactMatchOnly;
   private Analyzer analyzer;
-
+  
   @Override
-  public void map(LongWritable key, Text value,
-      OutputCollector<Text, Text> output, Reporter reporter)
-      throws IOException {
-
+  public void map(LongWritable key,
+                  Text value,
+                  OutputCollector<Text,Text> output,
+                  Reporter reporter) throws IOException {
+    
     StringBuilder contents = new StringBuilder();
     String document = value.toString();
     String catMatch = findMatchingCategory(document);
     
-    if(!catMatch.equals("Unknown")){
-      document = StringEscapeUtils.unescapeHtml(CLOSE_TEXT_TAG_PATTERN.matcher(OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll(""));
-      TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
+    if (!catMatch.equals("Unknown")) {
+      document = StringEscapeUtils.unescapeHtml(CLOSE_TEXT_TAG_PATTERN.matcher(
+        OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst(""))
+          .replaceAll(""));
+      TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(
+          document));
       Token token = new Token();
-      while((token = stream.next(token)) != null){
+      while ((token = stream.next(token)) != null) {
         contents.append(token.termBuffer(), 0, token.termLength()).append(' ');
       }
-      output.collect(new Text(SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")), new Text(contents.toString()));
+      output.collect(new Text(SPACE_NON_ALPHA_PATTERN.matcher(catMatch)
+          .replaceAll("_")), new Text(contents.toString()));
     }
   }
-
-  private String findMatchingCategory(String document){
+  
+  private String findMatchingCategory(String document) {
     int startIndex = 0;
     int categoryIndex;
-    while((categoryIndex = document.indexOf("[[Category:", startIndex))!=-1)
-    {
-      categoryIndex+=11;
+    while ((categoryIndex = document.indexOf("[[Category:", startIndex)) != -1) {
+      categoryIndex += 11;
       int endIndex = document.indexOf("]]", categoryIndex);
       if (endIndex >= document.length() || endIndex < 0) {
         break;
       }
-      String category = document.substring(categoryIndex, endIndex).toLowerCase().trim();
-      //categories.add(category.toLowerCase());
-      if (exactMatchOnly && inputCategories.contains(category)){
+      String category = document.substring(categoryIndex, endIndex)
+          .toLowerCase().trim();
+      // categories.add(category.toLowerCase());
+      if (exactMatchOnly && inputCategories.contains(category)) {
         return category;
-      } else if (exactMatchOnly == false){
+      } else if (exactMatchOnly == false) {
         for (String inputCategory : inputCategories) {
-          if (category.contains(inputCategory)){//we have an inexact match
+          if (category.contains(inputCategory)) { // we have an inexact match
             return inputCategory;
           }
         }
@@ -101,24 +115,26 @@
   @Override
   public void configure(JobConf job) {
     try {
-      if (inputCategories == null){
+      if (inputCategories == null) {
         Set<String> newCategories = new HashSet<String>();
-
-        DefaultStringifier<Set<String>> setStringifier =
-            new DefaultStringifier<Set<String>>(job,GenericsUtil.getClass(newCategories));
-
+        
+        DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(
+            job, GenericsUtil.getClass(newCategories));
+        
         String categoriesStr = setStringifier.toString(newCategories);
         categoriesStr = job.get("wikipedia.categories", categoriesStr);
         inputCategories = setStringifier.fromString(categoriesStr);
-
+        
       }
       exactMatchOnly = job.getBoolean("exact.match.only", false);
-      if (analyzer == null){
-        String analyzerStr = job.get("analyzer.class", WikipediaAnalyzer.class.getName());
-        Class<? extends Analyzer> analyzerClass = (Class<? extends Analyzer>) Class.forName(analyzerStr);
+      if (analyzer == null) {
+        String analyzerStr = job.get("analyzer.class", WikipediaAnalyzer.class
+            .getName());
+        Class<? extends Analyzer> analyzerClass = (Class<? extends Analyzer>) Class
+            .forName(analyzerStr);
         analyzer = analyzerClass.newInstance();
       }
-    } catch(IOException ex){
+    } catch (IOException ex) {
       throw new IllegalStateException(ex);
     } catch (ClassNotFoundException e) {
       throw new IllegalStateException(e);
@@ -127,7 +143,8 @@
     } catch (InstantiationException e) {
       throw new IllegalStateException(e);
     }
-    log.info("Configure: Input Categories size: " + inputCategories.size() + " Exact Match: " + exactMatchOnly +
-             " Analyzer: " + analyzer.getClass().getName());
+    log.info("Configure: Input Categories size: " + inputCategories.size()
+             + " Exact Match: " + exactMatchOnly + " Analyzer: "
+             + analyzer.getClass().getName());
   }
 }
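
To make the matching rule above concrete, here is a small, self-contained sketch of the substring variant of findMatchingCategory(); the sample document text and category set are illustrative only, and the offset of 11 is the length of the "[[Category:" marker, as in the mapper.

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class CategoryMatchSketch {
  public static void main(String[] args) {
    Set<String> inputCategories = new HashSet<String>(
        Arrays.asList("machine learning", "statistics"));
    String document = "...[[Category:Machine learning algorithms]]...";

    // Extract the text between "[[Category:" and "]]" (assumes both are present).
    int categoryIndex = document.indexOf("[[Category:") + 11;
    int endIndex = document.indexOf("]]", categoryIndex);
    String category = document.substring(categoryIndex, endIndex).toLowerCase().trim();

    // With exactMatchOnly == false, any input category contained in the
    // extracted category text counts as a match.
    for (String inputCategory : inputCategories) {
      if (category.contains(inputCategory)) {
        System.out.println("matched: " + inputCategory);
      }
    }
  }
}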

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorReducer.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorReducer.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorReducer.java Wed Jan  6 02:46:22 2010
@@ -17,26 +17,31 @@
 
 package org.apache.mahout.classifier.bayes;
 
+import java.io.IOException;
+import java.util.Iterator;
+
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.MapReduceBase;
 import org.apache.hadoop.mapred.OutputCollector;
 import org.apache.hadoop.mapred.Reducer;
 import org.apache.hadoop.mapred.Reporter;
 
-import java.io.IOException;
-import java.util.Iterator;
-
 /**
- *  Can also be used as a local Combiner
+ * Can also be used as a local Combiner
  */
-public class WikipediaDatasetCreatorReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
+public class WikipediaDatasetCreatorReducer extends MapReduceBase implements
+    Reducer<Text,Text,Text,Text> {
   @Override
-  public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
-    //Key is label,word, value is the number of times we've seen this label word per local node.  Output is the same
-
+  public void reduce(Text key,
+                     Iterator<Text> values,
+                     OutputCollector<Text,Text> output,
+                     Reporter reporter) throws IOException {
+    // Key is label,word; value is the number of times we've seen this
+    // label,word pair per local node. Output is the same
+    
     while (values.hasNext()) {
       output.collect(key, values.next());
     }
-   
+    
   }
 }

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java Wed Jan  6 02:46:22 2010
@@ -17,95 +17,121 @@
 
 package org.apache.mahout.classifier.bayes;
 
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.mahout.common.FileLineIterator;
-
 import java.io.BufferedWriter;
 import java.io.File;
 import java.io.FileOutputStream;
-import java.io.OutputStreamWriter;
 import java.io.IOException;
+import java.io.OutputStreamWriter;
 import java.text.DecimalFormat;
 import java.text.NumberFormat;
 
-public class WikipediaXmlSplitter {
-  private WikipediaXmlSplitter() {
-  }
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.mahout.common.FileLineIterator;
 
+/**
+ * Splits the Wikipedia XML file into chunks of the size specified by the
+ * command-line parameter
+ * 
+ */
+public final class WikipediaXmlSplitter {
+  private WikipediaXmlSplitter() { }
+  
   public static void main(String[] args) throws IOException, OptionException {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
-
-    Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true).withArgument(
-            abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create()).
-            withDescription("The path to the wikipedia dump file").withShortName("d").create();
-
-    Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true).withArgument(
-            abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create()).
-            withDescription("The output directory to place the splits in").withShortName("o").create();
-
-    Option chunkSizeOpt = obuilder.withLongName("chunkSize").withRequired(true).withArgument(
-            abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create()).
-            withDescription("The Size of the chunk, in megabytes").withShortName("c").create();
-    Option numChunksOpt = obuilder.withLongName("numChunks").withRequired(false).withArgument(
-            abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create()).
-            withDescription("The maximum number of chunks to create.  If specified, program will only create a subset of the chunks").withShortName("n").create();
-    Group group = gbuilder.withName("Options").withOption(dumpFileOpt).withOption(outputDirOpt).withOption(chunkSizeOpt).withOption(numChunksOpt).create();
-
+    
+    Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true)
+        .withArgument(
+          abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create())
+        .withDescription("The path to the wikipedia dump file").withShortName(
+          "d").create();
+    
+    Option outputDirOpt = obuilder
+        .withLongName("outputDir")
+        .withRequired(true)
+        .withArgument(
+          abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
+        .withDescription("The output directory to place the splits in")
+        .withShortName("o").create();
+    
+    Option chunkSizeOpt = obuilder
+        .withLongName("chunkSize")
+        .withRequired(true)
+        .withArgument(
+          abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
+        .withDescription("The Size of the chunk, in megabytes").withShortName(
+          "c").create();
+    Option numChunksOpt = obuilder
+        .withLongName("numChunks")
+        .withRequired(false)
+        .withArgument(
+          abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create())
+        .withDescription(
+          "The maximum number of chunks to create.  If specified, program will only create a subset of the chunks")
+        .withShortName("n").create();
+    Group group = gbuilder.withName("Options").withOption(dumpFileOpt)
+        .withOption(outputDirOpt).withOption(chunkSizeOpt).withOption(
+          numChunksOpt).create();
+    
     Parser parser = new Parser();
     parser.setGroup(group);
     CommandLine cmdLine = parser.parse(args);
-
+    
     String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt);
     String outputDirPath = (String) cmdLine.getValue(outputDirOpt);
-
-    int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
-
+    
+    int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine
+        .getValue(chunkSizeOpt));
+    
     int numChunks = Integer.MAX_VALUE;
-    if (cmdLine.hasOption(numChunksOpt)){
+    if (cmdLine.hasOption(numChunksOpt)) {
       numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt));
     }
-
+    
     File dir = new File(outputDirPath);
     dir.mkdirs();
-    String header =
-          "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ http://www.mediawiki.org/xml/export-0.3.xsd\" version=\"0.3\" xml:lang=\"en\">\n"
-        + "  <siteinfo>\n" + "<sitename>Wikipedia</sitename>\n"
-        + "    <base>http://en.wikipedia.org/wiki/Main_Page</base>\n"
-        + "    <generator>MediaWiki 1.13alpha</generator>\n"
-        + "    <case>first-letter</case>\n"
-        + "    <namespaces>\n"
-        + "      <namespace key=\"-2\">Media</namespace>\n"
-        + "      <namespace key=\"-1\">Special</namespace>\n"
-        + "      <namespace key=\"0\" />\n"
-        + "      <namespace key=\"1\">Talk</namespace>\n"
-        + "      <namespace key=\"2\">User</namespace>\n"
-        + "      <namespace key=\"3\">User talk</namespace>\n"
-        + "      <namespace key=\"4\">Wikipedia</namespace>\n"
-        + "      <namespace key=\"5\">Wikipedia talk</namespace>\n"
-        + "      <namespace key=\"6\">Image</namespace>\n"
-        + "      <namespace key=\"7\">Image talk</namespace>\n"
-        + "      <namespace key=\"8\">MediaWiki</namespace>\n"
-        + "      <namespace key=\"9\">MediaWiki talk</namespace>\n"
-        + "      <namespace key=\"10\">Template</namespace>\n"
-        + "      <namespace key=\"11\">Template talk</namespace>\n"
-        + "      <namespace key=\"12\">Help</namespace>\n"
-        + "      <namespace key=\"13\">Help talk</namespace>\n"
-        + "      <namespace key=\"14\">Category</namespace>\n"
-        + "      <namespace key=\"15\">Category talk</namespace>\n"
-        + "      <namespace key=\"100\">Portal</namespace>\n"
-        + "      <namespace key=\"101\">Portal talk</namespace>\n"
-        + "    </namespaces>\n"
-        + "  </siteinfo>\n";
-
+    String header = "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" "
+                    + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+                    + "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ "
+                    + "http://www.mediawiki.org/xml/export-0.3.xsd\" "
+                    + "version=\"0.3\" "
+                    + "xml:lang=\"en\">\n"
+                    + "  <siteinfo>\n"
+                    + "<sitename>Wikipedia</sitename>\n"
+                    + "    <base>http://en.wikipedia.org/wiki/Main_Page</base>\n"
+                    + "    <generator>MediaWiki 1.13alpha</generator>\n"
+                    + "    <case>first-letter</case>\n"
+                    + "    <namespaces>\n"
+                    + "      <namespace key=\"-2\">Media</namespace>\n"
+                    + "      <namespace key=\"-1\">Special</namespace>\n"
+                    + "      <namespace key=\"0\" />\n"
+                    + "      <namespace key=\"1\">Talk</namespace>\n"
+                    + "      <namespace key=\"2\">User</namespace>\n"
+                    + "      <namespace key=\"3\">User talk</namespace>\n"
+                    + "      <namespace key=\"4\">Wikipedia</namespace>\n"
+                    + "      <namespace key=\"5\">Wikipedia talk</namespace>\n"
+                    + "      <namespace key=\"6\">Image</namespace>\n"
+                    + "      <namespace key=\"7\">Image talk</namespace>\n"
+                    + "      <namespace key=\"8\">MediaWiki</namespace>\n"
+                    + "      <namespace key=\"9\">MediaWiki talk</namespace>\n"
+                    + "      <namespace key=\"10\">Template</namespace>\n"
+                    + "      <namespace key=\"11\">Template talk</namespace>\n"
+                    + "      <namespace key=\"12\">Help</namespace>\n"
+                    + "      <namespace key=\"13\">Help talk</namespace>\n"
+                    + "      <namespace key=\"14\">Category</namespace>\n"
+                    + "      <namespace key=\"15\">Category talk</namespace>\n"
+                    + "      <namespace key=\"100\">Portal</namespace>\n"
+                    + "      <namespace key=\"101\">Portal talk</namespace>\n"
+                    + "    </namespaces>\n" + "  </siteinfo>\n";
+    
     StringBuilder content = new StringBuilder();
     content.append(header);
     int filenumber = 0;
@@ -113,9 +139,9 @@
     FileLineIterator it = new FileLineIterator(new File(dumpFilePath));
     while (it.hasNext()) {
       String thisLine = it.next();
-      if(thisLine.trim().startsWith("<page>")){
+      if (thisLine.trim().startsWith("<page>")) {
         boolean end = false;
-        while(thisLine.trim().startsWith("</page>") == false){
+        while (thisLine.trim().startsWith("</page>") == false) {
           content.append(thisLine).append('\n');
           if (it.hasNext()) {
             thisLine = it.next();
@@ -125,17 +151,19 @@
           }
         }
         content.append(thisLine).append('\n');
-
-        if(content.length()>chunkSize || end){
+        
+        if (content.length() > chunkSize || end) {
           content.append("</mediawiki>");
           filenumber++;
-
-          BufferedWriter chunkWriter = new BufferedWriter(new OutputStreamWriter(
-              new FileOutputStream(dir.getPath()+"/chunk-"+ decimalFormatter.format(filenumber)+".xml"), "UTF-8"));
-
+          
+          BufferedWriter chunkWriter = new BufferedWriter(
+              new OutputStreamWriter(new FileOutputStream(
+                  dir.getPath() + "/chunk-"
+                      + decimalFormatter.format(filenumber) + ".xml"), "UTF-8"));
+          
           chunkWriter.write(content.toString(), 0, content.length());
           chunkWriter.close();
-          if (filenumber >= numChunks){
+          if (filenumber >= numChunks) {
             break;
           }
           content = new StringBuilder();
@@ -143,6 +171,6 @@
         }
       }
     }
-
+    
   }
 }
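
The splitter is driven entirely by its command-line options (dumpFile, outputDir, chunkSize, numChunks). A hedged, hypothetical invocation through main() might look like the sketch below; the dump path, output directory, and sizes are placeholders, and "--" is assumed to be the commons-cli2 default long-option prefix used here.

import java.io.IOException;

import org.apache.commons.cli2.OptionException;
import org.apache.mahout.classifier.bayes.WikipediaXmlSplitter;

public class SplitterSketch {
  public static void main(String[] args) throws IOException, OptionException {
    WikipediaXmlSplitter.main(new String[] {
        "--dumpFile", "enwiki-pages-articles.xml",  // full Wikipedia dump (hypothetical path)
        "--outputDir", "wikipedia/chunks",          // where the chunk-<n>.xml files are written
        "--chunkSize", "64",                        // chunk size in megabytes
        "--numChunks", "10"                         // optional: stop after 10 chunks
    });
  }
}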

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/XmlInputFormat.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/XmlInputFormat.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/XmlInputFormat.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/XmlInputFormat.java Wed Jan  6 02:46:22 2010
@@ -17,12 +17,14 @@
 
 package org.apache.mahout.classifier.bayes;
 
+import java.io.IOException;
+
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.DataOutputBuffer;
-import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapred.FileSplit;
 import org.apache.hadoop.mapred.InputSplit;
 import org.apache.hadoop.mapred.JobConf;
@@ -30,34 +32,39 @@
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.hadoop.mapred.TextInputFormat;
 
-import java.io.IOException;
-
 /**
 * Reads records that are delimited by a specific begin/end tag.
  */
 public class XmlInputFormat extends TextInputFormat {
-
+  
   public static final String START_TAG_KEY = "xmlinput.start";
   public static final String END_TAG_KEY = "xmlinput.end";
-
+  
   @Override
-  public RecordReader<LongWritable, Text> getRecordReader(InputSplit inputSplit, JobConf jobConf, Reporter reporter)
-      throws IOException {
+  public RecordReader<LongWritable,Text> getRecordReader(InputSplit inputSplit,
+                                                         JobConf jobConf,
+                                                         Reporter reporter) throws IOException {
     return new XmlRecordReader((FileSplit) inputSplit, jobConf);
   }
-
-  public static class XmlRecordReader implements RecordReader<LongWritable, Text> {
+  
+  /**
+   * XmlRecordReader class that reads through a given XML document and outputs
+   * XML blocks as records, delimited by the specified start and end tags
+   * 
+   */
+  public static class XmlRecordReader implements
+      RecordReader<LongWritable,Text> {
     private final byte[] startTag;
     private final byte[] endTag;
     private final long start;
     private final long end;
     private final FSDataInputStream fsin;
     private final DataOutputBuffer buffer = new DataOutputBuffer();
-
+    
     public XmlRecordReader(FileSplit split, JobConf jobConf) throws IOException {
       startTag = jobConf.get(START_TAG_KEY).getBytes("utf-8");
       endTag = jobConf.get(END_TAG_KEY).getBytes("utf-8");
-
+      
       // open the file and seek to the start of the split
       start = split.getStart();
       end = start + split.getLength();
@@ -66,7 +73,7 @@
       fsin = fs.open(split.getPath());
       fsin.seek(start);
     }
-
+    
     @Override
     public boolean next(LongWritable key, Text value) throws IOException {
       if (fsin.getPos() < end) {
@@ -85,32 +92,32 @@
       }
       return false;
     }
-
+    
     @Override
     public LongWritable createKey() {
       return new LongWritable();
     }
-
+    
     @Override
     public Text createValue() {
       return new Text();
     }
-
+    
     @Override
     public long getPos() throws IOException {
       return fsin.getPos();
     }
-
+    
     @Override
     public void close() throws IOException {
       fsin.close();
     }
-
+    
     @Override
     public float getProgress() throws IOException {
-      return ((fsin.getPos() - start) / (float) (end - start));
+      return (fsin.getPos() - start) / (float) (end - start);
     }
-
+    
     private boolean readUntilMatch(byte[] match, boolean withinBlock) throws IOException {
       int i = 0;
       while (true) {
@@ -119,7 +126,7 @@
         if (b == -1) return false;
         // save to buffer:
         if (withinBlock) buffer.write(b);
-
+        
         // check if we're matching:
         if (b == match[i]) {
           i++;
@@ -130,4 +137,4 @@
       }
     }
   }
-} 
\ No newline at end of file
+}
\ No newline at end of file
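
For reference, XmlInputFormat is wired into a job through the two JobConf keys it defines, much as the driver above does (by key name) for Wikipedia <text> elements. A minimal sketch, assuming "</text>" as the end-tag value, which is the tag the mapper strips back out:

import org.apache.hadoop.mapred.JobConf;

import org.apache.mahout.classifier.bayes.XmlInputFormat;

public class XmlInputFormatSketch {
  public static void main(String[] args) {
    JobConf conf = new JobConf(XmlInputFormatSketch.class);
    conf.set(XmlInputFormat.START_TAG_KEY, "<text xml:space=\"preserve\">");  // xmlinput.start
    conf.set(XmlInputFormat.END_TAG_KEY, "</text>");                          // xmlinput.end (assumed value)
    conf.setInputFormat(XmlInputFormat.class);
    // Each map() call then receives one <text>...</text> block as its value.
  }
}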