Posted to commits@mahout.apache.org by ro...@apache.org on 2010/01/06 03:46:23 UTC
svn commit: r896311 [4/4] - in /lucene/mahout/trunk:
core/src/main/java/org/apache/mahout/classifier/
core/src/main/java/org/apache/mahout/classifier/bayes/algorithm/
core/src/main/java/org/apache/mahout/classifier/bayes/common/
core/src/main/java/org/...
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorDriver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorDriver.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorDriver.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorDriver.java Wed Jan 6 02:46:22 2010
@@ -17,6 +17,11 @@
package org.apache.mahout.classifier.bayes;
+import java.io.File;
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
@@ -41,58 +46,72 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.IOException;
-import java.io.File;
-import java.util.HashSet;
-import java.util.Set;
-
/**
* Create and run the Wikipedia Dataset Creator.
*/
-public class WikipediaDatasetCreatorDriver {
- private static final Logger log = LoggerFactory.getLogger(WikipediaDatasetCreatorDriver.class);
-
- private WikipediaDatasetCreatorDriver() {
- }
-
+public final class WikipediaDatasetCreatorDriver {
+ private static final Logger log = LoggerFactory
+ .getLogger(WikipediaDatasetCreatorDriver.class);
+
+ private WikipediaDatasetCreatorDriver() { }
+
/**
* Takes in two arguments:
* <ol>
- * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents live</li>
+ * <li>The input {@link org.apache.hadoop.fs.Path} where the input documents
+ * live</li>
* <li>The output {@link org.apache.hadoop.fs.Path} where to write the
* classifier as a {@link org.apache.hadoop.io.SequenceFile}</li>
* </ol>
- *
- * @param args The args
+ *
+ * @param args
+ * The args
*/
public static void main(String[] args) throws IOException {
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
-
- Option dirInputPathOpt = obuilder.withLongName("input").withRequired(true).withArgument(
- abuilder.withName("input").withMinimum(1).withMaximum(1).create()).
- withDescription("The input directory path").withShortName("i").create();
-
- Option dirOutputPathOpt = obuilder.withLongName("output").withRequired(true).withArgument(
- abuilder.withName("output").withMinimum(1).withMaximum(1).create()).
- withDescription("The output directory Path").withShortName("o").create();
-
- Option categoriesOpt = obuilder.withLongName("categories").withRequired(true).withArgument(
- abuilder.withName("categories").withMinimum(1).withMaximum(1).create()).
- withDescription("Location of the categories file. One entry per line. Will be used to make a string match in Wikipedia Category field").withShortName("c").create();
-
- Option exactMatchOpt = obuilder.withLongName("exactMatch").
- withDescription("If set, then the category name must exactly match the entry in the categories file. Default is false").withShortName("e").create();
- Option analyzerOpt = obuilder.withLongName("analyzer").withRequired(false).withArgument(
- abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create()).
- withDescription("The analyzer to use, must have a no argument constructor").withShortName("a").create();
- Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h").create();
-
- Group group = gbuilder.withName("Options").withOption(categoriesOpt).withOption(dirInputPathOpt).withOption(dirOutputPathOpt)
- .withOption(exactMatchOpt).withOption(analyzerOpt)
- .withOption(helpOpt).create();
-
+
+ Option dirInputPathOpt = obuilder.withLongName("input").withRequired(true)
+ .withArgument(
+ abuilder.withName("input").withMinimum(1).withMaximum(1).create())
+ .withDescription("The input directory path").withShortName("i")
+ .create();
+
+ Option dirOutputPathOpt = obuilder.withLongName("output")
+ .withRequired(true).withArgument(
+ abuilder.withName("output").withMinimum(1).withMaximum(1).create())
+ .withDescription("The output directory Path").withShortName("o")
+ .create();
+
+ Option categoriesOpt = obuilder
+ .withLongName("categories")
+ .withRequired(true)
+ .withArgument(
+ abuilder.withName("categories").withMinimum(1).withMaximum(1)
+ .create())
+ .withDescription(
+ "Location of the categories file. One entry per line. "
+ + "Will be used to make a string match in Wikipedia Category field")
+ .withShortName("c").create();
+
+ Option exactMatchOpt = obuilder.withLongName("exactMatch").withDescription(
+ "If set, then the category name must exactly match the "
+ + "entry in the categories file. Default is false")
+ .withShortName("e").create();
+ Option analyzerOpt = obuilder.withLongName("analyzer").withRequired(false)
+ .withArgument(
+ abuilder.withName("analyzer").withMinimum(1).withMaximum(1).create())
+ .withDescription(
+ "The analyzer to use, must have a no argument constructor")
+ .withShortName("a").create();
+ Option helpOpt = obuilder.withLongName("help").withDescription(
+ "Print out help").withShortName("h").create();
+
+ Group group = gbuilder.withName("Options").withOption(categoriesOpt)
+ .withOption(dirInputPathOpt).withOption(dirOutputPathOpt).withOption(
+ exactMatchOpt).withOption(analyzerOpt).withOption(helpOpt).create();
+
Parser parser = new Parser();
parser.setGroup(group);
try {
@@ -101,7 +120,7 @@
CommandLineUtil.printHelp(group);
return;
}
-
+
String inputPath = (String) cmdLine.getValue(dirInputPathOpt);
String outputPath = (String) cmdLine.getValue(dirOutputPathOpt);
String catFile = (String) cmdLine.getValue(categoriesOpt);
@@ -109,11 +128,12 @@
if (cmdLine.hasOption(analyzerOpt)) {
String className = cmdLine.getValue(analyzerOpt).toString();
analyzerClass = (Class<? extends Analyzer>) Class.forName(className);
- //try instantiating it, b/c there isn't any point in setting it if
- //you can't instantiate it
+ // try instantiating it, b/c there isn't any point in setting it if
+ // you can't instantiate it
analyzerClass.newInstance();
}
- runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), analyzerClass);
+ runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt),
+ analyzerClass);
} catch (OptionException e) {
log.error("Exception", e);
CommandLineUtil.printHelp(group);
@@ -125,21 +145,30 @@
log.error("Exception: Couldn't instantiate the class", e);
}
}
-
+
/**
* Run the job
- *
- * @param input the input pathname String
- * @param output the output pathname String
- * @param catFile the file containing the Wikipedia categories
- * @param exactMatchOnly if true, then the Wikipedia category must match exactly instead of simply containing the category string
+ *
+ * @param input
+ * the input pathname String
+ * @param output
+ * the output pathname String
+ * @param catFile
+ * the file containing the Wikipedia categories
+ * @param exactMatchOnly
+ * if true, then the Wikipedia category must match exactly instead of
+ * simply containing the category string
*/
- public static void runJob(String input, String output, String catFile,
- boolean exactMatchOnly, Class<? extends Analyzer> analyzerClass) throws IOException {
+ public static void runJob(String input,
+ String output,
+ String catFile,
+ boolean exactMatchOnly,
+ Class<? extends Analyzer> analyzerClass) throws IOException {
JobClient client = new JobClient();
JobConf conf = new JobConf(WikipediaDatasetCreatorDriver.class);
if (log.isInfoEnabled()) {
- log.info("Input: " + input + " Out: " + output + " Categories: " + catFile);
+ log.info("Input: " + input + " Out: " + output + " Categories: "
+ + catFile);
}
conf.set("key.value.separator.in.input.line", " ");
conf.set("xmlinput.start", "<text xml:space=\"preserve\">");
@@ -154,29 +183,33 @@
conf.setMapperClass(WikipediaDatasetCreatorMapper.class);
conf.setNumMapTasks(100);
conf.setInputFormat(XmlInputFormat.class);
- //conf.setCombinerClass(WikipediaDatasetCreatorReducer.class);
+ // conf.setCombinerClass(WikipediaDatasetCreatorReducer.class);
conf.setReducerClass(WikipediaDatasetCreatorReducer.class);
conf.setOutputFormat(WikipediaDatasetCreatorOutputFormat.class);
- conf.set("io.serializations",
- "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
- // Dont ever forget this. People should keep track of how hadoop conf parameters and make or break a piece of code
-
+ conf
+ .set(
+ "io.serializations",
+ "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization");
+ // Don't ever forget this. Hadoop conf parameters can make or break a
+ // piece of code.
+
FileSystem dfs = FileSystem.get(outPath.toUri(), conf);
if (dfs.exists(outPath)) {
dfs.delete(outPath, true);
}
-
+
Set<String> categories = new HashSet<String>();
for (String line : new FileLineIterable(new File(catFile))) {
categories.add(line.trim().toLowerCase());
}
-
- DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));
-
+
+ DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(
+ conf, GenericsUtil.getClass(categories));
+
String categoriesStr = setStringifier.toString(categories);
-
+
conf.set("wikipedia.categories", categoriesStr);
-
+
client.setConf(conf);
JobClient.runJob(conf);
}
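
The driver above hands the category set to the mappers through the job configuration using Hadoop's DefaultStringifier, which is why it also registers JavaSerialization in io.serializations. A minimal round-trip sketch of that pattern, assuming the same Hadoop 0.20-era mapred API used in this commit (the demo class name is hypothetical):

    import java.util.HashSet;
    import java.util.Set;

    import org.apache.hadoop.io.DefaultStringifier;
    import org.apache.hadoop.mapred.JobConf;
    import org.apache.hadoop.util.GenericsUtil;

    public final class CategoryConfRoundTrip {
      private CategoryConfRoundTrip() { }

      public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf();
        // JavaSerialization is needed so a plain HashSet can be stringified.
        conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,"
                + "org.apache.hadoop.io.serializer.WritableSerialization");

        Set<String> categories = new HashSet<String>();
        categories.add("science");

        // Driver side: serialize the set into a String and stash it in the conf.
        DefaultStringifier<Set<String>> stringifier =
            new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));
        conf.set("wikipedia.categories", stringifier.toString(categories));

        // Mapper side: configure() reverses the trip.
        Set<String> restored =
            stringifier.fromString(conf.get("wikipedia.categories"));
        System.out.println(restored);
      }
    }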
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java Wed Jan 6 02:46:22 2010
@@ -17,10 +17,16 @@
package org.apache.mahout.classifier.bayes;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Pattern;
+
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.hadoop.io.DefaultStringifier;
-import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
@@ -31,64 +37,72 @@
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.mahout.analysis.WikipediaAnalyzer;
-import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.regex.Pattern;
-
+/**
+ * Maps over the Wikipedia XML format and outputs every document that has a
+ * category listed in the input category file.
+ */
public class WikipediaDatasetCreatorMapper extends MapReduceBase implements
- Mapper<LongWritable, Text, Text, Text> {
-
- private static final Logger log = LoggerFactory.getLogger(WikipediaDatasetCreatorMapper.class);
- private static final Pattern SPACE_NON_ALPHA_PATTERN = Pattern.compile("[\\s\\W]");
- private static final Pattern OPEN_TEXT_TAG_PATTERN = Pattern.compile("<text xml:space=\"preserve\">");
- private static final Pattern CLOSE_TEXT_TAG_PATTERN = Pattern.compile("</text>");
-
- private Set<String> inputCategories = null;
- private boolean exactMatchOnly = false;
+ Mapper<LongWritable,Text,Text,Text> {
+
+ private static final Logger log = LoggerFactory
+ .getLogger(WikipediaDatasetCreatorMapper.class);
+ private static final Pattern SPACE_NON_ALPHA_PATTERN = Pattern
+ .compile("[\\s\\W]");
+ private static final Pattern OPEN_TEXT_TAG_PATTERN = Pattern
+ .compile("<text xml:space=\"preserve\">");
+ private static final Pattern CLOSE_TEXT_TAG_PATTERN = Pattern
+ .compile("</text>");
+
+ private Set<String> inputCategories;
+ private boolean exactMatchOnly;
private Analyzer analyzer;
-
+
@Override
- public void map(LongWritable key, Text value,
- OutputCollector<Text, Text> output, Reporter reporter)
- throws IOException {
-
+ public void map(LongWritable key,
+ Text value,
+ OutputCollector<Text,Text> output,
+ Reporter reporter) throws IOException {
+
StringBuilder contents = new StringBuilder();
String document = value.toString();
String catMatch = findMatchingCategory(document);
- if(!catMatch.equals("Unknown")){
- document = StringEscapeUtils.unescapeHtml(CLOSE_TEXT_TAG_PATTERN.matcher(OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll(""));
- TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
+ if (!catMatch.equals("Unknown")) {
+ document = StringEscapeUtils.unescapeHtml(CLOSE_TEXT_TAG_PATTERN.matcher(
+ OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst(""))
+ .replaceAll(""));
+ TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(
+ document));
Token token = new Token();
- while((token = stream.next(token)) != null){
+ while ((token = stream.next(token)) != null) {
contents.append(token.termBuffer(), 0, token.termLength()).append(' ');
}
- output.collect(new Text(SPACE_NON_ALPHA_PATTERN.matcher(catMatch).replaceAll("_")), new Text(contents.toString()));
+ output.collect(new Text(SPACE_NON_ALPHA_PATTERN.matcher(catMatch)
+ .replaceAll("_")), new Text(contents.toString()));
}
}
-
- private String findMatchingCategory(String document){
+
+ private String findMatchingCategory(String document) {
int startIndex = 0;
int categoryIndex;
- while((categoryIndex = document.indexOf("[[Category:", startIndex))!=-1)
- {
- categoryIndex+=11;
+ while ((categoryIndex = document.indexOf("[[Category:", startIndex)) != -1) {
+ categoryIndex += 11;
int endIndex = document.indexOf("]]", categoryIndex);
if (endIndex >= document.length() || endIndex < 0) {
break;
}
- String category = document.substring(categoryIndex, endIndex).toLowerCase().trim();
- //categories.add(category.toLowerCase());
- if (exactMatchOnly && inputCategories.contains(category)){
+ String category = document.substring(categoryIndex, endIndex)
+ .toLowerCase().trim();
+ // categories.add(category.toLowerCase());
+ if (exactMatchOnly && inputCategories.contains(category)) {
return category;
- } else if (exactMatchOnly == false){
+ } else if (exactMatchOnly == false) {
for (String inputCategory : inputCategories) {
- if (category.contains(inputCategory)){//we have an inexact match
+ if (category.contains(inputCategory)) { // we have an inexact match
return inputCategory;
}
}
@@ -101,24 +115,26 @@
@Override
public void configure(JobConf job) {
try {
- if (inputCategories == null){
+ if (inputCategories == null) {
Set<String> newCategories = new HashSet<String>();
-
- DefaultStringifier<Set<String>> setStringifier =
- new DefaultStringifier<Set<String>>(job,GenericsUtil.getClass(newCategories));
-
+
+ DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(
+ job, GenericsUtil.getClass(newCategories));
+
String categoriesStr = setStringifier.toString(newCategories);
categoriesStr = job.get("wikipedia.categories", categoriesStr);
inputCategories = setStringifier.fromString(categoriesStr);
-
+
}
exactMatchOnly = job.getBoolean("exact.match.only", false);
- if (analyzer == null){
- String analyzerStr = job.get("analyzer.class", WikipediaAnalyzer.class.getName());
- Class<? extends Analyzer> analyzerClass = (Class<? extends Analyzer>) Class.forName(analyzerStr);
+ if (analyzer == null) {
+ String analyzerStr = job.get("analyzer.class", WikipediaAnalyzer.class
+ .getName());
+ Class<? extends Analyzer> analyzerClass = (Class<? extends Analyzer>) Class
+ .forName(analyzerStr);
analyzer = analyzerClass.newInstance();
}
- } catch(IOException ex){
+ } catch (IOException ex) {
throw new IllegalStateException(ex);
} catch (ClassNotFoundException e) {
throw new IllegalStateException(e);
@@ -127,7 +143,8 @@
} catch (InstantiationException e) {
throw new IllegalStateException(e);
}
- log.info("Configure: Input Categories size: " + inputCategories.size() + " Exact Match: " + exactMatchOnly +
- " Analyzer: " + analyzer.getClass().getName());
+ log.info("Configure: Input Categories size: " + inputCategories.size()
+ + " Exact Match: " + exactMatchOnly + " Analyzer: "
+ + analyzer.getClass().getName());
}
}
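
The mapper's category lookup walks the raw wikitext for "[[Category:...]]" markers, honoring the exact/inexact switch set by the driver. A standalone sketch of that matching logic under the same rules (the class and method placement here are chosen for illustration):

    import java.util.Set;

    final class CategoryMatcher {
      private CategoryMatcher() { }

      /** Returns the first matching category, or "Unknown" if none matches. */
      static String findMatchingCategory(String document,
          Set<String> inputCategories, boolean exactMatchOnly) {
        int startIndex = 0;
        int categoryIndex;
        while ((categoryIndex = document.indexOf("[[Category:", startIndex)) != -1) {
          categoryIndex += "[[Category:".length();
          int endIndex = document.indexOf("]]", categoryIndex);
          if (endIndex < 0) {
            break; // unterminated category link
          }
          String category =
              document.substring(categoryIndex, endIndex).toLowerCase().trim();
          if (exactMatchOnly && inputCategories.contains(category)) {
            return category;
          }
          if (!exactMatchOnly) {
            for (String inputCategory : inputCategories) {
              if (category.contains(inputCategory)) { // we have an inexact match
                return inputCategory;
              }
            }
          }
          startIndex = endIndex;
        }
        return "Unknown";
      }
    }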
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorReducer.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorReducer.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorReducer.java Wed Jan 6 02:46:22 2010
@@ -17,26 +17,31 @@
package org.apache.mahout.classifier.bayes;
+import java.io.IOException;
+import java.util.Iterator;
+
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
-import java.io.IOException;
-import java.util.Iterator;
-
/**
- * Can also be used as a local Combiner
+ * Can also be used as a local Combiner
*/
-public class WikipediaDatasetCreatorReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text> {
+public class WikipediaDatasetCreatorReducer extends MapReduceBase implements
+ Reducer<Text,Text,Text,Text> {
@Override
- public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException {
- //Key is label,word, value is the number of times we've seen this label word per local node. Output is the same
-
+ public void reduce(Text key,
+ Iterator<Text> values,
+ OutputCollector<Text,Text> output,
+ Reporter reporter) throws IOException {
+ // Key is (label, word); value is the number of times we've seen this
+ // label/word pair on the local node. The output is the same.
+
while (values.hasNext()) {
output.collect(key, values.next());
}
-
+
}
}
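
Because the reducer only forwards each (key, value) pair unchanged, the class can also be registered as a local combiner, as its javadoc notes; the driver above currently leaves that wiring commented out. A sketch of enabling it (the demo class name is hypothetical):

    import org.apache.hadoop.mapred.JobConf;
    import org.apache.mahout.classifier.bayes.WikipediaDatasetCreatorReducer;

    public final class CombinerWiringDemo {
      private CombinerWiringDemo() { }

      public static void main(String[] args) {
        JobConf conf = new JobConf(CombinerWiringDemo.class);
        // The pass-through reducer doubles as a local combiner.
        conf.setCombinerClass(WikipediaDatasetCreatorReducer.class);
        conf.setReducerClass(WikipediaDatasetCreatorReducer.class);
      }
    }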
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java Wed Jan 6 02:46:22 2010
@@ -17,95 +17,121 @@
package org.apache.mahout.classifier.bayes;
-import org.apache.commons.cli2.Option;
-import org.apache.commons.cli2.CommandLine;
-import org.apache.commons.cli2.Group;
-import org.apache.commons.cli2.OptionException;
-import org.apache.commons.cli2.commandline.Parser;
-import org.apache.commons.cli2.builder.DefaultOptionBuilder;
-import org.apache.commons.cli2.builder.ArgumentBuilder;
-import org.apache.commons.cli2.builder.GroupBuilder;
-import org.apache.mahout.common.FileLineIterator;
-
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
-import java.io.OutputStreamWriter;
import java.io.IOException;
+import java.io.OutputStreamWriter;
import java.text.DecimalFormat;
import java.text.NumberFormat;
-public class WikipediaXmlSplitter {
- private WikipediaXmlSplitter() {
- }
+import org.apache.commons.cli2.CommandLine;
+import org.apache.commons.cli2.Group;
+import org.apache.commons.cli2.Option;
+import org.apache.commons.cli2.OptionException;
+import org.apache.commons.cli2.builder.ArgumentBuilder;
+import org.apache.commons.cli2.builder.DefaultOptionBuilder;
+import org.apache.commons.cli2.builder.GroupBuilder;
+import org.apache.commons.cli2.commandline.Parser;
+import org.apache.mahout.common.FileLineIterator;
+/**
+ * Splits the Wikipedia XML file into chunks of the size specified by a
+ * command-line parameter.
+ */
+public final class WikipediaXmlSplitter {
+ private WikipediaXmlSplitter() { }
+
public static void main(String[] args) throws IOException, OptionException {
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
-
- Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true).withArgument(
- abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create()).
- withDescription("The path to the wikipedia dump file").withShortName("d").create();
-
- Option outputDirOpt = obuilder.withLongName("outputDir").withRequired(true).withArgument(
- abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create()).
- withDescription("The output directory to place the splits in").withShortName("o").create();
-
- Option chunkSizeOpt = obuilder.withLongName("chunkSize").withRequired(true).withArgument(
- abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create()).
- withDescription("The Size of the chunk, in megabytes").withShortName("c").create();
- Option numChunksOpt = obuilder.withLongName("numChunks").withRequired(false).withArgument(
- abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create()).
- withDescription("The maximum number of chunks to create. If specified, program will only create a subset of the chunks").withShortName("n").create();
- Group group = gbuilder.withName("Options").withOption(dumpFileOpt).withOption(outputDirOpt).withOption(chunkSizeOpt).withOption(numChunksOpt).create();
-
+
+ Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true)
+ .withArgument(
+ abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create())
+ .withDescription("The path to the wikipedia dump file").withShortName(
+ "d").create();
+
+ Option outputDirOpt = obuilder
+ .withLongName("outputDir")
+ .withRequired(true)
+ .withArgument(
+ abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
+ .withDescription("The output directory to place the splits in")
+ .withShortName("o").create();
+
+ Option chunkSizeOpt = obuilder
+ .withLongName("chunkSize")
+ .withRequired(true)
+ .withArgument(
+ abuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create())
+ .withDescription("The Size of the chunk, in megabytes").withShortName(
+ "c").create();
+ Option numChunksOpt = obuilder
+ .withLongName("numChunks")
+ .withRequired(false)
+ .withArgument(
+ abuilder.withName("numChunks").withMinimum(1).withMaximum(1).create())
+ .withDescription(
+ "The maximum number of chunks to create. If specified, program will only create a subset of the chunks")
+ .withShortName("n").create();
+ Group group = gbuilder.withName("Options").withOption(dumpFileOpt)
+ .withOption(outputDirOpt).withOption(chunkSizeOpt).withOption(
+ numChunksOpt).create();
+
Parser parser = new Parser();
parser.setGroup(group);
CommandLine cmdLine = parser.parse(args);
-
+
String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt);
String outputDirPath = (String) cmdLine.getValue(outputDirOpt);
-
- int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
-
+
+ int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine
+ .getValue(chunkSizeOpt));
+
int numChunks = Integer.MAX_VALUE;
- if (cmdLine.hasOption(numChunksOpt)){
+ if (cmdLine.hasOption(numChunksOpt)) {
numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt));
}
-
+
File dir = new File(outputDirPath);
dir.mkdirs();
- String header =
- "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ http://www.mediawiki.org/xml/export-0.3.xsd\" version=\"0.3\" xml:lang=\"en\">\n"
- + " <siteinfo>\n" + "<sitename>Wikipedia</sitename>\n"
- + " <base>http://en.wikipedia.org/wiki/Main_Page</base>\n"
- + " <generator>MediaWiki 1.13alpha</generator>\n"
- + " <case>first-letter</case>\n"
- + " <namespaces>\n"
- + " <namespace key=\"-2\">Media</namespace>\n"
- + " <namespace key=\"-1\">Special</namespace>\n"
- + " <namespace key=\"0\" />\n"
- + " <namespace key=\"1\">Talk</namespace>\n"
- + " <namespace key=\"2\">User</namespace>\n"
- + " <namespace key=\"3\">User talk</namespace>\n"
- + " <namespace key=\"4\">Wikipedia</namespace>\n"
- + " <namespace key=\"5\">Wikipedia talk</namespace>\n"
- + " <namespace key=\"6\">Image</namespace>\n"
- + " <namespace key=\"7\">Image talk</namespace>\n"
- + " <namespace key=\"8\">MediaWiki</namespace>\n"
- + " <namespace key=\"9\">MediaWiki talk</namespace>\n"
- + " <namespace key=\"10\">Template</namespace>\n"
- + " <namespace key=\"11\">Template talk</namespace>\n"
- + " <namespace key=\"12\">Help</namespace>\n"
- + " <namespace key=\"13\">Help talk</namespace>\n"
- + " <namespace key=\"14\">Category</namespace>\n"
- + " <namespace key=\"15\">Category talk</namespace>\n"
- + " <namespace key=\"100\">Portal</namespace>\n"
- + " <namespace key=\"101\">Portal talk</namespace>\n"
- + " </namespaces>\n"
- + " </siteinfo>\n";
-
+ String header = "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" "
+ + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+ + "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ "
+ + "http://www.mediawiki.org/xml/export-0.3.xsd\" "
+ + "version=\"0.3\" "
+ + "xml:lang=\"en\">\n"
+ + " <siteinfo>\n"
+ + "<sitename>Wikipedia</sitename>\n"
+ + " <base>http://en.wikipedia.org/wiki/Main_Page</base>\n"
+ + " <generator>MediaWiki 1.13alpha</generator>\n"
+ + " <case>first-letter</case>\n"
+ + " <namespaces>\n"
+ + " <namespace key=\"-2\">Media</namespace>\n"
+ + " <namespace key=\"-1\">Special</namespace>\n"
+ + " <namespace key=\"0\" />\n"
+ + " <namespace key=\"1\">Talk</namespace>\n"
+ + " <namespace key=\"2\">User</namespace>\n"
+ + " <namespace key=\"3\">User talk</namespace>\n"
+ + " <namespace key=\"4\">Wikipedia</namespace>\n"
+ + " <namespace key=\"5\">Wikipedia talk</namespace>\n"
+ + " <namespace key=\"6\">Image</namespace>\n"
+ + " <namespace key=\"7\">Image talk</namespace>\n"
+ + " <namespace key=\"8\">MediaWiki</namespace>\n"
+ + " <namespace key=\"9\">MediaWiki talk</namespace>\n"
+ + " <namespace key=\"10\">Template</namespace>\n"
+ + " <namespace key=\"11\">Template talk</namespace>\n"
+ + " <namespace key=\"12\">Help</namespace>\n"
+ + " <namespace key=\"13\">Help talk</namespace>\n"
+ + " <namespace key=\"14\">Category</namespace>\n"
+ + " <namespace key=\"15\">Category talk</namespace>\n"
+ + " <namespace key=\"100\">Portal</namespace>\n"
+ + " <namespace key=\"101\">Portal talk</namespace>\n"
+ + " </namespaces>\n" + " </siteinfo>\n";
+
StringBuilder content = new StringBuilder();
content.append(header);
int filenumber = 0;
@@ -113,9 +139,9 @@
FileLineIterator it = new FileLineIterator(new File(dumpFilePath));
while (it.hasNext()) {
String thisLine = it.next();
- if(thisLine.trim().startsWith("<page>")){
+ if (thisLine.trim().startsWith("<page>")) {
boolean end = false;
- while(thisLine.trim().startsWith("</page>") == false){
+ while (thisLine.trim().startsWith("</page>") == false) {
content.append(thisLine).append('\n');
if (it.hasNext()) {
thisLine = it.next();
@@ -125,17 +151,19 @@
}
}
content.append(thisLine).append('\n');
-
- if(content.length()>chunkSize || end){
+
+ if (content.length() > chunkSize || end) {
content.append("</mediawiki>");
filenumber++;
-
- BufferedWriter chunkWriter = new BufferedWriter(new OutputStreamWriter(
- new FileOutputStream(dir.getPath()+"/chunk-"+ decimalFormatter.format(filenumber)+".xml"), "UTF-8"));
-
+
+ BufferedWriter chunkWriter = new BufferedWriter(
+ new OutputStreamWriter(new FileOutputStream(
+ dir.getPath() + "/chunk-"
+ + decimalFormatter.format(filenumber) + ".xml"), "UTF-8"));
+
chunkWriter.write(content.toString(), 0, content.length());
chunkWriter.close();
- if (filenumber >= numChunks){
+ if (filenumber >= numChunks) {
break;
}
content = new StringBuilder();
@@ -143,6 +171,6 @@
}
}
}
-
+
}
}
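
With the options defined above, a typical invocation of the splitter might look like the following; the jar name, dump file, and output directory are hypothetical:

    java -cp mahout-examples.jar \
      org.apache.mahout.classifier.bayes.WikipediaXmlSplitter \
      -d /tmp/enwiki-pages-articles.xml -o /tmp/wikipedia-chunks -c 64 -n 10

This would emit up to ten chunks of roughly 64 MB each, named chunk-<number>.xml, into /tmp/wikipedia-chunks, since each chunk is closed at the first page boundary after the size limit is exceeded.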
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/XmlInputFormat.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/XmlInputFormat.java?rev=896311&r1=896310&r2=896311&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/XmlInputFormat.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/XmlInputFormat.java Wed Jan 6 02:46:22 2010
@@ -17,12 +17,14 @@
package org.apache.mahout.classifier.bayes;
+import java.io.IOException;
+
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
-import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
@@ -30,34 +32,39 @@
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
-import java.io.IOException;
-
/**
* Reads records that are delimited by a specific begin/end tag.
*/
public class XmlInputFormat extends TextInputFormat {
-
+
public static final String START_TAG_KEY = "xmlinput.start";
public static final String END_TAG_KEY = "xmlinput.end";
-
+
@Override
- public RecordReader<LongWritable, Text> getRecordReader(InputSplit inputSplit, JobConf jobConf, Reporter reporter)
- throws IOException {
+ public RecordReader<LongWritable,Text> getRecordReader(InputSplit inputSplit,
+ JobConf jobConf,
+ Reporter reporter) throws IOException {
return new XmlRecordReader((FileSplit) inputSplit, jobConf);
}
-
- public static class XmlRecordReader implements RecordReader<LongWritable, Text> {
+
+ /**
+ * XmlRecordReader reads through a given XML document and emits the XML
+ * blocks delimited by the configured start and end tags as records.
+ */
+ public static class XmlRecordReader implements
+ RecordReader<LongWritable,Text> {
private final byte[] startTag;
private final byte[] endTag;
private final long start;
private final long end;
private final FSDataInputStream fsin;
private final DataOutputBuffer buffer = new DataOutputBuffer();
-
+
public XmlRecordReader(FileSplit split, JobConf jobConf) throws IOException {
startTag = jobConf.get(START_TAG_KEY).getBytes("utf-8");
endTag = jobConf.get(END_TAG_KEY).getBytes("utf-8");
-
+
// open the file and seek to the start of the split
start = split.getStart();
end = start + split.getLength();
@@ -66,7 +73,7 @@
fsin = fs.open(split.getPath());
fsin.seek(start);
}
-
+
@Override
public boolean next(LongWritable key, Text value) throws IOException {
if (fsin.getPos() < end) {
@@ -85,32 +92,32 @@
}
return false;
}
-
+
@Override
public LongWritable createKey() {
return new LongWritable();
}
-
+
@Override
public Text createValue() {
return new Text();
}
-
+
@Override
public long getPos() throws IOException {
return fsin.getPos();
}
-
+
@Override
public void close() throws IOException {
fsin.close();
}
-
+
@Override
public float getProgress() throws IOException {
- return ((fsin.getPos() - start) / (float) (end - start));
+ return (fsin.getPos() - start) / (float) (end - start);
}
-
+
private boolean readUntilMatch(byte[] match, boolean withinBlock) throws IOException {
int i = 0;
while (true) {
@@ -119,7 +126,7 @@
if (b == -1) return false;
// save to buffer:
if (withinBlock) buffer.write(b);
-
+
// check if we're matching:
if (b == match[i]) {
i++;
@@ -130,4 +137,4 @@
}
}
}
-}
\ No newline at end of file
+}
\ No newline at end of file
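
XmlInputFormat reads its delimiters from the job configuration, so a job that wants one record per XML block only needs to set the two tag keys before submission. A minimal wiring sketch, mirroring what the driver above does with the <text> tag (the demo class name is hypothetical):

    import org.apache.hadoop.mapred.JobConf;
    import org.apache.mahout.classifier.bayes.XmlInputFormat;

    public final class XmlInputFormatDemo {
      private XmlInputFormatDemo() { }

      public static void main(String[] args) {
        JobConf conf = new JobConf(XmlInputFormatDemo.class);
        // Emit one record per <page>...</page> block instead of one per line.
        conf.set(XmlInputFormat.START_TAG_KEY, "<page>");
        conf.set(XmlInputFormat.END_TAG_KEY, "</page>");
        conf.setInputFormat(XmlInputFormat.class);
        // ...then set the mapper, reducer, and I/O paths, and call
        // JobClient.runJob(conf).
      }
    }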