You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2010/01/22 05:41:24 UTC
svn commit: r901989 -
/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java
Author: srowen
Date: Fri Jan 22 04:41:24 2010
New Revision: 901989
URL: http://svn.apache.org/viewvc?rev=901989&view=rev
Log:
MAHOUT-250
Modified:
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java?rev=901989&r1=901988&r2=901989&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java Fri Jan 22 04:41:24 2010
@@ -19,6 +19,7 @@
import java.io.BufferedWriter;
import java.io.File;
+import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
@@ -33,6 +34,8 @@
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.io.compress.BZip2Codec;
+import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.mahout.common.FileLineIterator;
/**
@@ -51,7 +54,7 @@
Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true)
.withArgument(
abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create())
- .withDescription("The path to the wikipedia dump file").withShortName(
+ .withDescription("The path to the wikipedia dump file (.bz2 or uncompressed)").withShortName(
"d").create();
Option outputDirOpt = obuilder
@@ -136,7 +139,16 @@
content.append(header);
int filenumber = 0;
NumberFormat decimalFormatter = new DecimalFormat("0000");
- FileLineIterator it = new FileLineIterator(new File(dumpFilePath));
+ File dumpFile = new File(dumpFilePath);
+ FileLineIterator it;
+ if (dumpFilePath.endsWith(".bz2")) {
+ // default compression format from http://download.wikimedia.org
+ CompressionCodec codec = new BZip2Codec();
+ it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile)));
+ } else {
+ // assume the user has previously de-compressed the dump file
+ it = new FileLineIterator(dumpFile);
+ }
while (it.hasNext()) {
String thisLine = it.next();
if (thisLine.trim().startsWith("<page>")) {