You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2010/01/22 05:41:24 UTC

svn commit: r901989 - /lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java

Author: srowen
Date: Fri Jan 22 04:41:24 2010
New Revision: 901989

URL: http://svn.apache.org/viewvc?rev=901989&view=rev
Log:
MAHOUT-250

Modified:
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java?rev=901989&r1=901988&r2=901989&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java Fri Jan 22 04:41:24 2010
@@ -19,6 +19,7 @@
 
 import java.io.BufferedWriter;
 import java.io.File;
+import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
@@ -33,6 +34,8 @@
 import org.apache.commons.cli2.builder.DefaultOptionBuilder;
 import org.apache.commons.cli2.builder.GroupBuilder;
 import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.io.compress.BZip2Codec;
+import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.mahout.common.FileLineIterator;
 
 /**
@@ -51,7 +54,7 @@
     Option dumpFileOpt = obuilder.withLongName("dumpFile").withRequired(true)
         .withArgument(
           abuilder.withName("dumpFile").withMinimum(1).withMaximum(1).create())
-        .withDescription("The path to the wikipedia dump file").withShortName(
+        .withDescription("The path to the wikipedia dump file (.bz2 or uncompressed)").withShortName(
           "d").create();
     
     Option outputDirOpt = obuilder
@@ -136,7 +139,16 @@
     content.append(header);
     int filenumber = 0;
     NumberFormat decimalFormatter = new DecimalFormat("0000");
-    FileLineIterator it = new FileLineIterator(new File(dumpFilePath));
+    File dumpFile = new File(dumpFilePath);
+    FileLineIterator it;
+    if (dumpFilePath.endsWith(".bz2")) {
+      // default compression format from http://download.wikimedia.org
+      CompressionCodec codec = new BZip2Codec();
+      it = new FileLineIterator(codec.createInputStream(new FileInputStream(dumpFile)));
+    } else {
+      // assume the user has previously de-compressed the dump file
+      it = new FileLineIterator(dumpFile);
+    }
     while (it.hasNext()) {
       String thisLine = it.next();
       if (thisLine.trim().startsWith("<page>")) {