You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2010/01/22 15:38:53 UTC

svn commit: r902102 - /lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java

Author: srowen
Date: Fri Jan 22 14:38:53 2010
New Revision: 902102

URL: http://svn.apache.org/viewvc?rev=902102&view=rev
Log:
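MAHOUT-250: add optional Amazon S3 credential options and write the output chunks through the Hadoop FileSystem API instead of a local FileOutputStream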

Modified:
    lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java

Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java?rev=902102&r1=902101&r2=902102&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java Fri Jan 22 14:38:53 2010
@@ -20,9 +20,9 @@
 import java.io.BufferedWriter;
 import java.io.File;
 import java.io.FileInputStream;
-import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
+import java.net.URI;
 import java.text.DecimalFormat;
 import java.text.NumberFormat;
 
@@ -34,6 +34,9 @@
 import org.apache.commons.cli2.builder.DefaultOptionBuilder;
 import org.apache.commons.cli2.builder.GroupBuilder;
 import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.compress.BZip2Codec;
 import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.mahout.common.FileLineIterator;
@@ -64,6 +67,21 @@
           abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
         .withDescription("The output directory to place the splits in")
         .withShortName("o").create();
+
+    Option s3IdOpt = obuilder
+        .withLongName("s3ID")
+        .withRequired(false)
+        .withArgument(
+          abuilder.withName("s3Id").withMinimum(1).withMaximum(1).create())
+        .withDescription("Amazon S3 ID key")
+        .withShortName("i").create();
+    Option s3SecretOpt = obuilder
+        .withLongName("s3Secret")
+        .withRequired(false)
+        .withArgument(
+          abuilder.withName("s3Secret").withMinimum(1).withMaximum(1).create())
+        .withDescription("Amazon S3 secret key")
+        .withShortName("s").create();
     
     Option chunkSizeOpt = obuilder
         .withLongName("chunkSize")
@@ -82,14 +100,29 @@
         .withShortName("n").create();
     Group group = gbuilder.withName("Options").withOption(dumpFileOpt)
         .withOption(outputDirOpt).withOption(chunkSizeOpt).withOption(
-          numChunksOpt).create();
+          numChunksOpt).withOption(s3IdOpt).withOption(s3SecretOpt).create();
     
     Parser parser = new Parser();
     parser.setGroup(group);
     CommandLine cmdLine = parser.parse(args);
-    
+
+    Configuration conf = new Configuration();
     String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt);
     String outputDirPath = (String) cmdLine.getValue(outputDirOpt);
+
+    if (cmdLine.hasOption(s3IdOpt)) {
+      String id = (String) cmdLine.getValue(s3IdOpt);
+      conf.set("fs.s3n.awsAccessKeyId", id);
+      conf.set("fs.s3.awsAccessKeyId", id);
+    }
+    if (cmdLine.hasOption(s3SecretOpt)) {
+      String secret = (String) cmdLine.getValue(s3SecretOpt);
+      conf.set("fs.s3n.awsSecretAccessKey", secret);
+      conf.set("fs.s3.awsSecretAccessKey", secret);
+    }
+    // use the raw local file system so that no .crc checksum files are written when the output is on the local FS
+    conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
+    FileSystem fs = FileSystem.get(URI.create(outputDirPath), conf);
     
     int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine
         .getValue(chunkSizeOpt));
@@ -98,9 +131,7 @@
     if (cmdLine.hasOption(numChunksOpt)) {
       numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt));
     }
-    
-    File dir = new File(outputDirPath);
-    dir.mkdirs();
+
     String header = "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" "
                     + "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
                     + "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ "
@@ -137,7 +168,6 @@
     
     StringBuilder content = new StringBuilder();
     content.append(header);
-    int filenumber = 0;
     NumberFormat decimalFormatter = new DecimalFormat("0000");
     File dumpFile = new File(dumpFilePath);
     FileLineIterator it;
@@ -149,6 +179,7 @@
       // assume the user has previously de-compressed the dump file
       it = new FileLineIterator(dumpFile);
     }
+    int filenumber = 0;
     while (it.hasNext()) {
       String thisLine = it.next();
       if (thisLine.trim().startsWith("<page>")) {
@@ -167,11 +198,9 @@
         if (content.length() > chunkSize || end) {
           content.append("</mediawiki>");
           filenumber++;
-          
+          String filename = outputDirPath + "/chunk-" + decimalFormatter.format(filenumber) + ".xml";
           BufferedWriter chunkWriter = new BufferedWriter(
-              new OutputStreamWriter(new FileOutputStream(
-                  dir.getPath() + "/chunk-"
-                      + decimalFormatter.format(filenumber) + ".xml"), "UTF-8"));
+              new OutputStreamWriter(fs.create(new Path(filename)), "UTF-8"));
           
           chunkWriter.write(content.toString(), 0, content.length());
           chunkWriter.close();