You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2010/01/22 15:38:53 UTC
svn commit: r902102 -
/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java
Author: srowen
Date: Fri Jan 22 14:38:53 2010
New Revision: 902102
URL: http://svn.apache.org/viewvc?rev=902102&view=rev
Log:
MAHOUT-250
Modified:
lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java
Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java?rev=902102&r1=902101&r2=902102&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java (original)
+++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaXmlSplitter.java Fri Jan 22 14:38:53 2010
@@ -20,9 +20,9 @@
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
-import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
+import java.net.URI;
import java.text.DecimalFormat;
import java.text.NumberFormat;
@@ -34,6 +34,9 @@
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.mahout.common.FileLineIterator;
@@ -64,6 +67,21 @@
abuilder.withName("outputDir").withMinimum(1).withMaximum(1).create())
.withDescription("The output directory to place the splits in")
.withShortName("o").create();
+
+ Option s3IdOpt = obuilder
+ .withLongName("s3ID")
+ .withRequired(false)
+ .withArgument(
+ abuilder.withName("s3Id").withMinimum(1).withMaximum(1).create())
+ .withDescription("Amazon S3 ID key")
+ .withShortName("i").create();
+ Option s3SecretOpt = obuilder
+ .withLongName("s3Secret")
+ .withRequired(false)
+ .withArgument(
+ abuilder.withName("s3Secret").withMinimum(1).withMaximum(1).create())
+ .withDescription("Amazon S3 secret key")
+ .withShortName("s").create();
Option chunkSizeOpt = obuilder
.withLongName("chunkSize")
@@ -82,14 +100,29 @@
.withShortName("n").create();
Group group = gbuilder.withName("Options").withOption(dumpFileOpt)
.withOption(outputDirOpt).withOption(chunkSizeOpt).withOption(
- numChunksOpt).create();
+ numChunksOpt).withOption(s3IdOpt).withOption(s3SecretOpt).create();
Parser parser = new Parser();
parser.setGroup(group);
CommandLine cmdLine = parser.parse(args);
-
+
+ Configuration conf = new Configuration();
String dumpFilePath = (String) cmdLine.getValue(dumpFileOpt);
String outputDirPath = (String) cmdLine.getValue(outputDirOpt);
+
+ if (cmdLine.hasOption(s3IdOpt)) {
+ String id = (String) cmdLine.getValue(s3IdOpt);
+ conf.set("fs.s3n.awsAccessKeyId", id);
+ conf.set("fs.s3.awsAccessKeyId", id);
+ }
+ if (cmdLine.hasOption(s3SecretOpt)) {
+ String secret = (String) cmdLine.getValue(s3SecretOpt);
+ conf.set("fs.s3n.awsSecretAccessKey", secret);
+ conf.set("fs.s3.awsSecretAccessKey", secret);
+ }
+ // For local output, use RawLocalFileSystem so that no .crc checksum files are written next to the chunks
+ conf.set("fs.file.impl", "org.apache.hadoop.fs.RawLocalFileSystem");
+ FileSystem fs = FileSystem.get(URI.create(outputDirPath), conf);
int chunkSize = 1024 * 1024 * Integer.parseInt((String) cmdLine
.getValue(chunkSizeOpt));
@@ -98,9 +131,7 @@
if (cmdLine.hasOption(numChunksOpt)) {
numChunks = Integer.parseInt((String) cmdLine.getValue(numChunksOpt));
}
-
- File dir = new File(outputDirPath);
- dir.mkdirs();
+
String header = "<mediawiki xmlns=\"http://www.mediawiki.org/xml/export-0.3/\" "
+ "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
+ "xsi:schemaLocation=\"http://www.mediawiki.org/xml/export-0.3/ "
@@ -137,7 +168,6 @@
StringBuilder content = new StringBuilder();
content.append(header);
- int filenumber = 0;
NumberFormat decimalFormatter = new DecimalFormat("0000");
File dumpFile = new File(dumpFilePath);
FileLineIterator it;
@@ -149,6 +179,7 @@
// assume the user has previously de-compressed the dump file
it = new FileLineIterator(dumpFile);
}
+ int filenumber = 0;
while (it.hasNext()) {
String thisLine = it.next();
if (thisLine.trim().startsWith("<page>")) {
@@ -167,11 +198,9 @@
if (content.length() > chunkSize || end) {
content.append("</mediawiki>");
filenumber++;
-
+ String filename = outputDirPath + "/chunk-" + decimalFormatter.format(filenumber) + ".xml";
BufferedWriter chunkWriter = new BufferedWriter(
- new OutputStreamWriter(new FileOutputStream(
- dir.getPath() + "/chunk-"
- + decimalFormatter.format(filenumber) + ".xml"), "UTF-8"));
+ new OutputStreamWriter(fs.create(new Path(filename)), "UTF-8"));
chunkWriter.write(content.toString(), 0, content.length());
chunkWriter.close();