You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2009/06/23 20:23:20 UTC
svn commit: r787776 [3/3] - in /lucene/mahout/trunk:
core/src/main/java/org/apache/mahout/clustering/canopy/
core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/
core/src/main/java/org/apache/mahout/clustering/kmeans/
core/src/main/java/org/apa...
Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java?rev=787776&r1=787775&r2=787776&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java Tue Jun 23 18:23:18 2009
@@ -25,10 +25,19 @@
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.SequenceFile;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
-import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.matrix.SparseVector;
+import org.apache.mahout.utils.vectors.io.JWriterTermInfoWriter;
+import org.apache.mahout.utils.vectors.io.SequenceFileVectorWriter;
+import org.apache.mahout.utils.vectors.io.VectorWriter;
+import org.apache.mahout.utils.vectors.io.JWriterVectorWriter;
import org.apache.mahout.utils.vectors.lucene.CachedTermInfo;
import org.apache.mahout.utils.vectors.lucene.LuceneIteratable;
import org.apache.mahout.utils.vectors.lucene.TFDFMapper;
@@ -39,11 +48,10 @@
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
-import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
+import java.io.FileWriter;
import java.nio.charset.Charset;
-import java.util.Iterator;
/**
@@ -53,7 +61,7 @@
public class Driver {
private transient static Logger log = LoggerFactory.getLogger(Driver.class);
//TODO: This assumes LuceneIterable, make it generic.
-
+
public static void main(String[] args) throws IOException {
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
@@ -75,7 +83,7 @@
abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).
withDescription("The field in the index containing the index. If null, then the Lucene internal doc " +
"id is used which is prone to error if the underlying index changes").withShortName("i").create();
-
+
Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true).withArgument(
abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).
withDescription("The output of the dictionary").withShortName("t").create();
@@ -94,10 +102,15 @@
Option maxOpt = obuilder.withLongName("max").withRequired(false).withArgument(
abuilder.withName("max").withMinimum(1).withMaximum(1).create()).
withDescription("The maximum number of vectors to output. If not specified, then it will loop over all docs").withShortName("m").create();
+
+ Option outWriterOpt = obuilder.withLongName("outputWriter").withRequired(false).withArgument(
+ abuilder.withName("outputWriter").withMinimum(1).withMaximum(1).create()).
+ withDescription("The VectorWriter to use, either seq (SequenceFileVectorWriter - default) or file (Writes to a File using JSON format)").withShortName("e").create();
+
Option helpOpt = obuilder.withLongName("help").
withDescription("Print out help").withShortName("h").create();
Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption(outputOpt).withOption(delimiterOpt)
- .withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt).withOption(dictOutOpt).withOption(powerOpt)
+ .withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt).withOption(dictOutOpt).withOption(powerOpt).withOption(outWriterOpt)
.withOption(weightOpt).create();
try {
Parser parser = new Parser();
@@ -123,9 +136,9 @@
Directory dir = FSDirectory.open(file);
IndexReader reader = IndexReader.open(dir, true);
Weight weight = null;
- if(cmdLine.hasOption(weightOpt)) {
+ if (cmdLine.hasOption(weightOpt)) {
String wString = cmdLine.getValue(weightOpt).toString();
- if(wString.equalsIgnoreCase("tf")) {
+ if (wString.equalsIgnoreCase("tf")) {
weight = new TF();
} else if (wString.equalsIgnoreCase("tfidf")) {
weight = new TFIDF();
@@ -150,7 +163,7 @@
}
}
String idField = null;
- if (cmdLine.hasOption(idFieldOpt)){
+ if (cmdLine.hasOption(idFieldOpt)) {
idField = cmdLine.getValue(idFieldOpt).toString();
}
if (norm == LuceneIteratable.NO_NORMALIZING) {
@@ -158,54 +171,32 @@
} else {
iteratable = new LuceneIteratable(reader, idField, field, mapper, norm);
}
- File outFile = new File(cmdLine.getValue(outputOpt).toString());
+ String outFile = cmdLine.getValue(outputOpt).toString();
log.info("Output File: " + outFile);
- BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
- int i = 0;
- for (Vector vector : iteratable) {
- if (i >= maxDocs){
- break;
- }
- writer.write(vector.asFormatString());
- writer.write("\n");
- if (i % 500 == 0) {
- log.info("i = " + i);
+
+ VectorWriter vectorWriter;
+ if (cmdLine.hasOption(outWriterOpt)) {
+ String outWriter = cmdLine.getValue(outWriterOpt).toString();
+ if (outWriter.equals("file")) {
+ BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
+ vectorWriter = new JWriterVectorWriter(writer);
+ } else {
+ vectorWriter = getSeqFileWriter(outFile);
}
- i++;
+ } else {
+ vectorWriter = getSeqFileWriter(outFile);
}
- log.info("Wrote " + i + " vectors");
- writer.flush();
- writer.close();
- // TODO: replace with aa codec
+
+ vectorWriter.write(iteratable);
+ vectorWriter.close();
+
+ String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : "\t";
File dictOutFile = new File(cmdLine.getValue(dictOutOpt).toString());
log.info("Dictionary Output file: " + dictOutFile);
- writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(dictOutFile), Charset.forName("UTF8")));
- Iterator<TermEntry> entIter = termInfo.getAllEntries();
- String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : "\t";
- writer.write("input");
- writer.write(delimiter);
- writer.write(file.getAbsolutePath());
- writer.write("\n");
- writer.write("field");
- writer.write(delimiter);
- writer.write(field);
- writer.write("\n");
- writer.write("num.terms");
- writer.write(delimiter);
- writer.write(String.valueOf(termInfo.totalTerms(field)));
- writer.write("\n");
- writer.write("#term" + delimiter + "doc freq" + delimiter + "idx");
- writer.write("\n");
- while (entIter.hasNext()) {
- TermEntry entry = entIter.next();
- writer.write(entry.term);
- writer.write(delimiter);
- writer.write(String.valueOf(entry.docFreq));
- writer.write(delimiter);
- writer.write(String.valueOf(entry.termIdx));
- writer.write("\n");
- }
- writer.flush();
+ BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(dictOutFile), Charset.forName("UTF8")));
+ JWriterTermInfoWriter tiWriter = new JWriterTermInfoWriter(writer, delimiter, field);
+ tiWriter.write(termInfo);
+ tiWriter.close();
writer.close();
}
}
@@ -216,6 +207,18 @@
}
}
+ private static VectorWriter getSeqFileWriter(String outFile) throws IOException {
+ VectorWriter sfWriter;
+ Path path = new Path(outFile);
+ Configuration conf = new Configuration();
+ FileSystem fs = FileSystem.get(conf);
+ //TODO: Make this parameter driven
+ SequenceFile.Writer seqWriter = new SequenceFile.Writer(fs, conf, path, LongWritable.class, SparseVector.class);
+
+ sfWriter = new SequenceFileVectorWriter(seqWriter);
+ return sfWriter;
+ }
+
private static void printHelp(Group group) {
HelpFormatter formatter = new HelpFormatter();
formatter.setGroup(group);
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterTermInfoWriter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterTermInfoWriter.java?rev=787776&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterTermInfoWriter.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterTermInfoWriter.java Tue Jun 23 18:23:18 2009
@@ -0,0 +1,59 @@
+package org.apache.mahout.utils.vectors.io;
+
+import org.apache.mahout.utils.vectors.TermEntry;
+import org.apache.mahout.utils.vectors.TermInfo;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.Iterator;
+
+
+/**
+ * Write the TermInfo out to a {@link java.io.Writer}
+ *
+ **/
+public class JWriterTermInfoWriter implements TermInfoWriter {
+ private transient static Logger log = LoggerFactory.getLogger(JWriterTermInfoWriter.class);
+
+ protected Writer writer;
+ protected String delimiter;
+ protected String field;
+
+ public JWriterTermInfoWriter(Writer writer, String delimiter, String field) {
+ this.writer = writer;
+ this.delimiter = delimiter;
+ this.field = field;
+ }
+
+ @Override
+ public void write(TermInfo ti) throws IOException {
+
+ Iterator<TermEntry> entIter = ti.getAllEntries();
+
+ writer.write(String.valueOf(ti.totalTerms(field)));
+ writer.write("\n");
+ writer.write("#term" + delimiter + "doc freq" + delimiter + "idx");
+ writer.write("\n");
+ while (entIter.hasNext()) {
+ TermEntry entry = entIter.next();
+ writer.write(entry.term);
+ writer.write(delimiter);
+ writer.write(String.valueOf(entry.docFreq));
+ writer.write(delimiter);
+ writer.write(String.valueOf(entry.termIdx));
+ writer.write("\n");
+ }
+ writer.flush();
+ writer.close();
+ }
+
+ /**
+ * Does NOT close the underlying writer
+ * @throws IOException
+ */
+ public void close() throws IOException {
+
+ }
+}
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java?rev=787776&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java Tue Jun 23 18:23:18 2009
@@ -0,0 +1,46 @@
+package org.apache.mahout.utils.vectors.io;
+
+import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.utils.vectors.VectorIterable;
+
+import java.io.IOException;
+import java.io.Writer;
+
+
+/**
+ *
+ *
+ **/
+public class JWriterVectorWriter implements VectorWriter {
+ protected Writer writer;
+
+ public JWriterVectorWriter(Writer writer) {
+ this.writer = writer;
+ }
+
+ @Override
+ public long write(VectorIterable iterable) throws IOException {
+ return write(iterable, Long.MAX_VALUE);
+ }
+
+ @Override
+ public long write(VectorIterable iterable, long maxDocs) throws IOException {
+ long result = 0;
+
+ for (Vector vector : iterable) {
+ if (result >= maxDocs) {
+ break;
+ }
+ writer.write(vector.asFormatString());
+ writer.write("\n");
+
+ result++;
+ }
+ return result;
+ }
+
+ @Override
+ public void close() throws IOException {
+
+ }
+}
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java?rev=787776&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java Tue Jun 23 18:23:18 2009
@@ -0,0 +1,47 @@
+package org.apache.mahout.utils.vectors.io;
+
+import org.apache.mahout.utils.vectors.VectorIterable;
+import org.apache.mahout.matrix.Vector;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.LongWritable;
+
+import java.io.IOException;
+
+
+/**
+ *
+ *
+ **/
+public class SequenceFileVectorWriter implements VectorWriter {
+ protected SequenceFile.Writer writer;
+
+ public SequenceFileVectorWriter(SequenceFile.Writer writer) {
+ this.writer = writer;
+ }
+
+ @Override
+ public long write(VectorIterable iterable, long maxDocs) throws IOException {
+ long i = 0;
+ long recNum = 0;
+ for (Vector point : iterable) {
+ if (i >= maxDocs) {
+ break;
+ }
+ //point.write(dataOut);
+ writer.append(new LongWritable(recNum++), point);
+
+ i++;
+ }
+ return i;
+ }
+
+ @Override
+ public long write(VectorIterable iterable) throws IOException {
+ return write(iterable, Long.MAX_VALUE);
+ }
+
+ @Override
+ public void close() throws IOException {
+
+ }
+}
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java?rev=787776&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java Tue Jun 23 18:23:18 2009
@@ -0,0 +1,17 @@
+package org.apache.mahout.utils.vectors.io;
+
+import org.apache.mahout.utils.vectors.TermInfo;
+
+import java.io.IOException;
+
+
+/**
+ *
+ *
+ **/
+public interface TermInfoWriter {
+
+ public void write(TermInfo ti) throws IOException;
+
+ public void close() throws IOException;
+}
Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java?rev=787776&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java Tue Jun 23 18:23:18 2009
@@ -0,0 +1,18 @@
+package org.apache.mahout.utils.vectors.io;
+
+import org.apache.mahout.utils.vectors.VectorIterable;
+
+import java.io.IOException;
+
+
+/**
+ *
+ *
+ **/
+public interface VectorWriter {
+ public long write(VectorIterable iterable) throws IOException;
+
+ public long write(VectorIterable iterable, long maxDocs) throws IOException;
+
+ public void close() throws IOException;
+}