You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2009/06/23 20:23:20 UTC

svn commit: r787776 [3/3] - in /lucene/mahout/trunk: core/src/main/java/org/apache/mahout/clustering/canopy/ core/src/main/java/org/apache/mahout/clustering/fuzzykmeans/ core/src/main/java/org/apache/mahout/clustering/kmeans/ core/src/main/java/org/apa...

Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java?rev=787776&r1=787775&r2=787776&view=diff
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java (original)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/Driver.java Tue Jun 23 18:23:18 2009
@@ -25,10 +25,19 @@
 import org.apache.commons.cli2.builder.GroupBuilder;
 import org.apache.commons.cli2.commandline.Parser;
 import org.apache.commons.cli2.util.HelpFormatter;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.SequenceFile;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
-import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.matrix.SparseVector;
+import org.apache.mahout.utils.vectors.io.JWriterTermInfoWriter;
+import org.apache.mahout.utils.vectors.io.SequenceFileVectorWriter;
+import org.apache.mahout.utils.vectors.io.VectorWriter;
+import org.apache.mahout.utils.vectors.io.JWriterVectorWriter;
 import org.apache.mahout.utils.vectors.lucene.CachedTermInfo;
 import org.apache.mahout.utils.vectors.lucene.LuceneIteratable;
 import org.apache.mahout.utils.vectors.lucene.TFDFMapper;
@@ -39,11 +48,10 @@
 import java.io.BufferedWriter;
 import java.io.File;
 import java.io.FileOutputStream;
-import java.io.FileWriter;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
+import java.io.FileWriter;
 import java.nio.charset.Charset;
-import java.util.Iterator;
 
 
 /**
@@ -53,7 +61,7 @@
 public class Driver {
   private transient static Logger log = LoggerFactory.getLogger(Driver.class);
   //TODO: This assumes LuceneIterable, make it generic.
-  
+
   public static void main(String[] args) throws IOException {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
@@ -75,7 +83,7 @@
             abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).
             withDescription("The field in the index containing the index.  If null, then the Lucene internal doc " +
                     "id is used which is prone to error if the underlying index changes").withShortName("i").create();
-    
+
     Option dictOutOpt = obuilder.withLongName("dictOut").withRequired(true).withArgument(
             abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).
             withDescription("The output of the dictionary").withShortName("t").create();
@@ -94,10 +102,15 @@
     Option maxOpt = obuilder.withLongName("max").withRequired(false).withArgument(
             abuilder.withName("max").withMinimum(1).withMaximum(1).create()).
             withDescription("The maximum number of vectors to output.  If not specified, then it will loop over all docs").withShortName("m").create();
+
+    Option outWriterOpt = obuilder.withLongName("outputWriter").withRequired(false).withArgument(
+            abuilder.withName("outputWriter").withMinimum(1).withMaximum(1).create()).
+            withDescription("The VectorWriter to use, either seq (SequenceFileVectorWriter - default) or file (Writes to a File using JSON format)").withShortName("e").create();
+
     Option helpOpt = obuilder.withLongName("help").
             withDescription("Print out help").withShortName("h").create();
     Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption(outputOpt).withOption(delimiterOpt)
-            .withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt).withOption(dictOutOpt).withOption(powerOpt)
+            .withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt).withOption(dictOutOpt).withOption(powerOpt).withOption(outWriterOpt)
             .withOption(weightOpt).create();
     try {
       Parser parser = new Parser();
@@ -123,9 +136,9 @@
           Directory dir = FSDirectory.open(file);
           IndexReader reader = IndexReader.open(dir, true);
           Weight weight = null;
-          if(cmdLine.hasOption(weightOpt)) {
+          if (cmdLine.hasOption(weightOpt)) {
             String wString = cmdLine.getValue(weightOpt).toString();
-            if(wString.equalsIgnoreCase("tf")) {
+            if (wString.equalsIgnoreCase("tf")) {
               weight = new TF();
             } else if (wString.equalsIgnoreCase("tfidf")) {
               weight = new TFIDF();
@@ -150,7 +163,7 @@
             }
           }
           String idField = null;
-          if (cmdLine.hasOption(idFieldOpt)){
+          if (cmdLine.hasOption(idFieldOpt)) {
             idField = cmdLine.getValue(idFieldOpt).toString();
           }
           if (norm == LuceneIteratable.NO_NORMALIZING) {
@@ -158,54 +171,32 @@
           } else {
             iteratable = new LuceneIteratable(reader, idField, field, mapper, norm);
           }
-          File outFile = new File(cmdLine.getValue(outputOpt).toString());
+          String outFile = cmdLine.getValue(outputOpt).toString();
           log.info("Output File: " + outFile);
-          BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
-          int i = 0;
-          for (Vector vector : iteratable) {
-            if (i >= maxDocs){
-              break;
-            }
-            writer.write(vector.asFormatString());
-            writer.write("\n");
-            if (i % 500 == 0) {
-              log.info("i = " + i);
+
+          VectorWriter vectorWriter;
+          if (cmdLine.hasOption(outWriterOpt)) {
+            String outWriter = cmdLine.getValue(outWriterOpt).toString();
+            if (outWriter.equals("file")) {
+              BufferedWriter writer = new BufferedWriter(new FileWriter(outFile));
+              vectorWriter = new JWriterVectorWriter(writer);
+            } else {
+              vectorWriter = getSeqFileWriter(outFile);
             }
-            i++;
+          } else {
+            vectorWriter = getSeqFileWriter(outFile);
           }
-          log.info("Wrote " + i + " vectors");
-          writer.flush();
-          writer.close();
-          // TODO: replace with aa codec
+
+          vectorWriter.write(iteratable);
+          vectorWriter.close();
+
+          String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : "\t";
           File dictOutFile = new File(cmdLine.getValue(dictOutOpt).toString());
           log.info("Dictionary Output file: " + dictOutFile);
-          writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(dictOutFile), Charset.forName("UTF8")));
-          Iterator<TermEntry> entIter = termInfo.getAllEntries();
-          String delimiter = cmdLine.hasOption(delimiterOpt) ? cmdLine.getValue(delimiterOpt).toString() : "\t";
-          writer.write("input");
-          writer.write(delimiter);
-          writer.write(file.getAbsolutePath());
-          writer.write("\n");
-          writer.write("field");
-          writer.write(delimiter);
-          writer.write(field);
-          writer.write("\n");
-          writer.write("num.terms");
-          writer.write(delimiter);
-          writer.write(String.valueOf(termInfo.totalTerms(field)));
-          writer.write("\n");
-          writer.write("#term" + delimiter + "doc freq" + delimiter + "idx");
-          writer.write("\n");
-          while (entIter.hasNext()) {
-            TermEntry entry = entIter.next();
-            writer.write(entry.term);
-            writer.write(delimiter);
-            writer.write(String.valueOf(entry.docFreq));
-            writer.write(delimiter);
-            writer.write(String.valueOf(entry.termIdx));
-            writer.write("\n");
-          }
-          writer.flush();
+          BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(dictOutFile), Charset.forName("UTF8")));
+          JWriterTermInfoWriter tiWriter = new JWriterTermInfoWriter(writer, delimiter, field);
+          tiWriter.write(termInfo);
+          tiWriter.close();
           writer.close();
         }
       }
@@ -216,6 +207,18 @@
     }
   }
 
+  /** Opens a SequenceFile at outFile (LongWritable keys, SparseVector values) and wraps it as a VectorWriter. */
+  private static VectorWriter getSeqFileWriter(String outFile) throws IOException {
+    Path target = new Path(outFile);
+    Configuration hadoopConf = new Configuration();
+    FileSystem fileSystem = FileSystem.get(hadoopConf);
+    //TODO: Make this parameter driven
+    SequenceFile.Writer sequenceWriter =
+        new SequenceFile.Writer(fileSystem, hadoopConf, target, LongWritable.class, SparseVector.class);
+    // Callers only see the VectorWriter interface; the SequenceFile.Writer stays wrapped inside it.
+    return new SequenceFileVectorWriter(sequenceWriter);
+  }
+
   private static void printHelp(Group group) {
     HelpFormatter formatter = new HelpFormatter();
     formatter.setGroup(group);

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterTermInfoWriter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterTermInfoWriter.java?rev=787776&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterTermInfoWriter.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterTermInfoWriter.java Tue Jun 23 18:23:18 2009
@@ -0,0 +1,59 @@
+package org.apache.mahout.utils.vectors.io;
+
+import org.apache.mahout.utils.vectors.TermEntry;
+import org.apache.mahout.utils.vectors.TermInfo;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.Iterator;
+
+
+/**
+ * Writes a {@link TermInfo} as text to a {@link java.io.Writer}: the term count for {@code field}, then delimited rows.
+ * The supplied writer is flushed after each write but is never closed by this class.
+ **/
+public class JWriterTermInfoWriter implements TermInfoWriter {
+  private static final Logger log = LoggerFactory.getLogger(JWriterTermInfoWriter.class);
+
+  protected Writer writer;
+  protected String delimiter;
+  protected String field;
+
+  public JWriterTermInfoWriter(Writer writer, String delimiter, String field) {
+    this.writer = writer;
+    this.delimiter = delimiter;
+    this.field = field;
+  }
+
+  /** Writes the term count for the field, a "#term" header row, then one delimited row per entry. */
+  @Override
+  public void write(TermInfo ti) throws IOException {
+    Iterator<TermEntry> entIter = ti.getAllEntries();
+
+    writer.write(String.valueOf(ti.totalTerms(field)));
+    writer.write("\n");
+    writer.write("#term" + delimiter + "doc freq" + delimiter + "idx");
+    writer.write("\n");
+    while (entIter.hasNext()) {
+      TermEntry entry = entIter.next();
+      writer.write(entry.term);
+      writer.write(delimiter);
+      writer.write(String.valueOf(entry.docFreq));
+      writer.write(delimiter);
+      writer.write(String.valueOf(entry.termIdx));
+      writer.write("\n");
+    }
+    // Flush only: closing here would contradict close() and take the writer away from its owner.
+    writer.flush();
+  }
+
+  /**
+   * Does NOT close the underlying writer; whoever supplied it remains responsible for closing it.
+   * @throws IOException declared by the interface; this implementation performs no I/O
+   */
+  @Override
+  public void close() throws IOException {
+  }
+}

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java?rev=787776&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/JWriterVectorWriter.java Tue Jun 23 18:23:18 2009
@@ -0,0 +1,46 @@
+package org.apache.mahout.utils.vectors.io;
+
+import org.apache.mahout.matrix.Vector;
+import org.apache.mahout.utils.vectors.VectorIterable;
+
+import java.io.IOException;
+import java.io.Writer;
+
+
+/**
+ * Writes each vector's {@code asFormatString()} form, one per line, to a {@link java.io.Writer}.
+ * {@link #close()} closes that writer (which flushes it), so buffered output is not lost.
+ **/
+public class JWriterVectorWriter implements VectorWriter {
+  protected Writer writer;
+
+  public JWriterVectorWriter(Writer writer) {
+    this.writer = writer;
+  }
+
+  @Override
+  public long write(VectorIterable iterable) throws IOException {
+    return write(iterable, Long.MAX_VALUE);
+  }
+
+  @Override
+  public long write(VectorIterable iterable, long maxDocs) throws IOException {
+    long result = 0;
+    // Stop once maxDocs vectors have been written; the return value is the number actually written.
+    for (Vector vector : iterable) {
+      if (result >= maxDocs) {
+        break;
+      }
+      writer.write(vector.asFormatString());
+      writer.write("\n");
+
+      result++;
+    }
+    return result;
+  }
+
+  @Override
+  public void close() throws IOException {
+    writer.close(); // Writer.close() flushes first; a no-op here would strand buffered vectors.
+  }
+}

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java?rev=787776&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/SequenceFileVectorWriter.java Tue Jun 23 18:23:18 2009
@@ -0,0 +1,47 @@
+package org.apache.mahout.utils.vectors.io;
+
+import org.apache.mahout.utils.vectors.VectorIterable;
+import org.apache.mahout.matrix.Vector;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.LongWritable;
+
+import java.io.IOException;
+
+
+/**
+ * Appends vectors to a Hadoop {@link SequenceFile.Writer}, keyed by a zero-based record number.
+ * {@link #close()} closes the wrapped SequenceFile writer; nothing else in this class does.
+ **/
+public class SequenceFileVectorWriter implements VectorWriter {
+  protected SequenceFile.Writer writer;
+
+  public SequenceFileVectorWriter(SequenceFile.Writer writer) {
+    this.writer = writer;
+  }
+
+  @Override
+  public long write(VectorIterable iterable, long maxDocs) throws IOException {
+    // recNum is both the next key and the number of vectors appended so far;
+    // numbering restarts at 0 on every call.
+    long recNum = 0;
+    for (Vector point : iterable) {
+      if (recNum >= maxDocs) {
+        break;
+      }
+      // Key = record number, value = the vector itself.
+      writer.append(new LongWritable(recNum), point);
+      recNum++;
+    }
+    return recNum;
+  }
+
+  @Override
+  public long write(VectorIterable iterable) throws IOException {
+    return write(iterable, Long.MAX_VALUE);
+  }
+
+  @Override
+  public void close() throws IOException {
+    writer.close(); // Previously a no-op, which left the SequenceFile.Writer open forever.
+  }
+}

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java?rev=787776&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/TermInfoWriter.java Tue Jun 23 18:23:18 2009
@@ -0,0 +1,17 @@
+package org.apache.mahout.utils.vectors.io;
+
+import org.apache.mahout.utils.vectors.TermInfo;
+
+import java.io.IOException;
+
+
+/**
+ * Contract for writing the contents of a {@link TermInfo} to some output.
+ * Destination and format are left to the implementation.
+ **/
+public interface TermInfoWriter {
+  /** Writes the given term info to this writer's output. */
+  void write(TermInfo ti) throws IOException;
+  /** Closes this writer; whether an underlying resource is closed too is up to the implementation. */
+  void close() throws IOException;
+}

Added: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java?rev=787776&view=auto
==============================================================================
--- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java (added)
+++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/io/VectorWriter.java Tue Jun 23 18:23:18 2009
@@ -0,0 +1,18 @@
+package org.apache.mahout.utils.vectors.io;
+
+import org.apache.mahout.utils.vectors.VectorIterable;
+
+import java.io.IOException;
+
+
+/**
+ * Contract for writing the vectors of a {@link VectorIterable} to some output.
+ * {@link #write(VectorIterable)} takes no limit; the two-argument overload accepts a maxDocs cap.
+ **/
+public interface VectorWriter {
+  long write(VectorIterable iterable) throws IOException;
+  /** Like {@link #write(VectorIterable)} but takes a maxDocs limit on how many vectors to write. */
+  long write(VectorIterable iterable, long maxDocs) throws IOException;
+  /** Closes this writer. */
+  void close() throws IOException;
+}