You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ss...@apache.org on 2013/03/25 10:50:23 UTC

svn commit: r1460571 [2/2] - in /mahout/trunk: examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/ examples/src/main/java/org/apache/mahout/cf/taste/example/email/ examples/src/main/java/org/apache/mahout/cf/taste/example/jester/ ex...

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=1460571&r1=1460570&r2=1460571&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java Mon Mar 25 09:50:22 2013
@@ -26,7 +26,6 @@ import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapred.Utils.OutputFileUtils.OutputFilesFilter;
 import org.apache.hadoop.util.ToolRunner;
@@ -48,7 +47,7 @@ import java.util.Iterator;
 import java.util.Set;
 
 /**
- * Can read in a {@link SequenceFile} of {@link Vector}s and dump
+ * Can read in a {@link org.apache.hadoop.io.SequenceFile} of {@link Vector}s and dump
  * out the results using {@link Vector#asFormatString()} to either the console or to a
  * file.
  */
@@ -76,10 +75,13 @@ public final class VectorDumper extends 
     addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true", false);
     addOption("dictionary", "d", "The dictionary file.", false);
     addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false);
-    addOption("csv", "c", "Output the Vector as CSV.  Otherwise it substitutes in the terms for vector cell entries", false);
-    addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector (if the vector is one) printing out the name", false);
+    addOption("csv", "c", "Output the Vector as CSV.  Otherwise it substitutes in the terms for vector cell entries",
+        false);
+    addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector " +
+        "(if the vector is one) printing out the name", false);
     addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)", false);
-    addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude descending order", false);
+    addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude " +
+        "descending order", false);
     addOption("quiet", "q", "Print only file contents", false);
     addOption("sizeOnly", "sz", "Dump only the size of the vector", false);
     addOption("numItems", "ni", "Output at most <n> vecors", false);

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=1460571&r1=1460570&r2=1460571&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java Mon Mar 25 09:50:22 2013
@@ -23,7 +23,6 @@ import com.google.common.collect.Lists;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
 import org.apache.lucene.util.PriorityQueue;
 import org.apache.mahout.common.Pair;
@@ -170,7 +169,7 @@ public final class VectorHelper {
   }
 
   /**
-   * Read a dictionary in {@link SequenceFile} generated by
+   * Read a dictionary in {@link org.apache.hadoop.io.SequenceFile} generated by
    * {@link org.apache.mahout.vectorizer.DictionaryVectorizer}
    *
    * @param filePattern <PATH TO DICTIONARY>/dictionary.file-*
@@ -217,7 +216,7 @@ public final class VectorHelper {
     return result;
   }
 
-  private static class TDoublePQ<T> extends PriorityQueue<Pair<T, Double>> {
+  private static final class TDoublePQ<T> extends PriorityQueue<Pair<T, Double>> {
     private final T sentinel;
 
     private TDoublePQ(T sentinel, int size) {
@@ -226,8 +225,7 @@ public final class VectorHelper {
     }
 
     @Override
-    protected boolean lessThan(Pair<T, Double> a,
-                               Pair<T, Double> b) {
+    protected boolean lessThan(Pair<T, Double> a, Pair<T, Double> b) {
       return a.getSecond().compareTo(b.getSecond()) < 0;
     }
 

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java?rev=1460571&r1=1460570&r2=1460571&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java Mon Mar 25 09:50:22 2013
@@ -29,7 +29,8 @@ import org.apache.mahout.math.Vector;
 /**
  * Iterates a CSV file and produces {@link org.apache.mahout.math.Vector}.
  * <br/>
- * The Iterator returned throws {@link UnsupportedOperationException} for the {@link java.util.Iterator#remove()} method.
+ * The Iterator returned throws {@link UnsupportedOperationException} for the {@link java.util.Iterator#remove()}
+ * method.
  * <p/>
  * Assumes DenseVector for now, but in the future may have the option of mapping columns to sparse format
  * <p/>

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java?rev=1460571&r1=1460570&r2=1460571&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java Mon Mar 25 09:50:22 2013
@@ -50,15 +50,16 @@ public abstract class AbstractLuceneIter
     protected long nextLogRecord = bump.increment();
     protected int skippedErrorMessages;
 
-    public AbstractLuceneIterator(TermInfo terminfo, double normPower, IndexReader indexReader, Weight weight, double maxPercentErrorDocs, String field) {
-        this.terminfo = terminfo;
-        this.normPower = normPower;
-        this.indexReader = indexReader;
-
-        this.weight = weight;
-        this.nextDocId = 0;
-        this.maxErrorDocs = (int) (maxPercentErrorDocs * indexReader.numDocs());
-        this.field = field;
+    public AbstractLuceneIterator(TermInfo terminfo, double normPower, IndexReader indexReader, Weight weight,
+        double maxPercentErrorDocs, String field) {
+      this.terminfo = terminfo;
+      this.normPower = normPower;
+      this.indexReader = indexReader;
+
+      this.weight = weight;
+      this.nextDocId = 0;
+      this.maxErrorDocs = (int) (maxPercentErrorDocs * indexReader.numDocs());
+      this.field = field;
     }
 
     /**
@@ -93,7 +94,8 @@ public abstract class AbstractLuceneIter
             numErrorDocs++;
             if (numErrorDocs >= maxErrorDocs) {
               log.error("There are too many documents that do not have a term vector for {}", field);
-              throw new IllegalStateException("There are too many documents that do not have a term vector for " + field);
+              throw new IllegalStateException("There are too many documents that do not have a term vector for " +
+                  field);
             }
             if (numErrorDocs >= nextLogRecord) {
               if (skippedErrorMessages == 0) {

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java?rev=1460571&r1=1460570&r2=1460571&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java Mon Mar 25 09:50:22 2013
@@ -198,7 +198,8 @@ public class ClusterLabels {
       DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, contentField, term);
       int docID;
       while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
-        if (liveDocs != null && !liveDocs.get(docID)) { //check to see if we don't have an deletions (null) or if document is live
+        // liveDocs is null when there are no deletions; otherwise a document that is not live has been deleted
+        if (liveDocs != null && !liveDocs.get(docID)) {
           // document is deleted...
           termBitset.set(docsEnum.docID());
         }
@@ -243,9 +244,9 @@ public class ClusterLabels {
 
     OpenBitSet bitset = new OpenBitSet(numDocs);
     
-    Set<String>  idFieldSelector= null;
-    if(idField !=null){
-      idFieldSelector= new TreeSet<String>();
+    Set<String>  idFieldSelector = null;
+    if (idField != null) {
+      idFieldSelector = new TreeSet<String>();
       idFieldSelector.add(idField);
     }
     

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java?rev=1460571&r1=1460570&r2=1460571&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java Mon Mar 25 09:50:22 2013
@@ -98,7 +98,8 @@ public final class Driver {
     
     LuceneIterable iterable;
     if (norm == LuceneIterable.NO_NORMALIZING) {
-      iterable = new LuceneIterable(reader, idField, field, termInfo,weight, LuceneIterable.NO_NORMALIZING, maxPercentErrorDocs);
+      iterable = new LuceneIterable(reader, idField, field, termInfo,weight, LuceneIterable.NO_NORMALIZING,
+          maxPercentErrorDocs);
     } else {
       iterable = new LuceneIterable(reader, idField, field, termInfo,weight, norm, maxPercentErrorDocs);
     }
@@ -181,8 +182,8 @@ public final class Driver {
     Option maxPercentErrorDocsOpt = obuilder.withLongName("maxPercentErrorDocs").withRequired(false).withArgument(
         abuilder.withName("maxPercentErrorDocs").withMinimum(1).withMaximum(1).create()).withDescription(
         "The max percentage of docs that can have a null term vector. These are noise document and can occur if the " 
-            + "analyzer used strips out all terms in the target field. This percentage is expressed as a value between 0 and 1. " +
-            "The default is 0.").withShortName("err").create();
+            + "analyzer used strips out all terms in the target field. This percentage is expressed as a value " +
+            "between 0 and 1. The default is 0.").withShortName("err").create();
 
     Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
         .create();

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java?rev=1460571&r1=1460570&r2=1460571&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java Mon Mar 25 09:50:22 2013
@@ -43,7 +43,8 @@ public final class LuceneIterable implem
     this(reader, idField, field, terminfo, weight, NO_NORMALIZING);
   }
 
-  public LuceneIterable(IndexReader indexReader, String idField, String field, TermInfo terminfo, Weight weight, double normPower) {
+  public LuceneIterable(IndexReader indexReader, String idField, String field, TermInfo terminfo, Weight weight,
+      double normPower) {
     this(indexReader, idField, field, terminfo, weight, normPower, 0);
   }
 

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java?rev=1460571&r1=1460570&r2=1460571&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java Mon Mar 25 09:50:22 2013
@@ -19,18 +19,17 @@ package org.apache.mahout.utils.vectors.
 
 import com.google.common.base.Preconditions;
 import org.apache.lucene.index.IndexReader;
-import org.apache.mahout.math.Vector;
 import org.apache.mahout.utils.vectors.TermInfo;
 import org.apache.mahout.vectorizer.Weight;
 
 import java.io.IOException;
-import java.util.Iterator;
 import java.util.Set;
 import java.util.TreeSet;
 
 /**
- * An {@link Iterator} over {@link Vector}s that uses a Lucene index as the source for creating the
- * {@link Vector}s. The field used to create the vectors currently must have term vectors stored for it.
+ * An {@link java.util.Iterator} over {@link org.apache.mahout.math.Vector}s that uses a Lucene index as the source
+ * for creating the {@link org.apache.mahout.math.Vector}s. The field used to create the vectors currently must have
+ * term vectors stored for it.
  */
 public class LuceneIterator extends AbstractLuceneIterator {
     protected final Set<String> idFieldSelector;
@@ -63,7 +62,8 @@ public class LuceneIterator extends Abst
    * @param weight     weight
    * @param normPower  the normalization value. Must be nonnegative, or {@link LuceneIterable#NO_NORMALIZING}
    * @param maxPercentErrorDocs most documents that will be tolerated without a term freq vector. In [0,1].
-   * @see #LuceneIterator(org.apache.lucene.index.IndexReader, String, String, org.apache.mahout.utils.vectors.TermInfo, org.apache.mahout.vectorizer.Weight, double)
+   * @see #LuceneIterator(org.apache.lucene.index.IndexReader, String, String, org.apache.mahout.utils.vectors.TermInfo,
+   * org.apache.mahout.vectorizer.Weight, double)
    */
   public LuceneIterator(IndexReader indexReader,
                         String idField,