You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ss...@apache.org on 2013/03/25 10:50:23 UTC
svn commit: r1460571 [2/2] - in /mahout/trunk:
examples/src/main/java/org/apache/mahout/cf/taste/example/bookcrossing/
examples/src/main/java/org/apache/mahout/cf/taste/example/email/
examples/src/main/java/org/apache/mahout/cf/taste/example/jester/ ex...
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java?rev=1460571&r1=1460570&r2=1460571&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorDumper.java Mon Mar 25 09:50:22 2013
@@ -26,7 +26,6 @@ import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.Utils.OutputFileUtils.OutputFilesFilter;
import org.apache.hadoop.util.ToolRunner;
@@ -48,7 +47,7 @@ import java.util.Iterator;
import java.util.Set;
/**
- * Can read in a {@link SequenceFile} of {@link Vector}s and dump
+ * Can read in a {@link org.apache.hadoop.io.SequenceFile} of {@link Vector}s and dump
* out the results using {@link Vector#asFormatString()} to either the console or to a
* file.
*/
@@ -76,10 +75,13 @@ public final class VectorDumper extends
addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true", false);
addOption("dictionary", "d", "The dictionary file.", false);
addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false);
- addOption("csv", "c", "Output the Vector as CSV. Otherwise it substitutes in the terms for vector cell entries", false);
- addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector (if the vector is one) printing out the name", false);
+ addOption("csv", "c", "Output the Vector as CSV. Otherwise it substitutes in the terms for vector cell entries",
+ false);
+ addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector " +
+ "(if the vector is one) printing out the name", false);
addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)", false);
- addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude descending order", false);
+ addOption("sortVectors", "sort", "Sort output key/value pairs of the vector entries in abs magnitude " +
+ "descending order", false);
addOption("quiet", "q", "Print only file contents", false);
addOption("sizeOnly", "sz", "Dump only the size of the vector", false);
addOption("numItems", "ni", "Output at most <n> vecors", false);
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=1460571&r1=1460570&r2=1460571&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java Mon Mar 25 09:50:22 2013
@@ -23,7 +23,6 @@ import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
-import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.lucene.util.PriorityQueue;
import org.apache.mahout.common.Pair;
@@ -170,7 +169,7 @@ public final class VectorHelper {
}
/**
- * Read a dictionary in {@link SequenceFile} generated by
+ * Read a dictionary in {@link org.apache.hadoop.io.SequenceFile} generated by
* {@link org.apache.mahout.vectorizer.DictionaryVectorizer}
*
* @param filePattern <PATH TO DICTIONARY>/dictionary.file-*
@@ -217,7 +216,7 @@ public final class VectorHelper {
return result;
}
- private static class TDoublePQ<T> extends PriorityQueue<Pair<T, Double>> {
+ private static final class TDoublePQ<T> extends PriorityQueue<Pair<T, Double>> {
private final T sentinel;
private TDoublePQ(T sentinel, int size) {
@@ -226,8 +225,7 @@ public final class VectorHelper {
}
@Override
- protected boolean lessThan(Pair<T, Double> a,
- Pair<T, Double> b) {
+ protected boolean lessThan(Pair<T, Double> a, Pair<T, Double> b) {
return a.getSecond().compareTo(b.getSecond()) < 0;
}
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java?rev=1460571&r1=1460570&r2=1460571&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/csv/CSVVectorIterator.java Mon Mar 25 09:50:22 2013
@@ -29,7 +29,8 @@ import org.apache.mahout.math.Vector;
/**
* Iterates a CSV file and produces {@link org.apache.mahout.math.Vector}.
* <br/>
- * The Iterator returned throws {@link UnsupportedOperationException} for the {@link java.util.Iterator#remove()} method.
+ * The Iterator returned throws {@link UnsupportedOperationException} for the {@link java.util.Iterator#remove()}
+ * method.
* <p/>
* Assumes DenseVector for now, but in the future may have the option of mapping columns to sparse format
* <p/>
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java?rev=1460571&r1=1460570&r2=1460571&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/AbstractLuceneIterator.java Mon Mar 25 09:50:22 2013
@@ -50,15 +50,16 @@ public abstract class AbstractLuceneIter
protected long nextLogRecord = bump.increment();
protected int skippedErrorMessages;
- public AbstractLuceneIterator(TermInfo terminfo, double normPower, IndexReader indexReader, Weight weight, double maxPercentErrorDocs, String field) {
- this.terminfo = terminfo;
- this.normPower = normPower;
- this.indexReader = indexReader;
-
- this.weight = weight;
- this.nextDocId = 0;
- this.maxErrorDocs = (int) (maxPercentErrorDocs * indexReader.numDocs());
- this.field = field;
+ public AbstractLuceneIterator(TermInfo terminfo, double normPower, IndexReader indexReader, Weight weight,
+ double maxPercentErrorDocs, String field) {
+ this.terminfo = terminfo;
+ this.normPower = normPower;
+ this.indexReader = indexReader;
+
+ this.weight = weight;
+ this.nextDocId = 0;
+ this.maxErrorDocs = (int) (maxPercentErrorDocs * indexReader.numDocs());
+ this.field = field;
}
/**
@@ -93,7 +94,8 @@ public abstract class AbstractLuceneIter
numErrorDocs++;
if (numErrorDocs >= maxErrorDocs) {
log.error("There are too many documents that do not have a term vector for {}", field);
- throw new IllegalStateException("There are too many documents that do not have a term vector for " + field);
+ throw new IllegalStateException("There are too many documents that do not have a term vector for " +
+ field);
}
if (numErrorDocs >= nextLogRecord) {
if (skippedErrorMessages == 0) {
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java?rev=1460571&r1=1460570&r2=1460571&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/ClusterLabels.java Mon Mar 25 09:50:22 2013
@@ -198,7 +198,8 @@ public class ClusterLabels {
DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, contentField, term);
int docID;
while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
- if (liveDocs != null && !liveDocs.get(docID)) { //check to see if we don't have an deletions (null) or if document is live
+ //check to see if we don't have an deletions (null) or if document is live
+ if (liveDocs != null && !liveDocs.get(docID)) {
// document is deleted...
termBitset.set(docsEnum.docID());
}
@@ -243,9 +244,9 @@ public class ClusterLabels {
OpenBitSet bitset = new OpenBitSet(numDocs);
- Set<String> idFieldSelector= null;
- if(idField !=null){
- idFieldSelector= new TreeSet<String>();
+ Set<String> idFieldSelector = null;
+ if (idField != null) {
+ idFieldSelector = new TreeSet<String>();
idFieldSelector.add(idField);
}
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java?rev=1460571&r1=1460570&r2=1460571&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java Mon Mar 25 09:50:22 2013
@@ -98,7 +98,8 @@ public final class Driver {
LuceneIterable iterable;
if (norm == LuceneIterable.NO_NORMALIZING) {
- iterable = new LuceneIterable(reader, idField, field, termInfo,weight, LuceneIterable.NO_NORMALIZING, maxPercentErrorDocs);
+ iterable = new LuceneIterable(reader, idField, field, termInfo,weight, LuceneIterable.NO_NORMALIZING,
+ maxPercentErrorDocs);
} else {
iterable = new LuceneIterable(reader, idField, field, termInfo,weight, norm, maxPercentErrorDocs);
}
@@ -181,8 +182,8 @@ public final class Driver {
Option maxPercentErrorDocsOpt = obuilder.withLongName("maxPercentErrorDocs").withRequired(false).withArgument(
abuilder.withName("maxPercentErrorDocs").withMinimum(1).withMaximum(1).create()).withDescription(
"The max percentage of docs that can have a null term vector. These are noise document and can occur if the "
- + "analyzer used strips out all terms in the target field. This percentage is expressed as a value between 0 and 1. " +
- "The default is 0.").withShortName("err").create();
+ + "analyzer used strips out all terms in the target field. This percentage is expressed as a value " +
+ "between 0 and 1. The default is 0.").withShortName("err").create();
Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
.create();
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java?rev=1460571&r1=1460570&r2=1460571&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java Mon Mar 25 09:50:22 2013
@@ -43,7 +43,8 @@ public final class LuceneIterable implem
this(reader, idField, field, terminfo, weight, NO_NORMALIZING);
}
- public LuceneIterable(IndexReader indexReader, String idField, String field, TermInfo terminfo, Weight weight, double normPower) {
+ public LuceneIterable(IndexReader indexReader, String idField, String field, TermInfo terminfo, Weight weight,
+ double normPower) {
this(indexReader, idField, field, terminfo, weight, normPower, 0);
}
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java?rev=1460571&r1=1460570&r2=1460571&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java Mon Mar 25 09:50:22 2013
@@ -19,18 +19,17 @@ package org.apache.mahout.utils.vectors.
import com.google.common.base.Preconditions;
import org.apache.lucene.index.IndexReader;
-import org.apache.mahout.math.Vector;
import org.apache.mahout.utils.vectors.TermInfo;
import org.apache.mahout.vectorizer.Weight;
import java.io.IOException;
-import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
/**
- * An {@link Iterator} over {@link Vector}s that uses a Lucene index as the source for creating the
- * {@link Vector}s. The field used to create the vectors currently must have term vectors stored for it.
+ * An {@link java.util.Iterator} over {@link org.apache.mahout.math.Vector}s that uses a Lucene index as the source
+ * for creating the {@link org.apache.mahout.math.Vector}s. The field used to create the vectors currently must have
+ * term vectors stored for it.
*/
public class LuceneIterator extends AbstractLuceneIterator {
protected final Set<String> idFieldSelector;
@@ -63,7 +62,8 @@ public class LuceneIterator extends Abst
* @param weight weight
* @param normPower the normalization value. Must be nonnegative, or {@link LuceneIterable#NO_NORMALIZING}
* @param maxPercentErrorDocs most documents that will be tolerated without a term freq vector. In [0,1].
- * @see #LuceneIterator(org.apache.lucene.index.IndexReader, String, String, org.apache.mahout.utils.vectors.TermInfo, org.apache.mahout.vectorizer.Weight, double)
+ * @see #LuceneIterator(org.apache.lucene.index.IndexReader, String, String, org.apache.mahout.utils.vectors.TermInfo,
+ * org.apache.mahout.vectorizer.Weight, double)
*/
public LuceneIterator(IndexReader indexReader,
String idField,