You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ss...@apache.org on 2012/05/08 22:07:06 UTC
svn commit: r1335732 - in /mahout/trunk/core/src:
main/java/org/apache/mahout/common/
main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/
test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/
Author: ssc
Date: Tue May 8 20:07:05 2012
New Revision: 1335732
URL: http://svn.apache.org/viewvc?rev=1335732&view=rev
Log:
MAHOUT-979 RowSimilarityJob should be able to infer the number of columns from the input matrix if not specified
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java
mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJobTest.java
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java?rev=1335732&r1=1335731&r2=1335732&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java Tue May 8 20:07:05 2012
@@ -27,6 +27,7 @@ import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;
+import com.google.common.io.Closeables;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
@@ -39,6 +40,7 @@ import org.apache.hadoop.conf.Configurat
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
@@ -51,6 +53,8 @@ import org.apache.hadoop.mapreduce.lib.o
import org.apache.hadoop.util.Tool;
import org.apache.lucene.analysis.Analyzer;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
import org.apache.mahout.vectorizer.DefaultAnalyzer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -422,6 +426,35 @@ public abstract class AbstractJob extend
return argMap.containsKey(keyFor(optionName));
}
+
+ /**
+ * Determines the cardinality of the vectors in a sequence file by reading its first record.
+ * Only one record is read; the size of that vector is returned as the number of columns.
+ *
+ * NOTE(review): the file system is obtained from {@code FileSystem.get(getConf())} rather than from
+ * {@code matrix} itself, and the path is handed directly to {@code SequenceFile.Reader} -- confirm
+ * that callers pass a single sequence file located on the configured file system.
+ *
+ * @param matrix path of the sequence file whose values must be {@link VectorWritable}s
+ * @return the size of the vector stored in the first record of {@code matrix}
+ * @throws IllegalArgumentException if the value class of the sequence file is not {@link VectorWritable}
+ * @throws IllegalStateException if the sequence file contains no record
+ * @throws InstantiationException if the key class of the sequence file cannot be instantiated
+ * @throws IllegalAccessException if the key class or its no-arg constructor is not accessible
+ */
+ public int getDimensions(Path matrix) throws IOException, InstantiationException, IllegalAccessException {
+
+ SequenceFile.Reader reader = null;
+ try {
+ reader = new SequenceFile.Reader(FileSystem.get(getConf()), matrix, getConf());
+
+ // the key instance is only a target for reader.next(); its contents are never inspected
+ Writable row = (Writable) reader.getKeyClass().newInstance();
+ VectorWritable vectorWritable = new VectorWritable();
+
+ // validate the declared value class before attempting to read a record
+ Preconditions.checkArgument(reader.getValueClass().equals(VectorWritable.class),
+ "value type of sequencefile must be a VectorWritable");
+
+ // a single record is enough: its vector size is returned as the dimension
+ boolean hasAtLeastOneRow = reader.next(row, vectorWritable);
+ Preconditions.checkState(hasAtLeastOneRow, "matrix must have at least one row");
+
+ return vectorWritable.get().size();
+
+ } finally {
+ // reader is still null here if the constructor threw
+ Closeables.closeQuietly(reader);
+ }
+ }
+
/** Obtain input and output directories from command-line options or hadoop
* properties. If {@code addInputOption} or {@code addOutputOption}
* has been called, this method will throw an {@code OptionException} if
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java?rev=1335732&r1=1335731&r2=1335732&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java Tue May 8 20:07:05 2012
@@ -20,8 +20,10 @@ package org.apache.mahout.math.hadoop.si
import com.google.common.base.Preconditions;
import com.google.common.primitives.Ints;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
@@ -79,7 +81,7 @@ public class RowSimilarityJob extends Ab
addInputOption();
addOutputOption();
- addOption("numberOfColumns", "r", "Number of columns in the input matrix");
+ addOption("numberOfColumns", "r", "Number of columns in the input matrix", false);
addOption("similarityClassname", "s", "Name of distributed similarity class to instantiate, alternatively use "
+ "one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')');
addOption("maxSimilaritiesPerRow", "m", "Number of maximum similarities per row (default: "
@@ -93,7 +95,16 @@ public class RowSimilarityJob extends Ab
return -1;
}
- int numberOfColumns = Integer.parseInt(getOption("numberOfColumns"));
+ int numberOfColumns;
+
+ if (hasOption("numberOfColumns")) {
+ // Number of columns explicitly specified via CLI
+ numberOfColumns = Integer.parseInt(getOption("numberOfColumns"));
+ } else {
+ // else get the number of columns by determining the cardinality of a vector in the input matrix
+ numberOfColumns = getDimensions(getInputPath());
+ }
+
String similarityClassnameArg = getOption("similarityClassname");
String similarityClassname;
try {
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJobTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJobTest.java?rev=1335732&r1=1335731&r2=1335732&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJobTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJobTest.java Tue May 8 20:07:05 2012
@@ -178,4 +178,26 @@ public class RowSimilarityJobTest extend
assertEquals(0.0, similarityMatrix.get(2, 2), EPSILON);
}
+ /**
+ * Checks that {@code getDimensions} reports 6 for an input built from three rows of six values each.
+ */
+ @Test
+ public void testVectorDimensions() throws Exception {
+
+ File inputFile = getTestTempFile("rows");
+
+ Configuration conf = new Configuration();
+ Path inputPath = new Path(inputFile.getAbsolutePath());
+ FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
+
+ // 3 x 6 input; presumably written as a sequence file of vectors by the helper -- verify in MathHelper
+ MathHelper.writeDistributedRowMatrix(new double[][] {
+ new double[] { 1, 0, 1, 1, 0, 1 },
+ new double[] { 0, 1, 1, 1, 1, 1 },
+ new double[] { 1, 1, 0, 1, 0, 0 } },
+ fs, conf, inputPath);
+
+ // the job is only configured, not run; getDimensions is called directly on it
+ RowSimilarityJob rowSimilarityJob = new RowSimilarityJob();
+ rowSimilarityJob.setConf(conf);
+
+ int numberOfColumns = rowSimilarityJob.getDimensions(inputPath);
+
+ // expected value equals the length of each input row above
+ assertEquals(6, numberOfColumns);
+ }
}