You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ss...@apache.org on 2012/05/08 22:07:06 UTC

svn commit: r1335732 - in /mahout/trunk/core/src: main/java/org/apache/mahout/common/ main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/ test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/

Author: ssc
Date: Tue May  8 20:07:05 2012
New Revision: 1335732

URL: http://svn.apache.org/viewvc?rev=1335732&view=rev
Log:
MAHOUT-979 RowSimilarityJob should be able to infer the number of columns from the input matrix if not specified

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java
    mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJobTest.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java?rev=1335732&r1=1335731&r2=1335732&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/common/AbstractJob.java Tue May  8 20:07:05 2012
@@ -27,6 +27,7 @@ import java.util.Map;
 import java.util.TreeMap;
 import java.util.concurrent.atomic.AtomicInteger;
 
+import com.google.common.io.Closeables;
 import org.apache.commons.cli2.CommandLine;
 import org.apache.commons.cli2.Group;
 import org.apache.commons.cli2.Option;
@@ -39,6 +40,7 @@ import org.apache.hadoop.conf.Configurat
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.mapreduce.InputFormat;
 import org.apache.hadoop.mapreduce.Job;
@@ -51,6 +53,8 @@ import org.apache.hadoop.mapreduce.lib.o
 import org.apache.hadoop.util.Tool;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.mahout.common.commandline.DefaultOptionCreator;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
 import org.apache.mahout.vectorizer.DefaultAnalyzer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -422,6 +426,35 @@ public abstract class AbstractJob extend
     return argMap.containsKey(keyFor(optionName));
   }
 
+
+  /**
+   * Reads the first row of a SequenceFile of vectors and returns that row's cardinality.
+   *
+   * @param matrix path of a SequenceFile whose value class must be {@link VectorWritable}
+   * @return size of the first vector read; no further rows are inspected
+   */
+  public int getDimensions(Path matrix) throws IOException, InstantiationException, IllegalAccessException {
+    // Declared before the try so the finally block can reference it.
+    SequenceFile.Reader reader = null;
+    try {
+      reader = new SequenceFile.Reader(FileSystem.get(getConf()), matrix, getConf());
+
+      Writable row = (Writable) reader.getKeyClass().newInstance();
+      VectorWritable vectorWritable = new VectorWritable();
+
+      Preconditions.checkArgument(reader.getValueClass().equals(VectorWritable.class),
+          "value type of sequencefile must be a VectorWritable");
+
+      boolean hasAtLeastOneRow = reader.next(row, vectorWritable);
+      Preconditions.checkState(hasAtLeastOneRow, "matrix must have at least one row");
+
+      return vectorWritable.get().size();
+
+    } finally {
+      Closeables.closeQuietly(reader);
+    }
+  }
+
   /** Obtain input and output directories from command-line options or hadoop
    *  properties. If {@code addInputOption} or {@code addOutputOption}
    *  has been called, this method will throw an {@code OptionException} if

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java?rev=1335732&r1=1335731&r2=1335732&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java Tue May  8 20:07:05 2012
@@ -20,8 +20,10 @@ package org.apache.mahout.math.hadoop.si
 import com.google.common.base.Preconditions;
 import com.google.common.primitives.Ints;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.Reducer;
@@ -79,7 +81,7 @@ public class RowSimilarityJob extends Ab
 
     addInputOption();
     addOutputOption();
-    addOption("numberOfColumns", "r", "Number of columns in the input matrix");
+    addOption("numberOfColumns", "r", "Number of columns in the input matrix", false);
     addOption("similarityClassname", "s", "Name of distributed similarity class to instantiate, alternatively use "
         + "one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')');
     addOption("maxSimilaritiesPerRow", "m", "Number of maximum similarities per row (default: "
@@ -93,7 +95,16 @@ public class RowSimilarityJob extends Ab
       return -1;
     }
 
-    int numberOfColumns = Integer.parseInt(getOption("numberOfColumns"));
+    int numberOfColumns;
+
+    if (hasOption("numberOfColumns")) {
+      // Number of columns explicitly specified via CLI
+      numberOfColumns = Integer.parseInt(getOption("numberOfColumns"));
+    } else {
+      // else get the number of columns by determining the cardinality of a vector in the input matrix
+      numberOfColumns = getDimensions(getInputPath());
+    }
+
     String similarityClassnameArg = getOption("similarityClassname");
     String similarityClassname;
     try {

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJobTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJobTest.java?rev=1335732&r1=1335731&r2=1335732&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJobTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJobTest.java Tue May  8 20:07:05 2012
@@ -178,4 +178,26 @@ public class RowSimilarityJobTest extend
     assertEquals(0.0, similarityMatrix.get(2, 2), EPSILON);
   }
 
+  @Test
+  public void testVectorDimensions() throws Exception {
+    // Writes a matrix of three 6-element rows to a temp file and expects getDimensions to report 6.
+    File inputFile = getTestTempFile("rows");
+
+    Configuration conf = new Configuration();
+    Path inputPath = new Path(inputFile.getAbsolutePath());
+    FileSystem fs = FileSystem.get(inputPath.toUri(), conf);
+
+    MathHelper.writeDistributedRowMatrix(new double[][] {
+        new double[] { 1, 0, 1, 1, 0, 1 },
+        new double[] { 0, 1, 1, 1, 1, 1 },
+        new double[] { 1, 1, 0, 1, 0, 0 } },
+        fs, conf, inputPath);
+
+    RowSimilarityJob rowSimilarityJob = new RowSimilarityJob();
+    rowSimilarityJob.setConf(conf);
+
+    int numberOfColumns = rowSimilarityJob.getDimensions(inputPath);
+
+    assertEquals(6, numberOfColumns);
+  }
 }