You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2010/07/12 13:54:40 UTC

svn commit: r963249 - in /mahout/trunk: conf/ core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ core/src/main/java/org/apache/mahout/math/hadoop/similarity/ core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/

Author: srowen
Date: Mon Jul 12 11:54:39 2010
New Revision: 963249

URL: http://svn.apache.org/viewvc?rev=963249&view=rev
Log:
MAHOUT-440

Added:
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/SimilarityType.java
Modified:
    mahout/trunk/conf/driver.classes.props
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/RowSimilarityJob.java
    mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java

Modified: mahout/trunk/conf/driver.classes.props
URL: http://svn.apache.org/viewvc/mahout/trunk/conf/driver.classes.props?rev=963249&r1=963248&r2=963249&view=diff
==============================================================================
--- mahout/trunk/conf/driver.classes.props (original)
+++ mahout/trunk/conf/driver.classes.props Mon Jul 12 11:54:39 2010
@@ -20,3 +20,5 @@ org.apache.mahout.classifier.bayes.TestC
 org.apache.mahout.classifier.bayes.TrainClassifier = trainclassifier : Train Bayes Classifier
 org.apache.mahout.math.hadoop.decomposer.DistributedLanczosSolver = svd : Lanczos Singular Value Decomposition
 org.apache.mahout.math.hadoop.decomposer.EigenVerificationJob = cleansvd : Cleanup and verification of SVD output
+org.apache.mahout.math.hadoop.similarity.RowSimilarityJob = rowsimilarity : Compute the pairwise similarities of the rows of a matrix
+org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob = itemsimilarity : Compute the item-item-similarities for item-based collaborative filtering

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java?rev=963249&r1=963248&r2=963249&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java Mon Jul 12 11:54:39 2010
@@ -17,18 +17,12 @@
 
 package org.apache.mahout.cf.taste.hadoop.similarity.item;
 
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.nio.charset.Charset;
 import java.util.Map;
 import java.util.concurrent.atomic.AtomicInteger;
 
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.DoubleWritable;
-import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.NullWritable;
 import org.apache.hadoop.mapreduce.Job;
@@ -47,6 +41,7 @@ import org.apache.mahout.math.VarLongWri
 import org.apache.mahout.math.VectorWritable;
 import org.apache.mahout.math.hadoop.DistributedRowMatrix;
 import org.apache.mahout.math.hadoop.similarity.RowSimilarityJob;
+import org.apache.mahout.math.hadoop.similarity.SimilarityType;
 
 public final class ItemSimilarityJob extends AbstractJob {
 
@@ -64,7 +59,8 @@ public final class ItemSimilarityJob ext
 
     addInputOption();
     addOutputOption();
-    addOption("similarityClassname", "s", "Name of distributed similarity class to instantiate");
+    addOption("similarityClassname", "s", "Name of distributed similarity class to instantiate, alternatively use " +
+        "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
     addOption("maxSimilaritiesPerItem", "m", "try to cap the number of similar items per item to this number " +
         "(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')', String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
 
@@ -127,7 +123,7 @@ public final class ItemSimilarityJob ext
       itemUserMatrix.waitForCompletion(true);
     }
 
-    int numberOfUsers = readNumberOfUsers(getConf(), countUsersPath);
+    int numberOfUsers = TasteHadoopUtils.readIntFromFile(getConf(), countUsersPath);
 
     /* Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
      * new DistributedRowMatrix(...).rowSimilarity(...) */
@@ -156,18 +152,4 @@ public final class ItemSimilarityJob ext
 
     return 0;
   }
-
-  static int readNumberOfUsers(Configuration conf, Path outputDir) throws IOException {
-    FileSystem fs = FileSystem.get(conf);
-    Path outputFile = fs.listStatus(outputDir, TasteHadoopUtils.PARTS_FILTER)[0].getPath();
-    InputStream in = null;
-    try  {
-      in = fs.open(outputFile);
-      ByteArrayOutputStream out = new ByteArrayOutputStream();
-      IOUtils.copyBytes(in, out, conf);
-      return Integer.parseInt(new String(out.toByteArray(), Charset.forName("UTF-8")).trim());
-    } finally {
-      IOUtils.closeStream(in);
-    }
-  }
 }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/RowSimilarityJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/RowSimilarityJob.java?rev=963249&r1=963248&r2=963249&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/RowSimilarityJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/RowSimilarityJob.java Mon Jul 12 11:54:39 2010
@@ -91,7 +91,8 @@ public class RowSimilarityJob extends Ab
     addInputOption();
     addOutputOption();
     addOption("numberOfColumns", "r", "Number of columns in the input matrix");
-    addOption("similarityClassname", "s", "Name of distributed similarity class to instantiate");
+    addOption("similarityClassname", "s", "Name of distributed similarity class to instantiate, alternatively use " + 
+        "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
     addOption("maxSimilaritiesPerRow", "m", "Number of maximum similarities per row (default: "
               + DEFAULT_MAX_SIMILARITIES_PER_ROW + ')', String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));
 
@@ -101,7 +102,15 @@ public class RowSimilarityJob extends Ab
     }
 
     int numberOfColumns = Integer.parseInt(parsedArgs.get("--numberOfColumns"));
-    String distributedSimilarityClassname = parsedArgs.get("--similarityClassname");
+    String similarityClassnameArg = parsedArgs.get("--similarityClassname");
+    String distributedSimilarityClassname;
+    try {
+      distributedSimilarityClassname =
+          SimilarityType.valueOf(similarityClassnameArg).getSimilarityImplementationClassName();
+    } catch (IllegalArgumentException iae) {
+      distributedSimilarityClassname = similarityClassnameArg;
+    }
+
     int maxSimilaritiesPerRow = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerRow"));
 
     Path inputPath = getInputPath();
@@ -341,4 +350,4 @@ public class RowSimilarityJob extends Ab
     }
   }
 
-}
\ No newline at end of file
+}

Added: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/SimilarityType.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/SimilarityType.java?rev=963249&view=auto
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/SimilarityType.java (added)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/SimilarityType.java Mon Jul 12 11:54:39 2010
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.math.hadoop.similarity;
+
+import java.util.Arrays;
+
+import org.apache.mahout.math.hadoop.similarity.vector.DistributedCooccurrenceVectorSimilarity;
+import org.apache.mahout.math.hadoop.similarity.vector.DistributedEuclideanDistanceVectorSimilarity;
+import org.apache.mahout.math.hadoop.similarity.vector.DistributedLoglikelihoodVectorSimilarity;
+import org.apache.mahout.math.hadoop.similarity.vector.DistributedPearsonCorrelationVectorSimilarity;
+import org.apache.mahout.math.hadoop.similarity.vector.DistributedTanimotoCoefficientVectorSimilarity;
+import org.apache.mahout.math.hadoop.similarity.vector.DistributedUncenteredCosineVectorSimilarity;
+import org.apache.mahout.math.hadoop.similarity.vector.DistributedUncenteredZeroAssumingCosineVectorSimilarity;
+import org.apache.mahout.math.hadoop.similarity.vector.DistributedVectorSimilarity;
+
+public enum SimilarityType {
+
+  SIMILARITY_COOCCURRENCE(DistributedCooccurrenceVectorSimilarity.class),
+  SIMILARITY_EUCLIDEAN_DISTANCE(DistributedEuclideanDistanceVectorSimilarity.class),
+  SIMILARITY_LOGLIKELIHOOD(DistributedLoglikelihoodVectorSimilarity.class),
+  SIMILARITY_PEARSON_CORRELATION(DistributedPearsonCorrelationVectorSimilarity.class),
+  SIMILARITY_TANIMOTO_COEFFICIENT(DistributedTanimotoCoefficientVectorSimilarity.class),
+  SIMILARITY_UNCENTERED_COSINE(DistributedUncenteredCosineVectorSimilarity.class),
+  SIMILARITY_UNCENTERED_ZERO_ASSUMING_COSINE(DistributedUncenteredZeroAssumingCosineVectorSimilarity.class);
+
+  private final Class<? extends DistributedVectorSimilarity> similarityImplementation;
+
+  SimilarityType(Class<? extends DistributedVectorSimilarity> similarityImplementation) {
+    this.similarityImplementation = similarityImplementation;
+  }
+
+  public String getSimilarityImplementationClassName() {
+    return similarityImplementation.getName();
+  }
+
+  public static String listEnumNames() {
+    return Arrays.toString(values());
+  }
+
+}

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java?rev=963249&r1=963248&r2=963249&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java Mon Jul 12 11:54:39 2010
@@ -41,7 +41,6 @@ import org.apache.mahout.math.VarIntWrit
 import org.apache.mahout.math.VarLongWritable;
 import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
-import org.apache.mahout.math.hadoop.DistributedRowMatrix;
 import org.apache.mahout.math.hadoop.MathHelper;
 import org.apache.mahout.math.hadoop.DistributedRowMatrix.MatrixEntryWritable;
 import org.apache.mahout.math.hadoop.similarity.vector.DistributedTanimotoCoefficientVectorSimilarity;
@@ -253,8 +252,8 @@ public final class ItemSimilarityTest ex
        DistributedUncenteredZeroAssumingCosineVectorSimilarity.class.getName() });
 
     File countUsersPart = new File(tmpDir, "countUsers");
-    int numberOfUsers = ItemSimilarityJob.readNumberOfUsers(new Configuration(),
-                                                            new Path(countUsersPart.getAbsolutePath()));
+    int numberOfUsers = TasteHadoopUtils.readIntFromFile(new Configuration(),
+        new Path(countUsersPart.getAbsolutePath()));
 
     assertEquals(3, numberOfUsers);