You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2010/04/28 22:15:10 UTC

svn commit: r939074 - in /lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity: AbstractSimilarity.java EuclideanDistanceSimilarity.java PearsonCorrelationSimilarity.java UncenteredCosineSimilarity.java

Author: srowen
Date: Wed Apr 28 20:15:09 2010
New Revision: 939074

URL: http://svn.apache.org/viewvc?rev=939074&view=rev
Log:
MAHOUT-387

Added:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java
      - copied, changed from r938784, lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java
Modified:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java?rev=939074&r1=939073&r2=939074&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java Wed Apr 28 20:15:09 2010
@@ -40,30 +40,23 @@ abstract class AbstractSimilarity implem
   private PreferenceTransform prefTransform;
   private SimilarityTransform similarityTransform;
   private final boolean weighted;
+  private final boolean centerData;
   private int cachedNumItems;
   private int cachedNumUsers;
   private final RefreshHelper refreshHelper;
   
   /**
    * <p>
-   * Creates a normal (unweighted) .
+   * Creates a possibly weighted AbstractSimilarity.
    * </p>
    */
-  AbstractSimilarity(DataModel dataModel) throws TasteException {
-    this(dataModel, Weighting.UNWEIGHTED);
-  }
-  
-  /**
-   * <p>
-   * Creates a possibly weighted .
-   * </p>
-   */
-  AbstractSimilarity(final DataModel dataModel, Weighting weighting) throws TasteException {
+  AbstractSimilarity(final DataModel dataModel, Weighting weighting, boolean centerData) throws TasteException {
     if (dataModel == null) {
       throw new IllegalArgumentException("dataModel is null");
     }
     this.dataModel = dataModel;
     this.weighted = weighting == Weighting.WEIGHTED;
+    this.centerData = centerData;
     this.cachedNumItems = dataModel.getNumItems();
     this.cachedNumUsers = dataModel.getNumUsers();
     this.refreshHelper = new RefreshHelper(new Callable<Object>() {
@@ -304,19 +297,23 @@ abstract class AbstractSimilarity implem
         yIndex = yPrefs.getUserID(yPrefIndex);
       }
     }
-    
-    // See comments above on these computations
-    double n = count;
-    double meanX = sumX / n;
-    double meanY = sumY / n;
-    // double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY;
-    double centeredSumXY = sumXY - meanY * sumX;
-    // double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX;
-    double centeredSumX2 = sumX2 - meanX * sumX;
-    // double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY;
-    double centeredSumY2 = sumY2 - meanY * sumY;
-    
-    double result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2);
+
+    double result;
+    if (centerData) {
+      // See comments above on these computations
+      double n = (double) count;
+      double meanX = sumX / n;
+      double meanY = sumY / n;
+      // double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY;
+      double centeredSumXY = sumXY - meanY * sumX;
+      // double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX;
+      double centeredSumX2 = sumX2 - meanX * sumX;
+      // double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY;
+      double centeredSumY2 = sumY2 - meanY * sumY;
+      result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2);
+    } else {
+      result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2);
+    }
     
     if (similarityTransform != null) {
       result = similarityTransform.transformSimilarity(itemID1, itemID2, result);

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java?rev=939074&r1=939073&r2=939074&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java Wed Apr 28 20:15:09 2010
@@ -36,17 +36,14 @@ public final class EuclideanDistanceSimi
    * @throws IllegalArgumentException if {@link DataModel} does not have preference values
    */
   public EuclideanDistanceSimilarity(DataModel dataModel) throws TasteException {
-    super(dataModel);
-    if (!dataModel.hasPreferenceValues()) {
-      throw new IllegalArgumentException("DataModel doesn't have preference values");
-    }
+    this(dataModel, Weighting.WEIGHTED);
   }
 
   /**
    * @throws IllegalArgumentException if {@link DataModel} does not have preference values
    */
   public EuclideanDistanceSimilarity(DataModel dataModel, Weighting weighting) throws TasteException {
-    super(dataModel, weighting);
+    super(dataModel, weighting, false);
     if (!dataModel.hasPreferenceValues()) {
       throw new IllegalArgumentException("DataModel doesn't have preference values");
     }

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java?rev=939074&r1=939073&r2=939074&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java Wed Apr 28 20:15:09 2010
@@ -46,10 +46,14 @@ import org.apache.mahout.cf.taste.model.
  * </p>
  * 
  * <p>
- * This correlation implementation is equivalent to the cosine measure correlation since the data it receives
+ * This correlation implementation is equivalent to the cosine similarity since the data it receives
  * is assumed to be centered -- mean is 0. The correlation may be interpreted as the cosine of the angle
  * between the two vectors defined by the users' preference values.
  * </p>
+ *
+ * <p>
+ * For cosine similarity on uncentered data, see {@link UncenteredCosineSimilarity}.
+ * </p> 
  */
 public final class PearsonCorrelationSimilarity extends AbstractSimilarity {
 
@@ -57,17 +61,14 @@ public final class PearsonCorrelationSim
    * @throws IllegalArgumentException if {@link DataModel} does not have preference values
    */
   public PearsonCorrelationSimilarity(DataModel dataModel) throws TasteException {
-    super(dataModel);
-    if (!dataModel.hasPreferenceValues()) {
-      throw new IllegalArgumentException("DataModel doesn't have preference values");
-    }
+    this(dataModel, Weighting.WEIGHTED);
   }
 
   /**
    * @throws IllegalArgumentException if {@link DataModel} does not have preference values
    */
   public PearsonCorrelationSimilarity(DataModel dataModel, Weighting weighting) throws TasteException {
-    super(dataModel, weighting);
+    super(dataModel, weighting, true);
     if (!dataModel.hasPreferenceValues()) {
       throw new IllegalArgumentException("DataModel doesn't have preference values");
     }

Copied: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java (from r938784, lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java)
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java?p2=lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java&p1=lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java&r1=938784&r2=939074&rev=939074&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java Wed Apr 28 20:15:09 2010
@@ -23,63 +23,40 @@ import org.apache.mahout.cf.taste.model.
 
 /**
  * <p>
- * An implementation of the Pearson correlation. For users X and Y, the following values are calculated:
+ * An implementation of the cosine similarity. The result is the cosine of the angle formed between
+ * the two preference vectors.
  * </p>
- * 
- * <ul>
- * <li>sumX2: sum of the square of all X's preference values</li>
- * <li>sumY2: sum of the square of all Y's preference values</li>
- * <li>sumXY: sum of the product of X and Y's preference value for all items for which both X and Y express a
- * preference</li>
- * </ul>
- * 
+ *
  * <p>
- * The correlation is then:
- * 
- * <p>
- * <code>sumXY / sqrt(sumX2 * sumY2)</code>
- * </p>
- * 
- * <p>
- * Note that this correlation "centers" its data, shifts the user's preference values so that each of their
- * means is 0. This is necessary to achieve expected behavior on all data sets.
- * </p>
- * 
- * <p>
- * This correlation implementation is equivalent to the cosine measure correlation since the data it receives
- * is assumed to be centered -- mean is 0. The correlation may be interpreted as the cosine of the angle
- * between the two vectors defined by the users' preference values.
+ * Note that this similarity does not "center" its data; that is, it does not shift the user's preference values
+ * so that each of their means is 0. For that behavior, use {@link PearsonCorrelationSimilarity}, which is
+ * mathematically equivalent to this similarity when the data is centered.
  * </p>
  */
-public final class PearsonCorrelationSimilarity extends AbstractSimilarity {
+public final class UncenteredCosineSimilarity extends AbstractSimilarity {
 
   /**
    * @throws IllegalArgumentException if {@link DataModel} does not have preference values
    */
-  public PearsonCorrelationSimilarity(DataModel dataModel) throws TasteException {
-    super(dataModel);
-    if (!dataModel.hasPreferenceValues()) {
-      throw new IllegalArgumentException("DataModel doesn't have preference values");
-    }
+  public UncenteredCosineSimilarity(DataModel dataModel) throws TasteException {
+    this(dataModel, Weighting.WEIGHTED);
   }
 
   /**
    * @throws IllegalArgumentException if {@link DataModel} does not have preference values
    */
-  public PearsonCorrelationSimilarity(DataModel dataModel, Weighting weighting) throws TasteException {
-    super(dataModel, weighting);
+  public UncenteredCosineSimilarity(DataModel dataModel, Weighting weighting) throws TasteException {
+    super(dataModel, weighting, false);
     if (!dataModel.hasPreferenceValues()) {
       throw new IllegalArgumentException("DataModel doesn't have preference values");
     }
   }
-  
+
   @Override
   double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) {
     if (n == 0) {
       return Double.NaN;
     }
-    // Note that sum of X and sum of Y don't appear here since they are assumed to be 0;
-    // the data is assumed to be centered.
     double denominator = Math.sqrt(sumX2) * Math.sqrt(sumY2);
     if (denominator == 0.0) {
       // One or both parties has -all- the same ratings;
@@ -88,5 +65,5 @@ public final class PearsonCorrelationSim
     }
     return sumXY / denominator;
   }
-  
-}
+
+}
\ No newline at end of file