You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2011/10/21 11:22:38 UTC

svn commit: r1187238 - in /mahout/trunk/core/src: main/java/org/apache/mahout/cf/taste/impl/similarity/ main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/ test/java/org/apache/mahout/cf/taste/impl/similarity/ test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/

Author: srowen
Date: Fri Oct 21 09:22:38 2011
New Revision: 1187238

URL: http://svn.apache.org/viewvc?rev=1187238&view=rev
Log:
MAHOUT-847 better Euclidean distance similarity metric

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java
    mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarityTest.java
    mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java?rev=1187238&r1=1187237&r2=1187238&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java Fri Oct 21 09:22:38 2011
@@ -28,9 +28,19 @@ import com.google.common.base.Preconditi
  * An implementation of a "similarity" based on the Euclidean "distance" between two users X and Y. Thinking
  * of items as dimensions and preferences as points along those dimensions, a distance is computed using all
  * items (dimensions) where both users have expressed a preference for that item. This is simply the square
- * root of the sum of the squares of differences in position (preference) along each dimension. The similarity
- * is then computed as 1 / (1 + distance), so the resulting values are in the range (0,1].
- * </p>
+ * root of the sum of the squares of differences in position (preference) along each dimension.</p>
+ * 
+ * <p>The similarity could be computed as 1 / (1 + distance), so the resulting values are in the range (0,1].
+ * This would weight against pairs that overlap in more dimensions, which should indicate more similarity, 
+ * since more dimensions offer more opportunities to be farther apart. Actually, it is computed as
+ * 1 / (1 + distance / sqrt(n)), where n is the number of dimensions, in order to help correct for this.
+ * sqrt(n) is chosen since randomly-chosen points have a distance that grows as sqrt(n).</p>
+ *
+ * <p>Note that distance / sqrt(n) is non-negative, so the similarity stays in the range (0,1] and never exceeds 1.</p>
+ * 
+ * <p>Note that the distance isn't normalized in any way; it's not valid to compare similarities computed from
+ * different domains (different rating scales, for example). Within one domain, normalizing doesn't matter much as
+ * it doesn't change ordering.</p>
  */
 public final class EuclideanDistanceSimilarity extends AbstractSimilarity {
 
@@ -51,8 +61,7 @@ public final class EuclideanDistanceSimi
   
   @Override
   double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) {
-    // divide denominator by n below to not automatically give users with more overlap more similarity
-    return n / (1.0 + Math.sqrt(sumXYdiff2));
+    return 1.0 / (1.0 + Math.sqrt(sumXYdiff2) / Math.sqrt(n));
   }
   
 }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java?rev=1187238&r1=1187237&r2=1187238&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java Fri Oct 21 09:22:38 2011
@@ -47,7 +47,7 @@ public class EuclideanDistanceSimilarity
   @Override
   public double similarity(double dots, double normA, double normB, int numberOfColumns) {
     double euclideanDistance = Math.sqrt(normA - 2 * dots + normB);
-    return 1.0 - 1.0 / (1.0 + euclideanDistance);
+    return 1.0 / (1.0 + euclideanDistance);
   }
 
   @Override

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarityTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarityTest.java?rev=1187238&r1=1187237&r2=1187238&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarityTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarityTest.java Fri Oct 21 09:22:38 2011
@@ -71,7 +71,7 @@ public final class EuclideanDistanceSimi
                     {-3.0, 2.0},
             });
     double correlation = new EuclideanDistanceSimilarity(dataModel).userSimilarity(1, 2);
-    assertCorrelationEquals(0.24357264905599915, correlation);
+    assertCorrelationEquals(0.1639607805437114, correlation);
   }
 
   @Test
@@ -83,7 +83,7 @@ public final class EuclideanDistanceSimi
                     {-3.0, 2.0},
             });
     double correlation = new EuclideanDistanceSimilarity(dataModel, Weighting.WEIGHTED).userSimilarity(1, 2);
-    assertCorrelationEquals(0.747857549685333, correlation);
+    assertCorrelationEquals(0.7213202601812372, correlation);
   }
 
   @Test
@@ -95,7 +95,7 @@ public final class EuclideanDistanceSimi
                     {null, null, 1.0},
             });
     double correlation = new EuclideanDistanceSimilarity(dataModel).userSimilarity(1, 2);
-    assertEquals(0.0, correlation, EPSILON);
+    assertTrue(Double.isNaN(correlation));
   }
 
   @Test
@@ -107,7 +107,7 @@ public final class EuclideanDistanceSimi
                     {70.0, 80.0, 90.0},
             });
     double correlation = new EuclideanDistanceSimilarity(dataModel).userSimilarity(1, 2);
-    assertCorrelationEquals(0.10244407226831752, correlation);
+    assertCorrelationEquals(0.05770363219029305, correlation);
   }
 
   @Test
@@ -119,7 +119,7 @@ public final class EuclideanDistanceSimi
                     {2.0, 5.0, 6.0},
             });
     double correlation = new EuclideanDistanceSimilarity(dataModel).userSimilarity(1, 2);
-    assertCorrelationEquals(0.5598164905901122, correlation);
+    assertCorrelationEquals(0.2843646522044218, correlation);
   }
 
   @Test
@@ -131,7 +131,7 @@ public final class EuclideanDistanceSimi
                     {2.0, 5.0, 6.0},
             });
     double correlation = new EuclideanDistanceSimilarity(dataModel, Weighting.WEIGHTED).userSimilarity(1, 2);
-    assertCorrelationEquals(0.889954122647528, correlation);
+    assertCorrelationEquals(0.8210911630511055, correlation);
   }
 
   @Test
@@ -170,7 +170,7 @@ public final class EuclideanDistanceSimi
             });
     double correlation =
         new EuclideanDistanceSimilarity(dataModel).itemSimilarity(0, 1);
-    assertCorrelationEquals(0.24357264905599915, correlation);
+    assertCorrelationEquals(0.1639607805437114, correlation);
   }
 
   @Test
@@ -182,7 +182,7 @@ public final class EuclideanDistanceSimi
                     {null, null, 1.0},
             });
     double correlation = new EuclideanDistanceSimilarity(dataModel).itemSimilarity(1, 2);
-    assertEquals(0.0, correlation, EPSILON);
+    assertTrue(Double.isNaN(correlation));
   }
 
   @Test
@@ -196,7 +196,7 @@ public final class EuclideanDistanceSimi
             });
     double correlation =
         new EuclideanDistanceSimilarity(dataModel).itemSimilarity(0, 1);
-    assertCorrelationEquals(0.10244407226831752, correlation);
+    assertCorrelationEquals(0.05770363219029305, correlation);
   }
 
   @Test
@@ -210,7 +210,7 @@ public final class EuclideanDistanceSimi
             });
     double correlation =
         new EuclideanDistanceSimilarity(dataModel).itemSimilarity(0, 1);
-    assertCorrelationEquals(0.5598164905901122, correlation);
+    assertCorrelationEquals(0.2843646522044218, correlation);
   }
 
   @Test
@@ -224,7 +224,7 @@ public final class EuclideanDistanceSimi
             });
     ItemSimilarity itemSimilarity = new EuclideanDistanceSimilarity(dataModel, Weighting.WEIGHTED);
     double correlation = itemSimilarity.itemSimilarity(0, 1);
-    assertCorrelationEquals(0.889954122647528, correlation);
+    assertCorrelationEquals(0.8210911630511055, correlation);
   }
 
   @Test

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java?rev=1187238&r1=1187237&r2=1187238&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java Fri Oct 21 09:22:38 2011
@@ -115,6 +115,6 @@ public class VectorSimilarityMeasuresTes
         new double[] { 0, 2, 0, 0, 8, 3, 0, 6, 0, 1, 1, 2, 1 },
         new double[] { 3, 0, 0, 0, 7, 0, 2, 2, 1, 3, 2, 4, 4 }, EuclideanDistanceSimilarity.class);
 
-    assertEquals(0.887311346, similarity, EPSILON);
+    assertEquals(0.11268865367232477, similarity, EPSILON);
   }
 }