You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2011/10/21 11:22:38 UTC
svn commit: r1187238 - in /mahout/trunk/core/src:
main/java/org/apache/mahout/cf/taste/impl/similarity/
main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/
test/java/org/apache/mahout/cf/taste/impl/similarity/ test/java/org/apache...
Author: srowen
Date: Fri Oct 21 09:22:38 2011
New Revision: 1187238
URL: http://svn.apache.org/viewvc?rev=1187238&view=rev
Log:
MAHOUT-847 better Euclidean distance similarity metric
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java
mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarityTest.java
mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java?rev=1187238&r1=1187237&r2=1187238&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java Fri Oct 21 09:22:38 2011
@@ -28,9 +28,19 @@ import com.google.common.base.Preconditi
* An implementation of a "similarity" based on the Euclidean "distance" between two users X and Y. Thinking
* of items as dimensions and preferences as points along those dimensions, a distance is computed using all
* items (dimensions) where both users have expressed a preference for that item. This is simply the square
- * root of the sum of the squares of differences in position (preference) along each dimension. The similarity
- * is then computed as 1 / (1 + distance), so the resulting values are in the range (0,1].
- * </p>
+ * root of the sum of the squares of differences in position (preference) along each dimension.</p>
+ *
+ * <p>The similarity could be computed as 1 / (1 + distance), so the resulting values are in the range (0,1].
+ * This would weight against pairs that overlap in more dimensions, which should indicate more similarity,
+ * since more dimensions offer more opportunities to be farther apart. Actually, it is computed as
+ * 1 / (1 + distance / sqrt(n)), where n is the number of dimensions, in order to help correct for this.
+ * sqrt(n) is chosen since randomly-chosen points have a distance that grows as sqrt(n).</p>
+ *
+ * <p>Note that, computed this way, the similarity always lies in (0,1], so it cannot exceed 1.</p>
+ *
+ * <p>Note that the distance isn't normalized in any way; it's not valid to compare similarities computed from
+ * different domains (different rating scales, for example). Within one domain, normalizing doesn't matter much as
+ * it doesn't change ordering.</p>
*/
public final class EuclideanDistanceSimilarity extends AbstractSimilarity {
@@ -51,8 +61,7 @@ public final class EuclideanDistanceSimi
@Override
double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) {
- // divide denominator by n below to not automatically give users with more overlap more similarity
- return n / (1.0 + Math.sqrt(sumXYdiff2));
+ return 1.0 / (1.0 + Math.sqrt(sumXYdiff2) / Math.sqrt(n));
}
}
Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java?rev=1187238&r1=1187237&r2=1187238&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/EuclideanDistanceSimilarity.java Fri Oct 21 09:22:38 2011
@@ -47,7 +47,7 @@ public class EuclideanDistanceSimilarity
@Override
public double similarity(double dots, double normA, double normB, int numberOfColumns) {
double euclideanDistance = Math.sqrt(normA - 2 * dots + normB);
- return 1.0 - 1.0 / (1.0 + euclideanDistance);
+ return 1.0 / (1.0 + euclideanDistance);
}
@Override
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarityTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarityTest.java?rev=1187238&r1=1187237&r2=1187238&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarityTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarityTest.java Fri Oct 21 09:22:38 2011
@@ -71,7 +71,7 @@ public final class EuclideanDistanceSimi
{-3.0, 2.0},
});
double correlation = new EuclideanDistanceSimilarity(dataModel).userSimilarity(1, 2);
- assertCorrelationEquals(0.24357264905599915, correlation);
+ assertCorrelationEquals(0.1639607805437114, correlation);
}
@Test
@@ -83,7 +83,7 @@ public final class EuclideanDistanceSimi
{-3.0, 2.0},
});
double correlation = new EuclideanDistanceSimilarity(dataModel, Weighting.WEIGHTED).userSimilarity(1, 2);
- assertCorrelationEquals(0.747857549685333, correlation);
+ assertCorrelationEquals(0.7213202601812372, correlation);
}
@Test
@@ -95,7 +95,7 @@ public final class EuclideanDistanceSimi
{null, null, 1.0},
});
double correlation = new EuclideanDistanceSimilarity(dataModel).userSimilarity(1, 2);
- assertEquals(0.0, correlation, EPSILON);
+ assertTrue(Double.isNaN(correlation));
}
@Test
@@ -107,7 +107,7 @@ public final class EuclideanDistanceSimi
{70.0, 80.0, 90.0},
});
double correlation = new EuclideanDistanceSimilarity(dataModel).userSimilarity(1, 2);
- assertCorrelationEquals(0.10244407226831752, correlation);
+ assertCorrelationEquals(0.05770363219029305, correlation);
}
@Test
@@ -119,7 +119,7 @@ public final class EuclideanDistanceSimi
{2.0, 5.0, 6.0},
});
double correlation = new EuclideanDistanceSimilarity(dataModel).userSimilarity(1, 2);
- assertCorrelationEquals(0.5598164905901122, correlation);
+ assertCorrelationEquals(0.2843646522044218, correlation);
}
@Test
@@ -131,7 +131,7 @@ public final class EuclideanDistanceSimi
{2.0, 5.0, 6.0},
});
double correlation = new EuclideanDistanceSimilarity(dataModel, Weighting.WEIGHTED).userSimilarity(1, 2);
- assertCorrelationEquals(0.889954122647528, correlation);
+ assertCorrelationEquals(0.8210911630511055, correlation);
}
@Test
@@ -170,7 +170,7 @@ public final class EuclideanDistanceSimi
});
double correlation =
new EuclideanDistanceSimilarity(dataModel).itemSimilarity(0, 1);
- assertCorrelationEquals(0.24357264905599915, correlation);
+ assertCorrelationEquals(0.1639607805437114, correlation);
}
@Test
@@ -182,7 +182,7 @@ public final class EuclideanDistanceSimi
{null, null, 1.0},
});
double correlation = new EuclideanDistanceSimilarity(dataModel).itemSimilarity(1, 2);
- assertEquals(0.0, correlation, EPSILON);
+ assertTrue(Double.isNaN(correlation));
}
@Test
@@ -196,7 +196,7 @@ public final class EuclideanDistanceSimi
});
double correlation =
new EuclideanDistanceSimilarity(dataModel).itemSimilarity(0, 1);
- assertCorrelationEquals(0.10244407226831752, correlation);
+ assertCorrelationEquals(0.05770363219029305, correlation);
}
@Test
@@ -210,7 +210,7 @@ public final class EuclideanDistanceSimi
});
double correlation =
new EuclideanDistanceSimilarity(dataModel).itemSimilarity(0, 1);
- assertCorrelationEquals(0.5598164905901122, correlation);
+ assertCorrelationEquals(0.2843646522044218, correlation);
}
@Test
@@ -224,7 +224,7 @@ public final class EuclideanDistanceSimi
});
ItemSimilarity itemSimilarity = new EuclideanDistanceSimilarity(dataModel, Weighting.WEIGHTED);
double correlation = itemSimilarity.itemSimilarity(0, 1);
- assertCorrelationEquals(0.889954122647528, correlation);
+ assertCorrelationEquals(0.8210911630511055, correlation);
}
@Test
Modified: mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java?rev=1187238&r1=1187237&r2=1187238&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/measures/VectorSimilarityMeasuresTest.java Fri Oct 21 09:22:38 2011
@@ -115,6 +115,6 @@ public class VectorSimilarityMeasuresTes
new double[] { 0, 2, 0, 0, 8, 3, 0, 6, 0, 1, 1, 2, 1 },
new double[] { 3, 0, 0, 0, 7, 0, 2, 2, 1, 3, 2, 4, 4 }, EuclideanDistanceSimilarity.class);
- assertEquals(0.887311346, similarity, EPSILON);
+ assertEquals(0.11268865367232477, similarity, EPSILON);
}
}