You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2010/05/09 15:36:39 UTC

svn commit: r942537 - in /lucene/mahout/trunk/core/src: main/java/org/apache/mahout/cf/taste/hadoop/similarity/ main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ test/java/org/apache/mahout/cf/taste/hadoop/similarity/ test/java/org/apache/ma...

Author: srowen
Date: Sun May  9 13:36:38 2010
New Revision: 942537

URL: http://svn.apache.org/viewvc?rev=942537&view=rev
Log:
MAHOUT-393

Added:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/AbstractDistributedItemSimilarity.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedEuclideanDistanceSimilarity.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedItemSimilarity.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedLogLikelihoodSimilarity.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedTanimotoCoefficientSimilarity.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredCosineSimilarity.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersKeyWritable.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersReducer.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedEuclideanDistanceSimilarityTest.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedItemSimilarityTestCase.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedPearsonCorrelationSimilarityTest.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedTanimotoCoefficientSimilarityTestCase.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredCosineSimilarityTest.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredZeroAssumingCosineSimilarityTest.java
Removed:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedSimilarity.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedSimilarityTest.java
Modified:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedPearsonCorrelationSimilarity.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredZeroAssumingCosineSimilarity.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/PreferredItemsPerUserMapper.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarityReducer.java
    lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/AbstractDistributedItemSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/AbstractDistributedItemSimilarity.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/AbstractDistributedItemSimilarity.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/AbstractDistributedItemSimilarity.java Sun May  9 13:36:38 2010
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+import java.util.Iterator;
+
+/**
+ * Abstract base class for implementations of {@link DistributedItemSimilarity} that, by default, do not
+ * give a weight to item vectors and that ensure that the result is within [-1,1].
+ */
+public abstract class AbstractDistributedItemSimilarity
+    implements DistributedItemSimilarity {
+
+  @Override
+  public final double similarity(Iterator<CoRating> coratings,
+      double weightOfItemVectorX, double weightOfItemVectorY,
+      int numberOfUsers) {
+
+    double result = doComputeResult(coratings, weightOfItemVectorX, weightOfItemVectorY, numberOfUsers);
+
+    if (result < -1.0) {
+      result = -1.0;
+    } else if (result > 1.0) {
+      result = 1.0;
+    }
+    return result;
+  }
+
+  /**
+   * By default no weight is computed and NaN is returned; subclasses can override this
+   * when they need a weight.
+   */
+  @Override
+  public double weightOfItemVector(Iterator<Float> prefValues) {
+    return Double.NaN;
+  }
+
+  protected abstract double doComputeResult(Iterator<CoRating> coratings,
+      double weightOfItemVectorX, double weightOfItemVectorY,
+      int numberOfUsers);
+}

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedEuclideanDistanceSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedEuclideanDistanceSimilarity.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedEuclideanDistanceSimilarity.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedEuclideanDistanceSimilarity.java Sun May  9 13:36:38 2010
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+import java.util.Iterator;
+
+import org.apache.mahout.cf.taste.impl.similarity.EuclideanDistanceSimilarity;
+
+/**
+ * distributed version of {@link EuclideanDistanceSimilarity}
+ */
+public class DistributedEuclideanDistanceSimilarity extends AbstractDistributedItemSimilarity {
+
+  @Override
+  protected double doComputeResult(Iterator<CoRating> coratings,
+      double weightOfItemVectorX, double weightOfItemVectorY,
+      int numberOfUsers) {
+
+    double n=0;
+    double sumXYdiff2 = 0;
+
+    while (coratings.hasNext()) {
+      CoRating coRating = coratings.next();
+      double diff = coRating.getPrefValueX() - coRating.getPrefValueY();
+      sumXYdiff2 += diff * diff;
+      n++;
+    }
+
+    return (n / (1.0 + Math.sqrt(sumXYdiff2)));
+  }
+}

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedItemSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedItemSimilarity.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedItemSimilarity.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedItemSimilarity.java Sun May  9 13:36:38 2010
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+import java.util.Iterator;
+
+/**
+ * Models the pairwise similarity computation of items in a distributed manner.
+ */
+public interface DistributedItemSimilarity {
+
+  /**
+   * compute the weight of an item vector (called in an early stage of the map-reduce steps)
+   *
+   * @param prefValues the preference values of the item vector
+   * @return the weight of the item vector
+   */
+  double weightOfItemVector(Iterator<Float> prefValues);
+
+  /**
+   * compute the similarity for a pair of item-vectors
+   *
+   * @param coratings all coratings for these items
+   * @param weightOfItemVectorX the weight computed for the first vector
+   * @param weightOfItemVectorY the weight computed for the second vector
+   * @param numberOfUsers the overall number of users
+   * @return the similarity of the two item vectors
+   */
+  double similarity(Iterator<CoRating> coratings,
+                    double weightOfItemVectorX,
+                    double weightOfItemVectorY,
+                    int numberOfUsers);
+}

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedLogLikelihoodSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedLogLikelihoodSimilarity.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedLogLikelihoodSimilarity.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedLogLikelihoodSimilarity.java Sun May  9 13:36:38 2010
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+import java.util.Iterator;
+
+import org.apache.mahout.cf.taste.impl.similarity.LogLikelihoodSimilarity;
+
+/**
+ * Distributed version of {@link LogLikelihoodSimilarity}
+ */
+public class DistributedLogLikelihoodSimilarity extends AbstractDistributedItemSimilarity {
+
+  @Override
+  protected double doComputeResult(Iterator<CoRating> coratings,
+      double weightOfItemVectorX, double weightOfItemVectorY,
+      int numberOfUsers) {
+
+    int preferringXandY = 0;
+    while (coratings.hasNext()) {
+      coratings.next();
+      preferringXandY++;
+    }
+
+    if (preferringXandY == 0) {
+      return Double.NaN;
+    }
+
+    int preferringX = (int) weightOfItemVectorX;
+    int preferringY = (int) weightOfItemVectorY;
+
+    double logLikelihood = twoLogLambda(preferringXandY,
+                                        preferringX - preferringXandY,
+                                        preferringY,
+                                        numberOfUsers - preferringY);
+
+    return 1.0 - 1.0 / (1.0 + logLikelihood);
+  }
+
+  @Override
+  public double weightOfItemVector(Iterator<Float> prefValues) {
+    double nonZeroEntries = 0;
+    while (prefValues.hasNext()) {
+      prefValues.next();
+      nonZeroEntries++;
+    }
+    return nonZeroEntries;
+  }
+
+  private static double twoLogLambda(double k1, double k2, double n1, double n2) {
+    double p = (k1 + k2) / (n1 + n2);
+    return 2.0 * (logL(k1 / n1, k1, n1)
+                  + logL(k2 / n2, k2, n2)
+                  - logL(p, k1, n1)
+                  - logL(p, k2, n2));
+  }
+
+  private static double logL(double p, double k, double n) {
+    return k * safeLog(p) + (n - k) * safeLog(1.0 - p);
+  }
+
+  private static double safeLog(double d) {
+    return d <= 0.0 ? 0.0 : Math.log(d);
+  }
+
+}

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedPearsonCorrelationSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedPearsonCorrelationSimilarity.java?rev=942537&r1=942536&r2=942537&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedPearsonCorrelationSimilarity.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedPearsonCorrelationSimilarity.java Sun May  9 13:36:38 2010
@@ -19,10 +19,17 @@ package org.apache.mahout.cf.taste.hadoo
 
 import java.util.Iterator;
 
-public class DistributedPearsonCorrelationSimilarity implements DistributedSimilarity {
+import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity;
+
+/**
+ * Distributed version of {@link PearsonCorrelationSimilarity}
+ */
+public class DistributedPearsonCorrelationSimilarity extends AbstractDistributedItemSimilarity {
 
   @Override
-  public double similarity(Iterator<CoRating> coRatings, double weightOfItemVectorX, double weightOfItemVectorY) {
+  protected double doComputeResult(Iterator<CoRating> coRatings,
+      double weightOfItemVectorX, double weightOfItemVectorY,
+      int numberOfUsers) {
 
     int count = 0;
     double sumX = 0.0;
@@ -68,10 +75,4 @@ public class DistributedPearsonCorrelati
 
     return centeredSumXY / denominator;
   }
-
-  @Override
-  public double weightOfItemVector(Iterator<Float> prefValues) {
-    return Double.NaN;
-  }
-
 }

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedTanimotoCoefficientSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedTanimotoCoefficientSimilarity.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedTanimotoCoefficientSimilarity.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedTanimotoCoefficientSimilarity.java Sun May  9 13:36:38 2010
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+import java.util.Iterator;
+
+import org.apache.mahout.cf.taste.impl.similarity.TanimotoCoefficientSimilarity;
+
+/**
+ * Distributed version of {@link TanimotoCoefficientSimilarity}
+ */
+public class DistributedTanimotoCoefficientSimilarity extends AbstractDistributedItemSimilarity {
+
+	@Override
+	protected double doComputeResult(Iterator<CoRating> coratings,
+			double weightOfItemVectorX, double weightOfItemVectorY,
+			int numberOfUsers) {
+
+	  double preferringXAndY = 0;
+	  while (coratings.hasNext()) {
+	    coratings.next();
+	    preferringXAndY++;
+	  }
+
+	  if (preferringXAndY == 0) {
+	    return Double.NaN;
+	  }
+
+	  return (preferringXAndY / (weightOfItemVectorX + weightOfItemVectorY - preferringXAndY));
+	}
+
+	@Override
+	public double weightOfItemVector(Iterator<Float> prefValues) {
+		double nonZeroEntries = 0;
+		while (prefValues.hasNext()) {
+		  prefValues.next();
+		  nonZeroEntries++;
+		}
+		return nonZeroEntries;
+	}
+
+}

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredCosineSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredCosineSimilarity.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredCosineSimilarity.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredCosineSimilarity.java Sun May  9 13:36:38 2010
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+import java.util.Iterator;
+
+import org.apache.mahout.cf.taste.impl.similarity.UncenteredCosineSimilarity;
+
+/**
+ * Distributed version of {@link UncenteredCosineSimilarity}
+ */
+public class DistributedUncenteredCosineSimilarity extends AbstractDistributedItemSimilarity {
+
+  @Override
+  protected double doComputeResult(Iterator<CoRating> coratings,
+      double weightOfItemVectorX, double weightOfItemVectorY,
+      int numberOfUsers) {
+
+    int n = 0;
+    double sumXY = 0;
+    double sumX2 = 0;
+    double sumY2 = 0;
+
+    while (coratings.hasNext()) {
+      CoRating coRating = coratings.next();
+      double x = coRating.getPrefValueX();
+      double y = coRating.getPrefValueY();
+
+      sumXY += x * y;
+      sumX2 += x * x;
+      sumY2 += y * y;
+      n++;
+    }
+
+    if (n == 0) {
+      return Double.NaN;
+    }
+    double denominator = Math.sqrt(sumX2) * Math.sqrt(sumY2);
+    if (denominator == 0.0) {
+      // One or both items have only zero ratings among the coratings;
+      // the cosine is undefined in that case
+      return Double.NaN;
+    }
+    return sumXY / denominator;
+
+  }
+}

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredZeroAssumingCosineSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredZeroAssumingCosineSimilarity.java?rev=942537&r1=942536&r2=942537&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredZeroAssumingCosineSimilarity.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredZeroAssumingCosineSimilarity.java Sun May  9 13:36:38 2010
@@ -19,11 +19,17 @@ package org.apache.mahout.cf.taste.hadoo
 
 import java.util.Iterator;
 
+/**
+ * distributed cosine similarity that assumes that all unknown preferences
+ * are zeros and that does not center data
+ */
 public final class DistributedUncenteredZeroAssumingCosineSimilarity
-    implements DistributedSimilarity {
+    extends AbstractDistributedItemSimilarity {
 
   @Override
-  public double similarity(Iterator<CoRating> coRatings, double weightOfItemVectorX, double weightOfItemVectorY) {
+  protected double doComputeResult(Iterator<CoRating> coRatings,
+      double weightOfItemVectorX, double weightOfItemVectorY,
+      int numberOfUsers) {
 
     double sumXY = 0;
     while (coRatings.hasNext()) {

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersKeyWritable.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersKeyWritable.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersKeyWritable.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersKeyWritable.java Sun May  9 13:36:38 2010
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity.item;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.Serializable;
+
+import org.apache.hadoop.io.VLongWritable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.WritableComparator;
+import org.apache.hadoop.io.WritableUtils;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Partitioner;
+import org.apache.mahout.common.RandomUtils;
+
+/**
+ * a writable key that is used by {@link CountUsersMapper} and {@link CountUsersReducer} to
+ * count unique users by sending all userIDs to the same reducer and having them sorted in
+ * ascending order so that there's no buffering necessary when counting them
+ */
+public class CountUsersKeyWritable implements WritableComparable<CountUsersKeyWritable> {
+
+  private long userID;
+
+  public CountUsersKeyWritable() {
+  }
+
+  public CountUsersKeyWritable(long userID) {
+    this.userID = userID;
+  }
+
+  public long getUserID() {
+    return userID;
+  }
+
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    userID = WritableUtils.readVLong(in);
+  }
+
+  @Override
+  public void write(DataOutput out) throws IOException {
+    WritableUtils.writeVLong(out, userID);
+  }
+
+  @Override
+  public int compareTo(CountUsersKeyWritable other) {
+    return (userID == other.userID ? 0 : userID < other.userID ? -1 : 1);
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (!(other instanceof CountUsersKeyWritable)) {
+      return false;
+    }
+    return userID == ((CountUsersKeyWritable) other).userID;
+  }
+
+  @Override
+  public int hashCode() {
+    return RandomUtils.hashLong(userID);
+  }
+
+  /**
+   * all userIDs go to the same partition
+   */
+  public static class CountUsersPartitioner implements Partitioner<CountUsersKeyWritable,VLongWritable> {
+
+    @Override
+    public int getPartition(CountUsersKeyWritable key, VLongWritable value, int numPartitions) {
+      return 0;
+    }
+
+    @Override
+    public void configure(JobConf conf) {}
+  }
+
+  /**
+   * all keys compare as equal, so all userIDs are grouped into a single reduce call
+   */
+  public static class CountUsersGroupComparator extends WritableComparator implements Serializable {
+
+    public CountUsersGroupComparator() {
+      super(CountUsersKeyWritable.class, true);
+    }
+
+    @Override
+    public int compare(WritableComparable a, WritableComparable b) {
+      return 0;
+    }
+  }
+}

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java Sun May  9 13:36:38 2010
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity.item;
+
+import java.io.IOException;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.VLongWritable;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+
+/**
+ * emits each userID as both key and value so that a secondary sort can be used on them
+ */
+public class CountUsersMapper extends MapReduceBase
+    implements Mapper<LongWritable,Text,CountUsersKeyWritable,VLongWritable> {
+
+  private static final Pattern DELIMITER = Pattern.compile("[\t,]");
+
+  @Override
+  public void map(LongWritable arg0, Text value,
+      OutputCollector<CountUsersKeyWritable,VLongWritable> out, Reporter reporter)
+      throws IOException {
+
+    String[] tokens = DELIMITER.split(value.toString());
+    long userID = Long.parseLong(tokens[0]);
+
+    out.collect(new CountUsersKeyWritable(userID), new VLongWritable(userID));
+  }
+
+}

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersReducer.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersReducer.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersReducer.java Sun May  9 13:36:38 2010
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity.item;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.VLongWritable;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+/**
+ * counts all unique users; the secondary sort ensures that we see userIDs sorted in ascending order,
+ * so we don't have to buffer all of them
+ */
+public class CountUsersReducer extends MapReduceBase
+    implements Reducer<CountUsersKeyWritable,VLongWritable,IntWritable,NullWritable> {
+
+  @Override
+  public void reduce(CountUsersKeyWritable key, Iterator<VLongWritable> userIDs,
+      OutputCollector<IntWritable,NullWritable> out, Reporter reporter)
+      throws IOException {
+
+    long lastSeenUserID = Long.MIN_VALUE;
+    int numberOfUsers = 0;
+
+    while (userIDs.hasNext()) {
+      long currentUserID = userIDs.next().get();
+      if (currentUserID > lastSeenUserID) {
+        lastSeenUserID = currentUserID;
+        numberOfUsers++;
+      }
+    }
+    out.collect(new IntWritable(numberOfUsers), NullWritable.get());
+  }
+
+}

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java?rev=942537&r1=942536&r2=942537&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java Sun May  9 13:36:38 2010
@@ -17,12 +17,20 @@
 
 package org.apache.mahout.cf.taste.hadoop.similarity.item;
 
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
 import java.util.Map;
 
 import org.apache.commons.cli2.Option;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.NullWritable;
 import org.apache.hadoop.io.VLongWritable;
 import org.apache.hadoop.mapred.JobClient;
 import org.apache.hadoop.mapred.JobConf;
@@ -36,7 +44,7 @@ import org.apache.mahout.cf.taste.hadoop
 import org.apache.mahout.cf.taste.hadoop.EntityPrefWritableArrayWritable;
 import org.apache.mahout.cf.taste.hadoop.ToUserPrefsMapper;
 import org.apache.mahout.cf.taste.hadoop.similarity.CoRating;
-import org.apache.mahout.cf.taste.hadoop.similarity.DistributedSimilarity;
+import org.apache.mahout.cf.taste.hadoop.similarity.DistributedItemSimilarity;
 import org.apache.mahout.common.AbstractJob;
 
 /**
@@ -87,7 +95,7 @@ import org.apache.mahout.common.Abstract
  * the form userID,itemID,preference
  * computed, one per line</li>
  * <li>-Dmapred.output.dir=(path): output path where the computations output should go</li>
- * <li>--similarityClassname (classname): an implemenation of {@link DistributedSimilarity} used to compute the
+ * <li>--similarityClassname (classname): an implementation of {@link DistributedItemSimilarity} used to compute the
  * similarity</li>
  * </ol>
  *
@@ -103,6 +111,9 @@ public final class ItemSimilarityJob ext
   public static final String DISTRIBUTED_SIMILARITY_CLASSNAME =
     "org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob.distributedSimilarityClassname";
 
+  public static final String NUMBER_OF_USERS =
+    "org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob.numberOfUsers";
+
   @Override
   public int run(String[] args) throws IOException {
 
@@ -122,9 +133,32 @@ public final class ItemSimilarityJob ext
     String outputPath = originalConf.get("mapred.output.dir");
     String tempDirPath = parsedArgs.get("--tempDir");
 
+    String countUsersPath = tempDirPath + "/countUsers";
     String itemVectorsPath = tempDirPath + "/itemVectors";
     String userVectorsPath = tempDirPath + "/userVectors";
 
+    /* count all unique users */
+    JobConf countUsers = prepareJobConf(inputPath,
+                                         countUsersPath,
+                                         TextInputFormat.class,
+                                         CountUsersMapper.class,
+                                         CountUsersKeyWritable.class,
+                                         VLongWritable.class,
+                                         CountUsersReducer.class,
+                                         IntWritable.class,
+                                         NullWritable.class,
+                                         TextOutputFormat.class);
+
+    countUsers.setPartitionerClass(
+        CountUsersKeyWritable.CountUsersPartitioner.class);
+    countUsers.setOutputValueGroupingComparator(
+        CountUsersKeyWritable.CountUsersGroupComparator.class);
+
+    JobClient.runJob(countUsers);
+
+    int numberOfUsers =
+        readNumberOfUsers(countUsers, (countUsersPath + "/part-00000"));
+
     JobConf itemVectors = prepareJobConf(inputPath,
                                          itemVectorsPath,
                                          TextInputFormat.class,
@@ -163,6 +197,8 @@ public final class ItemSimilarityJob ext
                                         TextOutputFormat.class);
 
     similarity.set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
+    similarity.setInt(NUMBER_OF_USERS, numberOfUsers);
+
     JobClient.runJob(similarity);
 
     return 0;
@@ -172,9 +208,22 @@ public final class ItemSimilarityJob ext
     ToolRunner.run(new ItemSimilarityJob(), args);
   }
 
-  static DistributedSimilarity instantiateSimilarity(String classname) {
+  /** Reads all of outputFile as UTF-8 text and parses its trimmed content as an int. */
+  static int readNumberOfUsers(JobConf conf, String outputFile) throws IOException {
+    InputStream stream = null;
+    try {
+      stream = FileSystem.get(conf).open(new Path(outputFile));
+      ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+      IOUtils.copyBytes(stream, buffer, conf);
+      return Integer.parseInt(new String(buffer.toByteArray(), Charset.forName("UTF-8")).trim());
+    } finally {
+      IOUtils.closeStream(stream);
+    }
+  }
+
+  static DistributedItemSimilarity instantiateSimilarity(String classname) {
     try {
-      return (DistributedSimilarity) Class.forName(classname).newInstance();
+      return (DistributedItemSimilarity) Class.forName(classname).newInstance();
     } catch (ClassNotFoundException cnfe) {
       throw new IllegalStateException(cnfe);
     } catch (InstantiationException ie) {

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/PreferredItemsPerUserMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/PreferredItemsPerUserMapper.java?rev=942537&r1=942536&r2=942537&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/PreferredItemsPerUserMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/PreferredItemsPerUserMapper.java Sun May  9 13:36:38 2010
@@ -29,7 +29,7 @@ import org.apache.hadoop.mapred.OutputCo
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.mahout.cf.taste.hadoop.EntityPrefWritable;
 import org.apache.mahout.cf.taste.hadoop.EntityPrefWritableArrayWritable;
-import org.apache.mahout.cf.taste.hadoop.similarity.DistributedSimilarity;
+import org.apache.mahout.cf.taste.hadoop.similarity.DistributedItemSimilarity;
 
 /**
  * for each item-vector, we compute its weight here and map out all entries with the user as key,
@@ -38,7 +38,7 @@ import org.apache.mahout.cf.taste.hadoop
 public final class PreferredItemsPerUserMapper extends MapReduceBase
     implements Mapper<VLongWritable,EntityPrefWritableArrayWritable,VLongWritable,ItemPrefWithItemVectorWeightWritable> {
 
-  private DistributedSimilarity distributedSimilarity;
+  private DistributedItemSimilarity distributedSimilarity;
 
   @Override
   public void configure(JobConf jobConf) {

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarityReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarityReducer.java?rev=942537&r1=942536&r2=942537&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarityReducer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarityReducer.java Sun May  9 13:36:38 2010
@@ -28,21 +28,27 @@ import org.apache.hadoop.mapred.Reducer;
 import org.apache.hadoop.mapred.Reporter;
 import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;
 import org.apache.mahout.cf.taste.hadoop.similarity.CoRating;
-import org.apache.mahout.cf.taste.hadoop.similarity.DistributedSimilarity;
+import org.apache.mahout.cf.taste.hadoop.similarity.DistributedItemSimilarity;
 
 /**
- * Finally compute the similarity for each item-pair, that has been corated at least once
+ * Finally compute the similarity for each item-pair, that has been corated at least once.
+ * Computation is done with an external implementation of {@link DistributedItemSimilarity}.
  */
 public final class SimilarityReducer extends MapReduceBase
     implements Reducer<ItemPairWritable,CoRating,EntityEntityWritable,DoubleWritable> {
 
-  private DistributedSimilarity distributedSimilarity;
+  private DistributedItemSimilarity distributedItemSimilarity;
+  private int numberOfUsers;
 
   @Override
   public void configure(JobConf jobConf) {
     super.configure(jobConf);
-    distributedSimilarity =
+    distributedItemSimilarity =
       ItemSimilarityJob.instantiateSimilarity(jobConf.get(ItemSimilarityJob.DISTRIBUTED_SIMILARITY_CLASSNAME));
+    numberOfUsers = jobConf.getInt(ItemSimilarityJob.NUMBER_OF_USERS, -1);
+    if (numberOfUsers <= 0) {
+      throw new IllegalStateException("Number of users was not set correctly");
+    }
   }
 
   @Override
@@ -53,7 +59,7 @@ public final class SimilarityReducer ext
       throws IOException {
 
     double similarity =
-      distributedSimilarity.similarity(coRatings, pair.getItemAWeight(), pair.getItemBWeight());
+      distributedItemSimilarity.similarity(coRatings, pair.getItemAWeight(), pair.getItemBWeight(), numberOfUsers);
 
     if (!Double.isNaN(similarity)) {
       output.collect(pair.getItemItemWritable(), new DoubleWritable(similarity));

Added: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedEuclideanDistanceSimilarityTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedEuclideanDistanceSimilarityTest.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedEuclideanDistanceSimilarityTest.java (added)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedEuclideanDistanceSimilarityTest.java Sun May  9 13:36:38 2010
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+/**
+ * Tests {@link DistributedEuclideanDistanceSimilarity} on small hand-written preference arrays.
+ */
+public class DistributedEuclideanDistanceSimilarityTest extends
+    DistributedItemSimilarityTestCase {
+
+  public void testEuclideanDistance() throws Exception {
+    // identical preferences for both items: expected similarity is 1.0
+    assertSimilar(new DistributedEuclideanDistanceSimilarity(), 2,
+        new Float[] { 3.0f, -2.0f },
+        new Float[] { 3.0f, -2.0f }, 1.0);
+    // identical constant preferences: still expected to be 1.0
+    assertSimilar(new DistributedEuclideanDistanceSimilarity(), 2,
+        new Float[] { 3.0f, 3.0f },
+        new Float[] { 3.0f, 3.0f }, 1.0);
+    // three co-rated positions with differing values
+    assertSimilar(new DistributedEuclideanDistanceSimilarity(), 2,
+        new Float[] { 1.0f, 2.0f, 3.0f },
+        new Float[] { 2.0f, 5.0f, 6.0f }, 0.5598164905901122);
+    // no position holds a non-NaN value in both arrays: expected similarity is 0.0
+    assertSimilar(new DistributedEuclideanDistanceSimilarity(), 2,
+        new Float[] { 1.0f, Float.NaN },
+        new Float[] { Float.NaN, 1.0f }, 0.0);
+  }
+}

Added: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedItemSimilarityTestCase.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedItemSimilarityTestCase.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedItemSimilarityTestCase.java (added)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedItemSimilarityTestCase.java Sun May  9 13:36:38 2010
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob;
+import org.apache.mahout.cf.taste.impl.TasteTestCase;
+
+/**
+ * Base test case for all tests of implementations of {@link DistributedItemSimilarity}.
+ */
+public abstract class DistributedItemSimilarityTestCase extends TasteTestCase {
+
+  /**
+   * Emulates the way the similarity would be computed by {@link ItemSimilarityJob}.
+   *
+   * @param similarity the implementation under test
+   * @param numberOfUsers value handed unchanged to the similarity as the number of users
+   * @param prefsX preferences for the first item; {@link Float#NaN} entries are left out
+   * @param prefsY preferences for the second item, paired with prefsX by index (needs >= prefsX.length entries)
+   * @param expectedSimilarity value the computed similarity is compared to, within EPSILON
+   */
+  protected static void assertSimilar(DistributedItemSimilarity similarity,
+                                    int numberOfUsers,
+                                    Float[] prefsX,
+                                    Float[] prefsY,
+                                    double expectedSimilarity) {
+    // collect the non-NaN preferences of each array; the item weights are computed from these only
+    List<Float> nonNaNPrefsX = new LinkedList<Float>();
+    for (Float prefX : prefsX) {
+      if (!prefX.isNaN()) {
+        nonNaNPrefsX.add(prefX);
+      }
+    }
+
+    List<Float> nonNaNPrefsY = new LinkedList<Float>();
+    for (Float prefY : prefsY) {
+      if (!prefY.isNaN()) {
+        nonNaNPrefsY.add(prefY);
+      }
+    }
+
+    double weightX = similarity.weightOfItemVector(nonNaNPrefsX.iterator());
+    double weightY = similarity.weightOfItemVector(nonNaNPrefsY.iterator());
+
+    List<CoRating> coRatings = new LinkedList<CoRating>();
+    // a CoRating is built only for indices where both arrays hold a non-NaN value
+    for (int n = 0; n < prefsX.length; n++) {
+      Float x = prefsX[n];
+      Float y = prefsY[n];
+
+      if (!x.isNaN() && !y.isNaN()) {
+        coRatings.add(new CoRating(x, y));
+      }
+    }
+
+    double result = similarity.similarity(coRatings.iterator(), weightX, weightY, numberOfUsers);
+    assertEquals(expectedSimilarity, result, EPSILON);
+  }
+
+}

Added: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedPearsonCorrelationSimilarityTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedPearsonCorrelationSimilarityTest.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedPearsonCorrelationSimilarityTest.java (added)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedPearsonCorrelationSimilarityTest.java Sun May  9 13:36:38 2010
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+/**
+ * Tests {@link DistributedPearsonCorrelationSimilarity} on small hand-written preference arrays.
+ */
+public class DistributedPearsonCorrelationSimilarityTest extends DistributedItemSimilarityTestCase {
+
+  public void testPearsonCorrelation() throws Exception {
+    // identical, varying preferences for both items: expected similarity is 1.0
+    assertSimilar(new DistributedPearsonCorrelationSimilarity(), 2,
+        new Float[] { 3.0f, -2.0f },
+        new Float[] { 3.0f, -2.0f }, 1.0);
+    // identical constant preferences: the expected result is NaN
+    assertSimilar(new DistributedPearsonCorrelationSimilarity(), 2,
+        new Float[] { 3.0f, 3.0f },
+        new Float[] { 3.0f, 3.0f }, Double.NaN);
+    // no position holds a non-NaN value in both arrays: the expected result is NaN
+    assertSimilar(new DistributedPearsonCorrelationSimilarity(), 2,
+        new Float[] { Float.NaN, 3.0f },
+        new Float[] { 3.0f, Float.NaN }, Double.NaN);
+  }
+}

Added: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedTanimotoCoefficientSimilarityTestCase.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedTanimotoCoefficientSimilarityTestCase.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedTanimotoCoefficientSimilarityTestCase.java (added)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedTanimotoCoefficientSimilarityTestCase.java Sun May  9 13:36:38 2010
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+/**
+ * Tests {@link DistributedTanimotoCoefficientSimilarity} on small hand-written preference arrays.
+ */
+public class DistributedTanimotoCoefficientSimilarityTestCase
+    extends DistributedItemSimilarityTestCase {
+
+  public void testTanimoto() throws Exception {
+    // one non-NaN entry in X, four in Y, a single position set in both: expected 0.25
+    assertSimilar(new DistributedTanimotoCoefficientSimilarity(), 2,
+        new Float[] { Float.NaN, Float.NaN, Float.NaN, Float.NaN, 1.0f },
+        new Float[] { Float.NaN, 1.0f, 1.0f, 1.0f, 1.0f }, 0.25);
+    // no position holds a non-NaN value in both arrays: the expected result is NaN
+    assertSimilar(new DistributedTanimotoCoefficientSimilarity(), 2,
+        new Float[] { Float.NaN, 1.0f },
+        new Float[] { 1.0f, Float.NaN }, Double.NaN);
+    // both arrays hold exactly the same single non-NaN entry: expected 1.0
+    assertSimilar(new DistributedTanimotoCoefficientSimilarity(), 2,
+        new Float[] { 1.0f, Float.NaN },
+        new Float[] { 1.0f, Float.NaN }, 1.0);
+  }
+}

Added: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredCosineSimilarityTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredCosineSimilarityTest.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredCosineSimilarityTest.java (added)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredCosineSimilarityTest.java Sun May  9 13:36:38 2010
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+/**
+ * Tests {@link DistributedUncenteredCosineSimilarity} on small hand-written preference arrays.
+ */
+public class DistributedUncenteredCosineSimilarityTest extends
+    DistributedItemSimilarityTestCase {
+
+  public void testUncenteredCosine() throws Exception {
+    // only the last position holds a non-NaN value in both arrays: expected 1.0
+    assertSimilar(new DistributedUncenteredCosineSimilarity(), 2,
+        new Float[] { Float.NaN, Float.NaN, Float.NaN, Float.NaN, 1.0f },
+        new Float[] { Float.NaN, 1.0f, 1.0f, 1.0f, 1.0f }, 1.0);
+    // no position holds a non-NaN value in both arrays: the expected result is NaN
+    assertSimilar(new DistributedUncenteredCosineSimilarity(), 2,
+        new Float[] { Float.NaN, 1.0f },
+        new Float[] { 1.0f, Float.NaN }, Double.NaN);
+    // both arrays hold exactly the same single non-NaN entry: expected 1.0
+    assertSimilar(new DistributedUncenteredCosineSimilarity(), 2,
+        new Float[] { 1.0f, Float.NaN },
+        new Float[] { 1.0f, Float.NaN }, 1.0);
+    // the first two positions are set in both arrays, the third only in X
+    assertSimilar(new DistributedUncenteredCosineSimilarity(), 2,
+        new Float[] { 1.0f, 1.0f, 2.0f },
+        new Float[] { 3.0f, 5.0f, Float.NaN }, 0.970142);
+  }
+
+}

Added: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredZeroAssumingCosineSimilarityTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredZeroAssumingCosineSimilarityTest.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredZeroAssumingCosineSimilarityTest.java (added)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredZeroAssumingCosineSimilarityTest.java Sun May  9 13:36:38 2010
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+/**
+ * Tests {@link DistributedUncenteredZeroAssumingCosineSimilarity} on small hand-written preference arrays.
+ */
+public class DistributedUncenteredZeroAssumingCosineSimilarityTest extends
+    DistributedItemSimilarityTestCase {
+
+  public void testUncenteredZeroAssumingCosine() throws Exception {
+    // one non-NaN entry in X, four in Y, a single position set in both: expected 0.5
+    assertSimilar(new DistributedUncenteredZeroAssumingCosineSimilarity(), 2,
+        new Float[] { Float.NaN, Float.NaN, Float.NaN, Float.NaN, 1.0f },
+        new Float[] { Float.NaN, 1.0f, 1.0f, 1.0f, 1.0f }, 0.5);
+    // no position holds a non-NaN value in both arrays: the expected result is NaN
+    assertSimilar(new DistributedUncenteredZeroAssumingCosineSimilarity(), 2,
+        new Float[] { Float.NaN, 1.0f },
+        new Float[] { 1.0f, Float.NaN }, Double.NaN);
+    // both arrays hold exactly the same single non-NaN entry: expected 1.0
+    assertSimilar(new DistributedUncenteredZeroAssumingCosineSimilarity(), 2,
+        new Float[] { 1.0f, Float.NaN },
+        new Float[] { 1.0f, Float.NaN }, 1.0);
+  }
+}

Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java?rev=942537&r1=942536&r2=942537&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java Sun May  9 13:36:38 2010
@@ -30,7 +30,9 @@ import java.util.Set;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.VLongWritable;
 import org.apache.hadoop.mapred.JobConf;
@@ -63,6 +65,52 @@ public final class ItemSimilarityTest ex
     EasyMock.verify(output);
   }
 
+  public void testCountUsersMapper() throws Exception {
+    OutputCollector<CountUsersKeyWritable,VLongWritable> output = EasyMock.createMock(OutputCollector.class);
+    output.collect(keyForUserID(12L), EasyMock.eq(new VLongWritable(12L)));
+    output.collect(keyForUserID(35L), EasyMock.eq(new VLongWritable(35L)));
+    EasyMock.replay(output);
+    // map two "userID,itemID,preference" lines; each is expected to be collected under its userID
+    CountUsersMapper mapper = new CountUsersMapper();
+    mapper.map(null, new Text("12,100,1.3"), output, null);
+    mapper.map(null, new Text("35,100,3.0"), output, null);
+    // fails if one of the collect() calls recorded above did not happen
+    EasyMock.verify(output);
+  }
+
+  /** EasyMock argument matcher accepting any {@link CountUsersKeyWritable} that carries the given userID. */
+  static CountUsersKeyWritable keyForUserID(final long userID) {
+    EasyMock.reportMatcher(new IArgumentMatcher() {
+      @Override
+      public boolean matches(Object argument) {
+        return argument instanceof CountUsersKeyWritable
+            && userID == ((CountUsersKeyWritable) argument).getUserID();
+      }
+      // describe the expectation so that a failed expectation names the expected userID
+      @Override
+      public void appendTo(StringBuffer buffer) {
+        buffer.append("keyForUserID(").append(userID).append(')');
+      }
+    });
+    // the expectation comes from the matcher registered above; the return value is only a placeholder
+    return null;
+  }
+
+  public void testCountUsersReducer() throws Exception {
+    // expect a single output record whose key is the number of distinct userIDs
+    OutputCollector<IntWritable,NullWritable> output = EasyMock.createMock(OutputCollector.class);
+    output.collect(new IntWritable(3), NullWritable.get());
+    EasyMock.replay(output);
+    // ascending userIDs with duplicates: the distinct values are 1, 3 and 5
+    List<VLongWritable> userIDs = Arrays.asList(new VLongWritable(1L), new VLongWritable(1L),
+                                                new VLongWritable(3L), new VLongWritable(5L),
+                                                new VLongWritable(5L), new VLongWritable(5L));
+    // key and reporter are passed as null in this test
+    new CountUsersReducer().reduce(null, userIDs.iterator(), output, null);
+
+    EasyMock.verify(output);
+  }
+
   public void testToItemVectorReducer() throws Exception {
 
     List<EntityPrefWritable> userPrefs = Arrays.asList(
@@ -80,6 +128,7 @@ public final class ItemSimilarityTest ex
     EasyMock.verify(output);
   }
 
+
   static EntityPrefWritableArrayWritable equalToUserPrefs(
       final Collection<EntityPrefWritable> prefsToCheck) {
     EasyMock.reportMatcher(new IArgumentMatcher() {
@@ -217,6 +266,7 @@ public final class ItemSimilarityTest ex
     JobConf conf = new JobConf();
     conf.set(ItemSimilarityJob.DISTRIBUTED_SIMILARITY_CLASSNAME,
         "org.apache.mahout.cf.taste.hadoop.similarity.DistributedUncenteredZeroAssumingCosineSimilarity");
+    conf.setInt(ItemSimilarityJob.NUMBER_OF_USERS, 1);
 
     output.collect(new EntityEntityWritable(12L, 34L), new DoubleWritable(0.5));
 
@@ -255,11 +305,11 @@ public final class ItemSimilarityTest ex
 
       BufferedWriter writer = new BufferedWriter(new FileWriter(tmpDirPath+"/prefs.txt"));
       try {
-        writer.write("1,2,1\n" +
+        writer.write("2,1,1\n" +
+                     "1,2,1\n" +
+                     "3,4,1\n" +
                      "1,3,2\n" +
-                     "2,1,1\n" +
-                     "2,3,1\n" +
-                     "3,4,1\n");
+                     "2,3,1\n");
       } finally {
         writer.close();
       }
@@ -276,6 +326,10 @@ public final class ItemSimilarityTest ex
       similarityJob.run(new String[] { "--tempDir", tmpDirPath+"/tmp", "--similarityClassname",
           "org.apache.mahout.cf.taste.hadoop.similarity.DistributedUncenteredZeroAssumingCosineSimilarity"});
 
+      int numberOfUsers = ItemSimilarityJob.readNumberOfUsers(new JobConf(), tmpDirPath + "/tmp/countUsers/part-00000");
+
+      assertEquals(3, numberOfUsers);
+
       String filePath = tmpDirPath+"/output/part-00000";
       BufferedReader reader = new BufferedReader(new FileReader(filePath));