You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2010/05/09 15:36:39 UTC
svn commit: r942537 - in /lucene/mahout/trunk/core/src:
main/java/org/apache/mahout/cf/taste/hadoop/similarity/
main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/
test/java/org/apache/mahout/cf/taste/hadoop/similarity/
test/java/org/apache/ma...
Author: srowen
Date: Sun May 9 13:36:38 2010
New Revision: 942537
URL: http://svn.apache.org/viewvc?rev=942537&view=rev
Log:
MAHOUT-393
Added:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/AbstractDistributedItemSimilarity.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedEuclideanDistanceSimilarity.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedItemSimilarity.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedLogLikelihoodSimilarity.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedTanimotoCoefficientSimilarity.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredCosineSimilarity.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersKeyWritable.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersReducer.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedEuclideanDistanceSimilarityTest.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedItemSimilarityTestCase.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedPearsonCorrelationSimilarityTest.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedTanimotoCoefficientSimilarityTestCase.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredCosineSimilarityTest.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredZeroAssumingCosineSimilarityTest.java
Removed:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedSimilarity.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedSimilarityTest.java
Modified:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedPearsonCorrelationSimilarity.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredZeroAssumingCosineSimilarity.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/PreferredItemsPerUserMapper.java
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarityReducer.java
lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/AbstractDistributedItemSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/AbstractDistributedItemSimilarity.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/AbstractDistributedItemSimilarity.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/AbstractDistributedItemSimilarity.java Sun May 9 13:36:38 2010
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+import java.util.Iterator;
+
+/**
+ * Base class for implementations of {@link DistributedItemSimilarity} that, by default, do not assign a
+ * weight to item vectors. Its only job besides delegating is to make sure that the computed similarity
+ * lies within [-1,1].
+ */
+public abstract class AbstractDistributedItemSimilarity
+    implements DistributedItemSimilarity {
+
+  /**
+   * Asks the subclass for the raw similarity via
+   * {@link #doComputeResult(Iterator, double, double, int)} and cuts it off at -1 and 1.
+   * A NaN result satisfies neither comparison and is therefore returned unchanged.
+   */
+  @Override
+  public final double similarity(Iterator<CoRating> coratings,
+                                 double weightOfItemVectorX,
+                                 double weightOfItemVectorY,
+                                 int numberOfUsers) {
+
+    double raw = doComputeResult(coratings, weightOfItemVectorX, weightOfItemVectorY, numberOfUsers);
+
+    if (raw > 1.0) {
+      return 1.0;
+    }
+    if (raw < -1.0) {
+      return -1.0;
+    }
+    return raw;
+  }
+
+  /**
+   * No weight is computed by default ({@link Double#NaN} is returned); subclasses that need a weight
+   * override this method.
+   */
+  @Override
+  public double weightOfItemVector(Iterator<Float> prefValues) {
+    return Double.NaN;
+  }
+
+  /**
+   * The actual similarity computation, supplied by subclasses. The value returned here is limited to
+   * [-1,1] by {@link #similarity(Iterator, double, double, int)} before it is handed to the caller.
+   *
+   * @param coratings all coratings for the pair of items
+   * @param weightOfItemVectorX the weight computed for the first vector
+   * @param weightOfItemVectorY the weight computed for the second vector
+   * @param numberOfUsers the overall number of users
+   * @return the raw (not yet limited) similarity, possibly NaN
+   */
+  protected abstract double doComputeResult(Iterator<CoRating> coratings,
+                                            double weightOfItemVectorX,
+                                            double weightOfItemVectorY,
+                                            int numberOfUsers);
+}
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedEuclideanDistanceSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedEuclideanDistanceSimilarity.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedEuclideanDistanceSimilarity.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedEuclideanDistanceSimilarity.java Sun May 9 13:36:38 2010
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+import java.util.Iterator;
+
+import org.apache.mahout.cf.taste.impl.similarity.EuclideanDistanceSimilarity;
+
+/**
+ * Distributed version of {@link EuclideanDistanceSimilarity}
+ */
+public class DistributedEuclideanDistanceSimilarity extends AbstractDistributedItemSimilarity {
+
+  /**
+   * Returns <code>n / (1 + d)</code>, where <code>n</code> is the number of coratings and <code>d</code>
+   * is the square root of the summed squared differences between the two preference values of each
+   * corating. Neither the item vector weights nor the number of users are used.
+   */
+  @Override
+  protected double doComputeResult(Iterator<CoRating> coratings,
+                                   double weightOfItemVectorX,
+                                   double weightOfItemVectorY,
+                                   int numberOfUsers) {
+
+    double count = 0;
+    double squaredDistance = 0;
+
+    for (; coratings.hasNext(); count++) {
+      CoRating current = coratings.next();
+      double delta = current.getPrefValueX() - current.getPrefValueY();
+      squaredDistance += delta * delta;
+    }
+
+    double distance = Math.sqrt(squaredDistance);
+    return count / (1.0 + distance);
+  }
+}
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedItemSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedItemSimilarity.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedItemSimilarity.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedItemSimilarity.java Sun May 9 13:36:38 2010
@@ -0,0 +1,48 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+import java.util.Iterator;
+
+/**
+ * Models the pairwise similarity computation of items in a distributed manner
+ */
+public interface DistributedItemSimilarity {
+
+  /**
+   * Compute the weight of an item vector (called in an early stage of the map-reduce steps).
+   *
+   * @param prefValues iterator over the preference values of the item vector
+   * @return the weight of the item vector; its meaning is defined by the implementation, which later
+   *         receives it back as an argument of {@link #similarity(Iterator, double, double, int)}
+   */
+  double weightOfItemVector(Iterator<Float> prefValues);
+
+  /**
+   * Compute the similarity for a pair of item vectors.
+   *
+   * @param coratings all coratings for these items
+   * @param weightOfItemVectorX the weight computed for the first vector
+   * @param weightOfItemVectorY the weight computed for the second vector
+   * @param numberOfUsers the overall number of users
+   * @return the similarity of the two items
+   */
+  double similarity(Iterator<CoRating> coratings, double weightOfItemVectorX, double weightOfItemVectorY,
+                    int numberOfUsers);
+}
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedLogLikelihoodSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedLogLikelihoodSimilarity.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedLogLikelihoodSimilarity.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedLogLikelihoodSimilarity.java Sun May 9 13:36:38 2010
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+import java.util.Iterator;
+
+import org.apache.mahout.cf.taste.impl.similarity.LogLikelihoodSimilarity;
+
+/**
+ * Distributed version of {@link LogLikelihoodSimilarity}
+ */
+public class DistributedLogLikelihoodSimilarity extends AbstractDistributedItemSimilarity {
+
+  /**
+   * Only the number of coratings matters here, not their preference values. The item vector weights are
+   * the entry counts produced by {@link #weightOfItemVector(Iterator)}.
+   *
+   * @return <code>1 - 1 / (1 + logLikelihood)</code>, or NaN if there is no corating at all
+   */
+  @Override
+  protected double doComputeResult(Iterator<CoRating> coratings,
+                                   double weightOfItemVectorX,
+                                   double weightOfItemVectorY,
+                                   int numberOfUsers) {
+
+    int bothCount = 0;
+    for (; coratings.hasNext(); bothCount++) {
+      coratings.next();
+    }
+
+    if (bothCount == 0) {
+      return Double.NaN;
+    }
+
+    // the weights are entry counts, so they are converted back to ints
+    int xCount = (int) weightOfItemVectorX;
+    int yCount = (int) weightOfItemVectorY;
+
+    int onlyXCount = xCount - bothCount;
+    int notYCount = numberOfUsers - yCount;
+
+    double logLikelihood = twoLogLambda(bothCount, onlyXCount, yCount, notYCount);
+
+    return 1.0 - 1.0 / (1.0 + logLikelihood);
+  }
+
+  /**
+   * The weight of an item vector is the number of entries the iterator delivers.
+   */
+  @Override
+  public double weightOfItemVector(Iterator<Float> prefValues) {
+    double entries = 0;
+    for (; prefValues.hasNext(); entries++) {
+      prefValues.next();
+    }
+    return entries;
+  }
+
+  /**
+   * Twice the difference between the log-likelihood with a separate ratio per sample (k1/n1 and k2/n2)
+   * and the log-likelihood with one pooled ratio for both samples.
+   */
+  private static double twoLogLambda(double k1, double k2, double n1, double n2) {
+    double pooledRatio = (k1 + k2) / (n1 + n2);
+    double separateFirst = logL(k1 / n1, k1, n1);
+    double separateSecond = logL(k2 / n2, k2, n2);
+    double pooledFirst = logL(pooledRatio, k1, n1);
+    double pooledSecond = logL(pooledRatio, k2, n2);
+    return 2.0 * (separateFirst + separateSecond - pooledFirst - pooledSecond);
+  }
+
+  /**
+   * <code>k * log(p) + (n - k) * log(1 - p)</code>, with the logarithms taken via
+   * {@link #safeLog(double)}.
+   */
+  private static double logL(double p, double k, double n) {
+    return k * safeLog(p) + (n - k) * safeLog(1.0 - p);
+  }
+
+  /**
+   * Natural logarithm that yields 0 instead of -Infinity or NaN for arguments that are zero or negative.
+   */
+  private static double safeLog(double d) {
+    if (d <= 0.0) {
+      return 0.0;
+    }
+    return Math.log(d);
+  }
+
+}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedPearsonCorrelationSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedPearsonCorrelationSimilarity.java?rev=942537&r1=942536&r2=942537&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedPearsonCorrelationSimilarity.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedPearsonCorrelationSimilarity.java Sun May 9 13:36:38 2010
@@ -19,10 +19,17 @@ package org.apache.mahout.cf.taste.hadoo
import java.util.Iterator;
-public class DistributedPearsonCorrelationSimilarity implements DistributedSimilarity {
+import org.apache.mahout.cf.taste.impl.similarity.PearsonCorrelationSimilarity;
+
+/**
+ * Distributed version of {@link PearsonCorrelationSimilarity}
+ */
+public class DistributedPearsonCorrelationSimilarity extends AbstractDistributedItemSimilarity {
@Override
- public double similarity(Iterator<CoRating> coRatings, double weightOfItemVectorX, double weightOfItemVectorY) {
+ protected double doComputeResult(Iterator<CoRating> coRatings,
+ double weightOfItemVectorX, double weightOfItemVectorY,
+ int numberOfUsers) {
int count = 0;
double sumX = 0.0;
@@ -68,10 +75,4 @@ public class DistributedPearsonCorrelati
return centeredSumXY / denominator;
}
-
- @Override
- public double weightOfItemVector(Iterator<Float> prefValues) {
- return Double.NaN;
- }
-
}
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedTanimotoCoefficientSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedTanimotoCoefficientSimilarity.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedTanimotoCoefficientSimilarity.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedTanimotoCoefficientSimilarity.java Sun May 9 13:36:38 2010
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+import java.util.Iterator;
+
+import org.apache.mahout.cf.taste.impl.similarity.TanimotoCoefficientSimilarity;
+
+/**
+ * Distributed version of {@link TanimotoCoefficientSimilarity}
+ */
+public class DistributedTanimotoCoefficientSimilarity extends AbstractDistributedItemSimilarity {
+
+  /**
+   * Divides the number of coratings by the sum of both item vector weights minus the number of coratings.
+   * The weights are the entry counts produced by {@link #weightOfItemVector(Iterator)}; the number of
+   * users is not used.
+   *
+   * @return the quotient described above, or NaN if there is no corating at all
+   */
+  @Override
+  protected double doComputeResult(Iterator<CoRating> coratings,
+                                   double weightOfItemVectorX,
+                                   double weightOfItemVectorY,
+                                   int numberOfUsers) {
+
+    double bothCount = 0;
+    for (; coratings.hasNext(); bothCount++) {
+      coratings.next();
+    }
+
+    if (bothCount == 0) {
+      return Double.NaN;
+    }
+
+    double eitherCount = weightOfItemVectorX + weightOfItemVectorY - bothCount;
+    return bothCount / eitherCount;
+  }
+
+  /**
+   * The weight of an item vector is the number of entries the iterator delivers.
+   */
+  @Override
+  public double weightOfItemVector(Iterator<Float> prefValues) {
+    double entries = 0;
+    for (; prefValues.hasNext(); entries++) {
+      prefValues.next();
+    }
+    return entries;
+  }
+
+}
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredCosineSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredCosineSimilarity.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredCosineSimilarity.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredCosineSimilarity.java Sun May 9 13:36:38 2010
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+import java.util.Iterator;
+
+import org.apache.mahout.cf.taste.impl.similarity.UncenteredCosineSimilarity;
+
+/**
+ * Distributed version of {@link UncenteredCosineSimilarity}
+ */
+public class DistributedUncenteredCosineSimilarity extends AbstractDistributedItemSimilarity {
+
+  /**
+   * Cosine over the corated preference values only: the sum of the products divided by the product of
+   * the square roots of the two sums of squares. Neither the item vector weights nor the number of users
+   * are used.
+   *
+   * @return the cosine, or NaN if there is no corating or the denominator is zero
+   */
+  @Override
+  protected double doComputeResult(Iterator<CoRating> coratings,
+                                   double weightOfItemVectorX,
+                                   double weightOfItemVectorY,
+                                   int numberOfUsers) {
+
+    int count = 0;
+    double dotProduct = 0;
+    double squaresOfX = 0;
+    double squaresOfY = 0;
+
+    for (; coratings.hasNext(); count++) {
+      CoRating current = coratings.next();
+      double x = current.getPrefValueX();
+      double y = current.getPrefValueY();
+
+      dotProduct += x * y;
+      squaresOfX += x * x;
+      squaresOfY += y * y;
+    }
+
+    if (count == 0) {
+      return Double.NaN;
+    }
+
+    double denominator = Math.sqrt(squaresOfX) * Math.sqrt(squaresOfY);
+    if (denominator == 0.0) {
+      // The corated preference values of at least one item sum to zero when squared;
+      // no similarity can be stated under this measure
+      return Double.NaN;
+    }
+
+    return dotProduct / denominator;
+  }
+}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredZeroAssumingCosineSimilarity.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredZeroAssumingCosineSimilarity.java?rev=942537&r1=942536&r2=942537&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredZeroAssumingCosineSimilarity.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredZeroAssumingCosineSimilarity.java Sun May 9 13:36:38 2010
@@ -19,11 +19,17 @@ package org.apache.mahout.cf.taste.hadoo
import java.util.Iterator;
+/**
+ * distributed cosine similarity that assumes that all unknown preferences
+ * are zeros and that does not center data
+ */
public final class DistributedUncenteredZeroAssumingCosineSimilarity
- implements DistributedSimilarity {
+ extends AbstractDistributedItemSimilarity {
@Override
- public double similarity(Iterator<CoRating> coRatings, double weightOfItemVectorX, double weightOfItemVectorY) {
+ protected double doComputeResult(Iterator<CoRating> coRatings,
+ double weightOfItemVectorX, double weightOfItemVectorY,
+ int numberOfUsers) {
double sumXY = 0;
while (coRatings.hasNext()) {
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersKeyWritable.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersKeyWritable.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersKeyWritable.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersKeyWritable.java Sun May 9 13:36:38 2010
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity.item;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.io.Serializable;
+
+import org.apache.hadoop.io.VLongWritable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.io.WritableComparator;
+import org.apache.hadoop.io.WritableUtils;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Partitioner;
+import org.apache.mahout.common.RandomUtils;
+
+/**
+ * A writable key that is used by {@link CountUsersMapper} and {@link CountUsersReducer} to
+ * count unique users by sending all userIDs to the same reducer and having them sorted in
+ * ascending order, so that no buffering is necessary when counting them
+ */
+public class CountUsersKeyWritable implements WritableComparable<CountUsersKeyWritable> {
+
+  private long userID;
+
+  /** No-arg constructor; the userID is filled in later by {@link #readFields(DataInput)}. */
+  public CountUsersKeyWritable() {
+  }
+
+  public CountUsersKeyWritable(long userID) {
+    this.userID = userID;
+  }
+
+  public long getUserID() {
+    return userID;
+  }
+
+  /** Reads the userID as a variable-length long. */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    userID = WritableUtils.readVLong(in);
+  }
+
+  /** Writes the userID as a variable-length long. */
+  @Override
+  public void write(DataOutput out) throws IOException {
+    WritableUtils.writeVLong(out, userID);
+  }
+
+  /** Orders keys by ascending userID. */
+  @Override
+  public int compareTo(CountUsersKeyWritable other) {
+    if (userID < other.userID) {
+      return -1;
+    }
+    if (userID > other.userID) {
+      return 1;
+    }
+    return 0;
+  }
+
+  /** Two keys are equal when they carry the same userID. */
+  @Override
+  public boolean equals(Object other) {
+    if (other instanceof CountUsersKeyWritable) {
+      return ((CountUsersKeyWritable) other).userID == userID;
+    }
+    return false;
+  }
+
+  @Override
+  public int hashCode() {
+    return RandomUtils.hashLong(userID);
+  }
+
+  /**
+   * Sends every userID to the same partition (partition 0)
+   */
+  public static class CountUsersPartitioner implements Partitioner<CountUsersKeyWritable,VLongWritable> {
+
+    @Override
+    public int getPartition(CountUsersKeyWritable key, VLongWritable value, int numPartitions) {
+      return 0;
+    }
+
+    /** Nothing to configure. */
+    @Override
+    public void configure(JobConf conf) {}
+  }
+
+  /**
+   * Compares every pair of keys as equal, so that all userIDs go to the same reducer
+   */
+  public static class CountUsersGroupComparator extends WritableComparator implements Serializable {
+
+    public CountUsersGroupComparator() {
+      super(CountUsersKeyWritable.class, true);
+    }
+
+    @Override
+    public int compare(WritableComparable a, WritableComparable b) {
+      return 0;
+    }
+  }
+}
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java Sun May 9 13:36:38 2010
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity.item;
+
+import java.io.IOException;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.VLongWritable;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+
+/**
+ * Maps out the userIDs in a way that allows a secondary sort on them: each userID is emitted
+ * both as the key and as the value
+ */
+public class CountUsersMapper extends MapReduceBase
+    implements Mapper<LongWritable,Text,CountUsersKeyWritable,VLongWritable> {
+
+  /** Fields of an input line are separated by a tab or a comma. */
+  private static final Pattern DELIMITER = Pattern.compile("[\t,]");
+
+  /**
+   * Parses the first field of the input line as the userID and emits it.
+   *
+   * @param arg0 the input key, not used
+   * @param value one line of input
+   * @param out receives the userID wrapped as key and as value
+   * @param reporter not used
+   */
+  @Override
+  public void map(LongWritable arg0,
+                  Text value,
+                  OutputCollector<CountUsersKeyWritable,VLongWritable> out,
+                  Reporter reporter) throws IOException {
+
+    String line = value.toString();
+    long userID = Long.parseLong(DELIMITER.split(line)[0]);
+
+    CountUsersKeyWritable outKey = new CountUsersKeyWritable(userID);
+    out.collect(outKey, new VLongWritable(userID));
+  }
+
+}
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersReducer.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersReducer.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersReducer.java Sun May 9 13:36:38 2010
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity.item;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.VLongWritable;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+/**
+ * Counts all unique users. The userIDs are expected to arrive sorted in ascending order (via
+ * secondary sort), so a new user is recognized by an ID greater than the previous one and the
+ * IDs do not have to be buffered
+ */
+public class CountUsersReducer extends MapReduceBase
+    implements Reducer<CountUsersKeyWritable,VLongWritable,IntWritable,NullWritable> {
+
+  /**
+   * Walks over the userIDs once and emits the number of distinct ones as the output key.
+   *
+   * @param key the key of the group, not used
+   * @param userIDs the userIDs, expected in ascending order, duplicates allowed
+   * @param out receives exactly one record: the number of unique users and a {@link NullWritable}
+   * @param reporter not used
+   */
+  @Override
+  public void reduce(CountUsersKeyWritable key, Iterator<VLongWritable> userIDs,
+                     OutputCollector<IntWritable,NullWritable> out, Reporter reporter)
+    throws IOException {
+
+    // An explicit flag instead of a Long.MIN_VALUE sentinel: with the sentinel alone, a user whose
+    // ID is Long.MIN_VALUE would never be greater than "the last seen ID" and would go uncounted
+    boolean sawAnyUser = false;
+    long lastSeenUserID = Long.MIN_VALUE;
+    int numberOfUsers = 0;
+
+    while (userIDs.hasNext()) {
+      long currentUserID = userIDs.next().get();
+      if (!sawAnyUser || currentUserID > lastSeenUserID) {
+        sawAnyUser = true;
+        lastSeenUserID = currentUserID;
+        numberOfUsers++;
+      }
+    }
+    out.collect(new IntWritable(numberOfUsers), NullWritable.get());
+  }
+
+}
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java?rev=942537&r1=942536&r2=942537&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java Sun May 9 13:36:38 2010
@@ -17,12 +17,20 @@
package org.apache.mahout.cf.taste.hadoop.similarity.item;
+import java.io.ByteArrayOutputStream;
import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
import java.util.Map;
import org.apache.commons.cli2.Option;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.VLongWritable;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
@@ -36,7 +44,7 @@ import org.apache.mahout.cf.taste.hadoop
import org.apache.mahout.cf.taste.hadoop.EntityPrefWritableArrayWritable;
import org.apache.mahout.cf.taste.hadoop.ToUserPrefsMapper;
import org.apache.mahout.cf.taste.hadoop.similarity.CoRating;
-import org.apache.mahout.cf.taste.hadoop.similarity.DistributedSimilarity;
+import org.apache.mahout.cf.taste.hadoop.similarity.DistributedItemSimilarity;
import org.apache.mahout.common.AbstractJob;
/**
@@ -87,7 +95,7 @@ import org.apache.mahout.common.Abstract
* the form userID,itemID,preference
* computed, one per line</li>
* <li>-Dmapred.output.dir=(path): output path where the computations output should go</li>
- * <li>--similarityClassname (classname): an implemenation of {@link DistributedSimilarity} used to compute the
+ * <li>--similarityClassname (classname): an implementation of {@link DistributedItemSimilarity} used to compute the
* similarity</li>
* </ol>
*
@@ -103,6 +111,9 @@ public final class ItemSimilarityJob ext
public static final String DISTRIBUTED_SIMILARITY_CLASSNAME =
"org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob.distributedSimilarityClassname";
+ public static final String NUMBER_OF_USERS =
+ "org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob.numberOfUsers";
+
@Override
public int run(String[] args) throws IOException {
@@ -122,9 +133,32 @@ public final class ItemSimilarityJob ext
String outputPath = originalConf.get("mapred.output.dir");
String tempDirPath = parsedArgs.get("--tempDir");
+ String countUsersPath = tempDirPath + "/countUsers";
String itemVectorsPath = tempDirPath + "/itemVectors";
String userVectorsPath = tempDirPath + "/userVectors";
+ /* count all unique users */
+ JobConf countUsers = prepareJobConf(inputPath,
+ countUsersPath,
+ TextInputFormat.class,
+ CountUsersMapper.class,
+ CountUsersKeyWritable.class,
+ VLongWritable.class,
+ CountUsersReducer.class,
+ IntWritable.class,
+ NullWritable.class,
+ TextOutputFormat.class);
+
+ countUsers.setPartitionerClass(
+ CountUsersKeyWritable.CountUsersPartitioner.class);
+ countUsers.setOutputValueGroupingComparator(
+ CountUsersKeyWritable.CountUsersGroupComparator.class);
+
+ JobClient.runJob(countUsers);
+
+ int numberOfUsers =
+ readNumberOfUsers(countUsers, (countUsersPath + "/part-00000"));
+
JobConf itemVectors = prepareJobConf(inputPath,
itemVectorsPath,
TextInputFormat.class,
@@ -163,6 +197,8 @@ public final class ItemSimilarityJob ext
TextOutputFormat.class);
similarity.set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
+ similarity.setInt(NUMBER_OF_USERS, numberOfUsers);
+
JobClient.runJob(similarity);
return 0;
@@ -172,9 +208,22 @@ public final class ItemSimilarityJob ext
ToolRunner.run(new ItemSimilarityJob(), args);
}
- static DistributedSimilarity instantiateSimilarity(String classname) {
+ /**
+  * Reads the user count from a file that contains a single integer as UTF-8 text; leading and
+  * trailing whitespace is ignored.
+  *
+  * @param conf configuration used to obtain the file system and to copy the file contents
+  * @param outputFile path of the file holding the count
+  * @return the parsed number of users
+  * @throws IOException if the file cannot be opened or read
+  * @throws NumberFormatException if the trimmed file content is not a valid integer
+  */
+ static int readNumberOfUsers(JobConf conf, String outputFile) throws IOException {
+   FileSystem fileSystem = FileSystem.get(conf);
+   InputStream countStream = null;
+   try {
+     countStream = fileSystem.open(new Path(outputFile));
+     ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+     IOUtils.copyBytes(countStream, buffer, conf);
+     String content = new String(buffer.toByteArray(), Charset.forName("UTF-8"));
+     return Integer.parseInt(content.trim());
+   } finally {
+     // closeStream tolerates null, which covers a failed open()
+     IOUtils.closeStream(countStream);
+   }
+ }
+
+ static DistributedItemSimilarity instantiateSimilarity(String classname) {
try {
- return (DistributedSimilarity) Class.forName(classname).newInstance();
+ return (DistributedItemSimilarity) Class.forName(classname).newInstance();
} catch (ClassNotFoundException cnfe) {
throw new IllegalStateException(cnfe);
} catch (InstantiationException ie) {
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/PreferredItemsPerUserMapper.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/PreferredItemsPerUserMapper.java?rev=942537&r1=942536&r2=942537&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/PreferredItemsPerUserMapper.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/PreferredItemsPerUserMapper.java Sun May 9 13:36:38 2010
@@ -29,7 +29,7 @@ import org.apache.hadoop.mapred.OutputCo
import org.apache.hadoop.mapred.Reporter;
import org.apache.mahout.cf.taste.hadoop.EntityPrefWritable;
import org.apache.mahout.cf.taste.hadoop.EntityPrefWritableArrayWritable;
-import org.apache.mahout.cf.taste.hadoop.similarity.DistributedSimilarity;
+import org.apache.mahout.cf.taste.hadoop.similarity.DistributedItemSimilarity;
/**
* for each item-vector, we compute its weight here and map out all entries with the user as key,
@@ -38,7 +38,7 @@ import org.apache.mahout.cf.taste.hadoop
public final class PreferredItemsPerUserMapper extends MapReduceBase
implements Mapper<VLongWritable,EntityPrefWritableArrayWritable,VLongWritable,ItemPrefWithItemVectorWeightWritable> {
- private DistributedSimilarity distributedSimilarity;
+ private DistributedItemSimilarity distributedSimilarity;
@Override
public void configure(JobConf jobConf) {
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarityReducer.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarityReducer.java?rev=942537&r1=942536&r2=942537&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarityReducer.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/SimilarityReducer.java Sun May 9 13:36:38 2010
@@ -28,21 +28,27 @@ import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;
import org.apache.mahout.cf.taste.hadoop.similarity.CoRating;
-import org.apache.mahout.cf.taste.hadoop.similarity.DistributedSimilarity;
+import org.apache.mahout.cf.taste.hadoop.similarity.DistributedItemSimilarity;
/**
- * Finally compute the similarity for each item-pair, that has been corated at least once
+ * Finally compute the similarity for each item-pair, that has been corated at least once.
+ * Computation is done with an external implementation of {@link DistributedItemSimilarity}.
*/
public final class SimilarityReducer extends MapReduceBase
implements Reducer<ItemPairWritable,CoRating,EntityEntityWritable,DoubleWritable> {
- private DistributedSimilarity distributedSimilarity;
+ private DistributedItemSimilarity distributedItemSimilarity;
+ private int numberOfUsers;
@Override
public void configure(JobConf jobConf) {
super.configure(jobConf);
- distributedSimilarity =
+ distributedItemSimilarity =
ItemSimilarityJob.instantiateSimilarity(jobConf.get(ItemSimilarityJob.DISTRIBUTED_SIMILARITY_CLASSNAME));
+ numberOfUsers = jobConf.getInt(ItemSimilarityJob.NUMBER_OF_USERS, -1);
+ if (numberOfUsers <= 0) {
+ throw new IllegalStateException("Number of users was not set correctly");
+ }
}
@Override
@@ -53,7 +59,7 @@ public final class SimilarityReducer ext
throws IOException {
double similarity =
- distributedSimilarity.similarity(coRatings, pair.getItemAWeight(), pair.getItemBWeight());
+ distributedItemSimilarity.similarity(coRatings, pair.getItemAWeight(), pair.getItemBWeight(), numberOfUsers);
if (!Double.isNaN(similarity)) {
output.collect(pair.getItemItemWritable(), new DoubleWritable(similarity));
Added: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedEuclideanDistanceSimilarityTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedEuclideanDistanceSimilarityTest.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedEuclideanDistanceSimilarityTest.java (added)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedEuclideanDistanceSimilarityTest.java Sun May 9 13:36:38 2010
@@ -0,0 +1,44 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+/**
+ * Unit test for {@link DistributedEuclideanDistanceSimilarity}.
+ */
+public class DistributedEuclideanDistanceSimilarityTest extends
+    DistributedItemSimilarityTestCase {
+
+  public void testEuclideanDistance() throws Exception {
+
+    // identical preference vectors
+    assertEuclidean(new Float[] { 3.0f, -2.0f }, new Float[] { 3.0f, -2.0f }, 1.0);
+    assertEuclidean(new Float[] { 3.0f, 3.0f }, new Float[] { 3.0f, 3.0f }, 1.0);
+
+    // differing preferences at every position
+    assertEuclidean(new Float[] { 1.0f, 2.0f, 3.0f },
+                    new Float[] { 2.0f, 5.0f, 6.0f },
+                    0.5598164905901122);
+
+    // no position carries a preference in both vectors
+    assertEuclidean(new Float[] { 1.0f, Float.NaN }, new Float[] { Float.NaN, 1.0f }, 0.0);
+  }
+
+  /** runs one check against a fresh similarity instance, with a user count of 2 */
+  private static void assertEuclidean(Float[] prefsX, Float[] prefsY, double expected) {
+    assertSimilar(new DistributedEuclideanDistanceSimilarity(), 2, prefsX, prefsY, expected);
+  }
+}
Added: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedItemSimilarityTestCase.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedItemSimilarityTestCase.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedItemSimilarityTestCase.java (added)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedItemSimilarityTestCase.java Sun May 9 13:36:38 2010
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob;
+import org.apache.mahout.cf.taste.impl.TasteTestCase;
+
+/**
+ * Common base class for the tests of {@link DistributedItemSimilarity} implementations.
+ */
+public abstract class DistributedItemSimilarityTestCase extends TasteTestCase {
+
+  /**
+   * Computes a similarity in a way that emulates {@link ItemSimilarityJob} and compares the
+   * result with the expected value.
+   *
+   * <p>Item weights are computed from the non-NaN entries of each vector; co-ratings are built
+   * from the positions at which neither vector holds NaN.</p>
+   *
+   * @param similarity the implementation under test
+   * @param numberOfUsers user count handed through to the similarity computation
+   * @param prefsX preferences for the first item, with {@link Float#NaN} marking a missing one
+   * @param prefsY preferences for the second item, read at every index of {@code prefsX}
+   * @param expectedSimilarity value the result is compared against, within {@code EPSILON}
+   */
+  protected static void assertSimilar(DistributedItemSimilarity similarity,
+                                      int numberOfUsers,
+                                      Float[] prefsX,
+                                      Float[] prefsY,
+                                      double expectedSimilarity) {
+
+    List<Float> ratedX = withoutNaNs(prefsX);
+    List<Float> ratedY = withoutNaNs(prefsY);
+
+    double weightX = similarity.weightOfItemVector(ratedX.iterator());
+    double weightY = similarity.weightOfItemVector(ratedY.iterator());
+
+    List<CoRating> coRatings = new LinkedList<CoRating>();
+    for (int index = 0; index < prefsX.length; index++) {
+      Float x = prefsX[index];
+      Float y = prefsY[index];
+      // only positions rated in both vectors form a co-rating
+      if (x.isNaN() || y.isNaN()) {
+        continue;
+      }
+      coRatings.add(new CoRating(x, y));
+    }
+
+    double actualSimilarity =
+        similarity.similarity(coRatings.iterator(), weightX, weightY, numberOfUsers);
+    assertEquals(expectedSimilarity, actualSimilarity, EPSILON);
+  }
+
+  /** copies all entries that are not NaN into a new list, keeping their order */
+  private static List<Float> withoutNaNs(Float[] prefs) {
+    List<Float> rated = new LinkedList<Float>();
+    for (Float pref : prefs) {
+      if (!pref.isNaN()) {
+        rated.add(pref);
+      }
+    }
+    return rated;
+  }
+
+}
Added: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedPearsonCorrelationSimilarityTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedPearsonCorrelationSimilarityTest.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedPearsonCorrelationSimilarityTest.java (added)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedPearsonCorrelationSimilarityTest.java Sun May 9 13:36:38 2010
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+/**
+ * Unit test for {@link DistributedPearsonCorrelationSimilarity}.
+ */
+public class DistributedPearsonCorrelationSimilarityTest extends DistributedItemSimilarityTestCase {
+
+  public void testPearsonCorrelation() throws Exception {
+
+    // identical vectors with varying preferences
+    assertPearson(new Float[] { 3.0f, -2.0f }, new Float[] { 3.0f, -2.0f }, 1.0);
+
+    // identical vectors whose preferences are all the same value
+    assertPearson(new Float[] { 3.0f, 3.0f }, new Float[] { 3.0f, 3.0f }, Double.NaN);
+
+    // no position carries a preference in both vectors
+    assertPearson(new Float[] { Float.NaN, 3.0f }, new Float[] { 3.0f, Float.NaN }, Double.NaN);
+  }
+
+  /** runs one check against a fresh similarity instance, with a user count of 2 */
+  private static void assertPearson(Float[] prefsX, Float[] prefsY, double expected) {
+    assertSimilar(new DistributedPearsonCorrelationSimilarity(), 2, prefsX, prefsY, expected);
+  }
+}
Added: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedTanimotoCoefficientSimilarityTestCase.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedTanimotoCoefficientSimilarityTestCase.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedTanimotoCoefficientSimilarityTestCase.java (added)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedTanimotoCoefficientSimilarityTestCase.java Sun May 9 13:36:38 2010
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+/**
+ * Unit test for {@link DistributedTanimotoCoefficientSimilarity}.
+ */
+public class DistributedTanimotoCoefficientSimilarityTestCase
+    extends DistributedItemSimilarityTestCase {
+
+  public void testTanimoto() throws Exception {
+
+    // one preference in X, four in Y, exactly one position shared
+    assertTanimoto(new Float[] { Float.NaN, Float.NaN, Float.NaN, Float.NaN, 1.0f },
+                   new Float[] { Float.NaN, 1.0f, 1.0f, 1.0f, 1.0f },
+                   0.25);
+
+    // no position carries a preference in both vectors
+    assertTanimoto(new Float[] { Float.NaN, 1.0f }, new Float[] { 1.0f, Float.NaN }, Double.NaN);
+
+    // both vectors hold a preference at the same single position
+    assertTanimoto(new Float[] { 1.0f, Float.NaN }, new Float[] { 1.0f, Float.NaN }, 1.0);
+  }
+
+  /** runs one check against a fresh similarity instance, with a user count of 2 */
+  private static void assertTanimoto(Float[] prefsX, Float[] prefsY, double expected) {
+    assertSimilar(new DistributedTanimotoCoefficientSimilarity(), 2, prefsX, prefsY, expected);
+  }
+}
Added: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredCosineSimilarityTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredCosineSimilarityTest.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredCosineSimilarityTest.java (added)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredCosineSimilarityTest.java Sun May 9 13:36:38 2010
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+/**
+ * Unit test for {@link DistributedUncenteredCosineSimilarity}.
+ */
+public class DistributedUncenteredCosineSimilarityTest extends
+    DistributedItemSimilarityTestCase {
+
+  public void testUncenteredCosine() throws Exception {
+
+    // one preference in X, four in Y, exactly one position shared
+    assertUncenteredCosine(new Float[] { Float.NaN, Float.NaN, Float.NaN, Float.NaN, 1.0f },
+                           new Float[] { Float.NaN, 1.0f, 1.0f, 1.0f, 1.0f },
+                           1.0);
+
+    // no position carries a preference in both vectors
+    assertUncenteredCosine(new Float[] { Float.NaN, 1.0f },
+                           new Float[] { 1.0f, Float.NaN },
+                           Double.NaN);
+
+    // both vectors hold a preference at the same single position
+    assertUncenteredCosine(new Float[] { 1.0f, Float.NaN },
+                           new Float[] { 1.0f, Float.NaN },
+                           1.0);
+
+    // two shared positions, third position rated in X only
+    assertUncenteredCosine(new Float[] { 1.0f, 1.0f, 2.0f },
+                           new Float[] { 3.0f, 5.0f, Float.NaN },
+                           0.970142);
+  }
+
+  /** runs one check against a fresh similarity instance, with a user count of 2 */
+  private static void assertUncenteredCosine(Float[] prefsX, Float[] prefsY, double expected) {
+    assertSimilar(new DistributedUncenteredCosineSimilarity(), 2, prefsX, prefsY, expected);
+  }
+
+}
Added: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredZeroAssumingCosineSimilarityTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredZeroAssumingCosineSimilarityTest.java?rev=942537&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredZeroAssumingCosineSimilarityTest.java (added)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/DistributedUncenteredZeroAssumingCosineSimilarityTest.java Sun May 9 13:36:38 2010
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.similarity;
+
+/**
+ * Unit test for {@link DistributedUncenteredZeroAssumingCosineSimilarity}.
+ */
+public class DistributedUncenteredZeroAssumingCosineSimilarityTest extends
+    DistributedItemSimilarityTestCase {
+
+  public void testUncenteredZeroAssumingCosine() throws Exception {
+
+    // one preference in X, four in Y, exactly one position shared
+    assertZeroAssumingCosine(new Float[] { Float.NaN, Float.NaN, Float.NaN, Float.NaN, 1.0f },
+                             new Float[] { Float.NaN, 1.0f, 1.0f, 1.0f, 1.0f },
+                             0.5);
+
+    // no position carries a preference in both vectors
+    assertZeroAssumingCosine(new Float[] { Float.NaN, 1.0f },
+                             new Float[] { 1.0f, Float.NaN },
+                             Double.NaN);
+
+    // both vectors hold a preference at the same single position
+    assertZeroAssumingCosine(new Float[] { 1.0f, Float.NaN },
+                             new Float[] { 1.0f, Float.NaN },
+                             1.0);
+  }
+
+  /** runs one check against a fresh similarity instance, with a user count of 2 */
+  private static void assertZeroAssumingCosine(Float[] prefsX, Float[] prefsY, double expected) {
+    assertSimilar(new DistributedUncenteredZeroAssumingCosineSimilarity(), 2,
+                  prefsX, prefsY, expected);
+  }
+}
Modified: lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java?rev=942537&r1=942536&r2=942537&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java (original)
+++ lucene/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java Sun May 9 13:36:38 2010
@@ -30,7 +30,9 @@ import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.VLongWritable;
import org.apache.hadoop.mapred.JobConf;
@@ -63,6 +65,52 @@ public final class ItemSimilarityTest ex
EasyMock.verify(output);
}
+ /**
+  * Feeds two preference lines to {@link CountUsersMapper} and expects one collected record per
+  * line, whose key and value both carry the user ID of that line.
+  */
+ public void testCountUsersMapper() throws Exception {
+   CountUsersMapper countUsersMapper = new CountUsersMapper();
+
+   OutputCollector<CountUsersKeyWritable,VLongWritable> collector =
+       EasyMock.createMock(OutputCollector.class);
+   // record phase: keyForUserID must be evaluated before EasyMock.eq within each call
+   collector.collect(keyForUserID(12L), EasyMock.eq(new VLongWritable(12L)));
+   collector.collect(keyForUserID(35L), EasyMock.eq(new VLongWritable(35L)));
+   EasyMock.replay(collector);
+
+   for (String line : new String[] { "12,100,1.3", "35,100,3.0" }) {
+     countUsersMapper.map(null, new Text(line), collector, null);
+   }
+
+   EasyMock.verify(collector);
+ }
+
+ /**
+  * Reports an EasyMock argument matcher that accepts any {@link CountUsersKeyWritable} whose
+  * user ID equals the given one.
+  *
+  * @param userID the user ID the matched key must carry
+  * @return always {@code null}; the value only fills the argument slot of the recorded call,
+  *         the actual comparison is done by the reported matcher
+  */
+ static CountUsersKeyWritable keyForUserID(final long userID) {
+   EasyMock.reportMatcher(new IArgumentMatcher() {
+     @Override
+     public boolean matches(Object argument) {
+       return argument instanceof CountUsersKeyWritable
+           && userID == ((CountUsersKeyWritable) argument).getUserID();
+     }
+
+     @Override
+     public void appendTo(StringBuffer buffer) {
+       // describe the expectation, so that a failed verification names the expected user ID
+       buffer.append("keyForUserID(").append(userID).append(')');
+     }
+   });
+
+   return null;
+ }
+
+ /**
+  * Passes the user IDs 1, 1, 3, 5, 5, 5 to {@link CountUsersReducer} and expects a single
+  * collected record carrying the count 3.
+  */
+ public void testCountUsersReducer() throws Exception {
+
+   // ascending user IDs with repetitions: three distinct users
+   List<VLongWritable> sortedUserIDs = Arrays.asList(
+       new VLongWritable(1L), new VLongWritable(1L),
+       new VLongWritable(3L),
+       new VLongWritable(5L), new VLongWritable(5L), new VLongWritable(5L));
+
+   OutputCollector<IntWritable,NullWritable> collector =
+       EasyMock.createMock(OutputCollector.class);
+   collector.collect(new IntWritable(3), NullWritable.get());
+   EasyMock.replay(collector);
+
+   CountUsersReducer reducer = new CountUsersReducer();
+   reducer.reduce(null, sortedUserIDs.iterator(), collector, null);
+
+   EasyMock.verify(collector);
+ }
+
public void testToItemVectorReducer() throws Exception {
List<EntityPrefWritable> userPrefs = Arrays.asList(
@@ -80,6 +128,7 @@ public final class ItemSimilarityTest ex
EasyMock.verify(output);
}
+
static EntityPrefWritableArrayWritable equalToUserPrefs(
final Collection<EntityPrefWritable> prefsToCheck) {
EasyMock.reportMatcher(new IArgumentMatcher() {
@@ -217,6 +266,7 @@ public final class ItemSimilarityTest ex
JobConf conf = new JobConf();
conf.set(ItemSimilarityJob.DISTRIBUTED_SIMILARITY_CLASSNAME,
"org.apache.mahout.cf.taste.hadoop.similarity.DistributedUncenteredZeroAssumingCosineSimilarity");
+ conf.setInt(ItemSimilarityJob.NUMBER_OF_USERS, 1);
output.collect(new EntityEntityWritable(12L, 34L), new DoubleWritable(0.5));
@@ -255,11 +305,11 @@ public final class ItemSimilarityTest ex
BufferedWriter writer = new BufferedWriter(new FileWriter(tmpDirPath+"/prefs.txt"));
try {
- writer.write("1,2,1\n" +
+ writer.write("2,1,1\n" +
+ "1,2,1\n" +
+ "3,4,1\n" +
"1,3,2\n" +
- "2,1,1\n" +
- "2,3,1\n" +
- "3,4,1\n");
+ "2,3,1\n");
} finally {
writer.close();
}
@@ -276,6 +326,10 @@ public final class ItemSimilarityTest ex
similarityJob.run(new String[] { "--tempDir", tmpDirPath+"/tmp", "--similarityClassname",
"org.apache.mahout.cf.taste.hadoop.similarity.DistributedUncenteredZeroAssumingCosineSimilarity"});
+ int numberOfUsers = ItemSimilarityJob.readNumberOfUsers(new JobConf(), tmpDirPath + "/tmp/countUsers/part-00000");
+
+ assertEquals(3, numberOfUsers);
+
String filePath = tmpDirPath+"/output/part-00000";
BufferedReader reader = new BufferedReader(new FileReader(filePath));