You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ra...@apache.org on 2018/06/04 14:29:43 UTC
[41/53] [abbrv] [partial] mahout git commit: end of day 6-2-2018
http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java
new file mode 100644
index 0000000..59c30d9
--- /dev/null
+++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java
@@ -0,0 +1,343 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+import java.util.concurrent.Callable;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.common.Weighting;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+import com.google.common.base.Preconditions;
+
+/** Abstract superclass encapsulating functionality that is common to most implementations in this package. */
+abstract class AbstractSimilarity extends AbstractItemSimilarity implements UserSimilarity {
+
+ private PreferenceInferrer inferrer;
+ private final boolean weighted;
+ private final boolean centerData;
+ private int cachedNumItems;
+ private int cachedNumUsers;
+ private final RefreshHelper refreshHelper;
+
+ /**
+ * <p>
+ * Creates a possibly weighted {@link AbstractSimilarity}.
+ * </p>
+ */
+ AbstractSimilarity(final DataModel dataModel, Weighting weighting, boolean centerData) throws TasteException {
+ super(dataModel);
+ this.weighted = weighting == Weighting.WEIGHTED;
+ this.centerData = centerData;
+ this.cachedNumItems = dataModel.getNumItems();
+ this.cachedNumUsers = dataModel.getNumUsers();
+ this.refreshHelper = new RefreshHelper(new Callable<Object>() {
+ @Override
+ public Object call() throws TasteException {
+ cachedNumItems = dataModel.getNumItems();
+ cachedNumUsers = dataModel.getNumUsers();
+ return null;
+ }
+ });
+ }
+
+ final PreferenceInferrer getPreferenceInferrer() {
+ return inferrer;
+ }
+
+ @Override
+ public final void setPreferenceInferrer(PreferenceInferrer inferrer) {
+ Preconditions.checkArgument(inferrer != null, "inferrer is null");
+ refreshHelper.addDependency(inferrer);
+ refreshHelper.removeDependency(this.inferrer);
+ this.inferrer = inferrer;
+ }
+
+ final boolean isWeighted() {
+ return weighted;
+ }
+
+ /**
+ * <p>
+ * Several subclasses in this package implement this method to actually compute the similarity from figures
+ * computed over users or items. Note that the computations in this class "center" the data, such that X and
+ * Y's mean are 0.
+ * </p>
+ *
+ * <p>
+ * Note that the sum of all X and Y values must then be 0. This value isn't passed down into the standard
+ * similarity computations as a result.
+ * </p>
+ *
+ * @param n
+ * total number of users or items
+ * @param sumXY
+ * sum of product of user/item preference values, over all items/users preferred by both
+ * users/items
+ * @param sumX2
+ * sum of the square of user/item preference values, over the first item/user
+ * @param sumY2
+ * sum of the square of the user/item preference values, over the second item/user
+ * @param sumXYdiff2
+ * sum of squares of differences in X and Y values
+ * @return similarity value between -1.0 and 1.0, inclusive, or {@link Double#NaN} if no similarity can be
+ * computed (e.g. when no items have been rated by both users
+ */
+ abstract double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2);
+
+ @Override
+ public double userSimilarity(long userID1, long userID2) throws TasteException {
+ DataModel dataModel = getDataModel();
+ PreferenceArray xPrefs = dataModel.getPreferencesFromUser(userID1);
+ PreferenceArray yPrefs = dataModel.getPreferencesFromUser(userID2);
+ int xLength = xPrefs.length();
+ int yLength = yPrefs.length();
+
+ if (xLength == 0 || yLength == 0) {
+ return Double.NaN;
+ }
+
+ long xIndex = xPrefs.getItemID(0);
+ long yIndex = yPrefs.getItemID(0);
+ int xPrefIndex = 0;
+ int yPrefIndex = 0;
+
+ double sumX = 0.0;
+ double sumX2 = 0.0;
+ double sumY = 0.0;
+ double sumY2 = 0.0;
+ double sumXY = 0.0;
+ double sumXYdiff2 = 0.0;
+ int count = 0;
+
+ boolean hasInferrer = inferrer != null;
+
+ while (true) {
+ int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 1 : 0;
+ if (hasInferrer || compare == 0) {
+ double x;
+ double y;
+ if (xIndex == yIndex) {
+ // Both users expressed a preference for the item
+ x = xPrefs.getValue(xPrefIndex);
+ y = yPrefs.getValue(yPrefIndex);
+ } else {
+ // Only one user expressed a preference, but infer the other one's preference and tally
+ // as if the other user expressed that preference
+ if (compare < 0) {
+ // X has a value; infer Y's
+ x = xPrefs.getValue(xPrefIndex);
+ y = inferrer.inferPreference(userID2, xIndex);
+ } else {
+ // compare > 0
+ // Y has a value; infer X's
+ x = inferrer.inferPreference(userID1, yIndex);
+ y = yPrefs.getValue(yPrefIndex);
+ }
+ }
+ sumXY += x * y;
+ sumX += x;
+ sumX2 += x * x;
+ sumY += y;
+ sumY2 += y * y;
+ double diff = x - y;
+ sumXYdiff2 += diff * diff;
+ count++;
+ }
+ if (compare <= 0) {
+ if (++xPrefIndex >= xLength) {
+ if (hasInferrer) {
+ // Must count other Ys; pretend next X is far away
+ if (yIndex == Long.MAX_VALUE) {
+ // ... but stop if both are done!
+ break;
+ }
+ xIndex = Long.MAX_VALUE;
+ } else {
+ break;
+ }
+ } else {
+ xIndex = xPrefs.getItemID(xPrefIndex);
+ }
+ }
+ if (compare >= 0) {
+ if (++yPrefIndex >= yLength) {
+ if (hasInferrer) {
+ // Must count other Xs; pretend next Y is far away
+ if (xIndex == Long.MAX_VALUE) {
+ // ... but stop if both are done!
+ break;
+ }
+ yIndex = Long.MAX_VALUE;
+ } else {
+ break;
+ }
+ } else {
+ yIndex = yPrefs.getItemID(yPrefIndex);
+ }
+ }
+ }
+
+ // "Center" the data. If my math is correct, this'll do it.
+ double result;
+ if (centerData) {
+ double meanX = sumX / count;
+ double meanY = sumY / count;
+ // double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY;
+ double centeredSumXY = sumXY - meanY * sumX;
+ // double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX;
+ double centeredSumX2 = sumX2 - meanX * sumX;
+ // double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY;
+ double centeredSumY2 = sumY2 - meanY * sumY;
+ result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2);
+ } else {
+ result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2);
+ }
+
+ if (!Double.isNaN(result)) {
+ result = normalizeWeightResult(result, count, cachedNumItems);
+ }
+ return result;
+ }
+
+ @Override
+ public final double itemSimilarity(long itemID1, long itemID2) throws TasteException {
+ DataModel dataModel = getDataModel();
+ PreferenceArray xPrefs = dataModel.getPreferencesForItem(itemID1);
+ PreferenceArray yPrefs = dataModel.getPreferencesForItem(itemID2);
+ int xLength = xPrefs.length();
+ int yLength = yPrefs.length();
+
+ if (xLength == 0 || yLength == 0) {
+ return Double.NaN;
+ }
+
+ long xIndex = xPrefs.getUserID(0);
+ long yIndex = yPrefs.getUserID(0);
+ int xPrefIndex = 0;
+ int yPrefIndex = 0;
+
+ double sumX = 0.0;
+ double sumX2 = 0.0;
+ double sumY = 0.0;
+ double sumY2 = 0.0;
+ double sumXY = 0.0;
+ double sumXYdiff2 = 0.0;
+ int count = 0;
+
+ // No, pref inferrers and transforms don't apply here. I think.
+
+ while (true) {
+ int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 1 : 0;
+ if (compare == 0) {
+ // Both users expressed a preference for the item
+ double x = xPrefs.getValue(xPrefIndex);
+ double y = yPrefs.getValue(yPrefIndex);
+ sumXY += x * y;
+ sumX += x;
+ sumX2 += x * x;
+ sumY += y;
+ sumY2 += y * y;
+ double diff = x - y;
+ sumXYdiff2 += diff * diff;
+ count++;
+ }
+ if (compare <= 0) {
+ if (++xPrefIndex == xLength) {
+ break;
+ }
+ xIndex = xPrefs.getUserID(xPrefIndex);
+ }
+ if (compare >= 0) {
+ if (++yPrefIndex == yLength) {
+ break;
+ }
+ yIndex = yPrefs.getUserID(yPrefIndex);
+ }
+ }
+
+ double result;
+ if (centerData) {
+ // See comments above on these computations
+ double n = (double) count;
+ double meanX = sumX / n;
+ double meanY = sumY / n;
+ // double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY;
+ double centeredSumXY = sumXY - meanY * sumX;
+ // double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX;
+ double centeredSumX2 = sumX2 - meanX * sumX;
+ // double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY;
+ double centeredSumY2 = sumY2 - meanY * sumY;
+ result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2);
+ } else {
+ result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2);
+ }
+
+ if (!Double.isNaN(result)) {
+ result = normalizeWeightResult(result, count, cachedNumUsers);
+ }
+ return result;
+ }
+
+ @Override
+ public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
+ int length = itemID2s.length;
+ double[] result = new double[length];
+ for (int i = 0; i < length; i++) {
+ result[i] = itemSimilarity(itemID1, itemID2s[i]);
+ }
+ return result;
+ }
+
+ final double normalizeWeightResult(double result, int count, int num) {
+ double normalizedResult = result;
+ if (weighted) {
+ double scaleFactor = 1.0 - (double) count / (double) (num + 1);
+ if (normalizedResult < 0.0) {
+ normalizedResult = -1.0 + scaleFactor * (1.0 + normalizedResult);
+ } else {
+ normalizedResult = 1.0 - scaleFactor * (1.0 - normalizedResult);
+ }
+ }
+ // Make sure the result is not accidentally a little outside [-1.0, 1.0] due to rounding:
+ if (normalizedResult < -1.0) {
+ normalizedResult = -1.0;
+ } else if (normalizedResult > 1.0) {
+ normalizedResult = 1.0;
+ }
+ return normalizedResult;
+ }
+
+ @Override
+ public final void refresh(Collection<Refreshable> alreadyRefreshed) {
+ super.refresh(alreadyRefreshed);
+ refreshHelper.refresh(alreadyRefreshed);
+ }
+
+ @Override
+ public final String toString() {
+ return this.getClass().getSimpleName() + "[dataModel:" + getDataModel() + ",inferrer:" + inferrer + ']';
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AveragingPreferenceInferrer.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AveragingPreferenceInferrer.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AveragingPreferenceInferrer.java
new file mode 100644
index 0000000..7c655fe
--- /dev/null
+++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AveragingPreferenceInferrer.java
@@ -0,0 +1,85 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.Cache;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.Retriever;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+
+/**
+ * <p>
+ * Implementations of this interface compute an inferred preference for a user and an item that the user has
+ * not expressed any preference for. This might be an average of other preferences scores from that user, for
+ * example. This technique is sometimes called "default voting".
+ * </p>
+ */
+public final class AveragingPreferenceInferrer implements PreferenceInferrer {
+
+ private static final Float ZERO = 0.0f;
+
+ private final DataModel dataModel;
+ private final Cache<Long,Float> averagePreferenceValue;
+
+ public AveragingPreferenceInferrer(DataModel dataModel) throws TasteException {
+ this.dataModel = dataModel;
+ Retriever<Long,Float> retriever = new PrefRetriever();
+ averagePreferenceValue = new Cache<>(retriever, dataModel.getNumUsers());
+ refresh(null);
+ }
+
+ @Override
+ public float inferPreference(long userID, long itemID) throws TasteException {
+ return averagePreferenceValue.get(userID);
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ averagePreferenceValue.clear();
+ }
+
+ private final class PrefRetriever implements Retriever<Long,Float> {
+
+ @Override
+ public Float get(Long key) throws TasteException {
+ PreferenceArray prefs = dataModel.getPreferencesFromUser(key);
+ int size = prefs.length();
+ if (size == 0) {
+ return ZERO;
+ }
+ RunningAverage average = new FullRunningAverage();
+ for (int i = 0; i < size; i++) {
+ average.addDatum(prefs.getValue(i));
+ }
+ return (float) average.getAverage();
+ }
+ }
+
+ @Override
+ public String toString() {
+ return "AveragingPreferenceInferrer";
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingItemSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingItemSimilarity.java
new file mode 100644
index 0000000..87aeae9
--- /dev/null
+++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingItemSimilarity.java
@@ -0,0 +1,111 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+import java.util.concurrent.Callable;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.Cache;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.impl.common.Retriever;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+import org.apache.mahout.common.LongPair;
+import com.google.common.base.Preconditions;
+
+/**
+ * Caches the results from an underlying {@link ItemSimilarity} implementation.
+ */
+public final class CachingItemSimilarity implements ItemSimilarity {
+
+ private final ItemSimilarity similarity;
+ private final Cache<LongPair,Double> similarityCache;
+ private final RefreshHelper refreshHelper;
+
+ /**
+ * Creates this on top of the given {@link ItemSimilarity}.
+ * The cache is sized according to properties of the given {@link DataModel}.
+ */
+ public CachingItemSimilarity(ItemSimilarity similarity, DataModel dataModel) throws TasteException {
+ this(similarity, dataModel.getNumItems());
+ }
+
+ /**
+ * Creates this on top of the given {@link ItemSimilarity}.
+ * The cache size is capped by the given size.
+ */
+ public CachingItemSimilarity(ItemSimilarity similarity, int maxCacheSize) {
+ Preconditions.checkArgument(similarity != null, "similarity is null");
+ this.similarity = similarity;
+ this.similarityCache = new Cache<>(new SimilarityRetriever(similarity), maxCacheSize);
+ this.refreshHelper = new RefreshHelper(new Callable<Void>() {
+ @Override
+ public Void call() {
+ similarityCache.clear();
+ return null;
+ }
+ });
+ refreshHelper.addDependency(similarity);
+ }
+
+ @Override
+ public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
+ LongPair key = itemID1 < itemID2 ? new LongPair(itemID1, itemID2) : new LongPair(itemID2, itemID1);
+ return similarityCache.get(key);
+ }
+
+ @Override
+ public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
+ int length = itemID2s.length;
+ double[] result = new double[length];
+ for (int i = 0; i < length; i++) {
+ result[i] = itemSimilarity(itemID1, itemID2s[i]);
+ }
+ return result;
+ }
+
+ @Override
+ public long[] allSimilarItemIDs(long itemID) throws TasteException {
+ return similarity.allSimilarItemIDs(itemID);
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ refreshHelper.refresh(alreadyRefreshed);
+ }
+
+ public void clearCacheForItem(long itemID) {
+ similarityCache.removeKeysMatching(new LongPairMatchPredicate(itemID));
+ }
+
+ private static final class SimilarityRetriever implements Retriever<LongPair,Double> {
+ private final ItemSimilarity similarity;
+
+ private SimilarityRetriever(ItemSimilarity similarity) {
+ this.similarity = similarity;
+ }
+
+ @Override
+ public Double get(LongPair key) throws TasteException {
+ return similarity.itemSimilarity(key.getFirst(), key.getSecond());
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingUserSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingUserSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingUserSimilarity.java
new file mode 100644
index 0000000..873568a
--- /dev/null
+++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingUserSimilarity.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+import java.util.concurrent.Callable;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.Cache;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.impl.common.Retriever;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+import org.apache.mahout.common.LongPair;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * Caches the results from an underlying {@link UserSimilarity} implementation.
+ */
+public final class CachingUserSimilarity implements UserSimilarity {
+
+ private final UserSimilarity similarity;
+ private final Cache<LongPair,Double> similarityCache;
+ private final RefreshHelper refreshHelper;
+
+ /**
+ * Creates this on top of the given {@link UserSimilarity}.
+ * The cache is sized according to properties of the given {@link DataModel}.
+ */
+ public CachingUserSimilarity(UserSimilarity similarity, DataModel dataModel) throws TasteException {
+ this(similarity, dataModel.getNumUsers());
+ }
+
+ /**
+ * Creates this on top of the given {@link UserSimilarity}.
+ * The cache size is capped by the given size.
+ */
+ public CachingUserSimilarity(UserSimilarity similarity, int maxCacheSize) {
+ Preconditions.checkArgument(similarity != null, "similarity is null");
+ this.similarity = similarity;
+ this.similarityCache = new Cache<>(new SimilarityRetriever(similarity), maxCacheSize);
+ this.refreshHelper = new RefreshHelper(new Callable<Void>() {
+ @Override
+ public Void call() {
+ similarityCache.clear();
+ return null;
+ }
+ });
+ refreshHelper.addDependency(similarity);
+ }
+
+ @Override
+ public double userSimilarity(long userID1, long userID2) throws TasteException {
+ LongPair key = userID1 < userID2 ? new LongPair(userID1, userID2) : new LongPair(userID2, userID1);
+ return similarityCache.get(key);
+ }
+
+ @Override
+ public void setPreferenceInferrer(PreferenceInferrer inferrer) {
+ similarityCache.clear();
+ similarity.setPreferenceInferrer(inferrer);
+ }
+
+ public void clearCacheForUser(long userID) {
+ similarityCache.removeKeysMatching(new LongPairMatchPredicate(userID));
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ refreshHelper.refresh(alreadyRefreshed);
+ }
+
+ private static final class SimilarityRetriever implements Retriever<LongPair,Double> {
+ private final UserSimilarity similarity;
+
+ private SimilarityRetriever(UserSimilarity similarity) {
+ this.similarity = similarity;
+ }
+
+ @Override
+ public Double get(LongPair key) throws TasteException {
+ return similarity.userSimilarity(key.getFirst(), key.getSecond());
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CityBlockSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CityBlockSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CityBlockSimilarity.java
new file mode 100644
index 0000000..88fbe58
--- /dev/null
+++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CityBlockSimilarity.java
@@ -0,0 +1,98 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+/**
+ * Implementation of City Block distance (also known as Manhattan distance) - the absolute value of the difference of
+ * each direction is summed. The resulting unbounded distance is then mapped between 0 and 1.
+ */
+public final class CityBlockSimilarity extends AbstractItemSimilarity implements UserSimilarity {
+
+ public CityBlockSimilarity(DataModel dataModel) {
+ super(dataModel);
+ }
+
+ /**
+ * @throws UnsupportedOperationException
+ */
+ @Override
+ public void setPreferenceInferrer(PreferenceInferrer inferrer) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ Collection<Refreshable> refreshed = RefreshHelper.buildRefreshed(alreadyRefreshed);
+ RefreshHelper.maybeRefresh(refreshed, getDataModel());
+ }
+
+ @Override
+ public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
+ DataModel dataModel = getDataModel();
+ int preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1);
+ int preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2);
+ int intersection = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2);
+ return doSimilarity(preferring1, preferring2, intersection);
+ }
+
+ @Override
+ public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
+ DataModel dataModel = getDataModel();
+ int preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1);
+ double[] distance = new double[itemID2s.length];
+ for (int i = 0; i < itemID2s.length; ++i) {
+ int preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2s[i]);
+ int intersection = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2s[i]);
+ distance[i] = doSimilarity(preferring1, preferring2, intersection);
+ }
+ return distance;
+ }
+
+ @Override
+ public double userSimilarity(long userID1, long userID2) throws TasteException {
+ DataModel dataModel = getDataModel();
+ FastIDSet prefs1 = dataModel.getItemIDsFromUser(userID1);
+ FastIDSet prefs2 = dataModel.getItemIDsFromUser(userID2);
+ int prefs1Size = prefs1.size();
+ int prefs2Size = prefs2.size();
+ int intersectionSize = prefs1Size < prefs2Size ? prefs2.intersectionSize(prefs1) : prefs1.intersectionSize(prefs2);
+ return doSimilarity(prefs1Size, prefs2Size, intersectionSize);
+ }
+
+ /**
+ * Calculate City Block Distance from total non-zero values and intersections and map to a similarity value.
+ *
+ * @param pref1 number of non-zero values in left vector
+ * @param pref2 number of non-zero values in right vector
+ * @param intersection number of overlapping non-zero values
+ */
+ private static double doSimilarity(int pref1, int pref2, int intersection) {
+ int distance = pref1 + pref2 - 2 * intersection;
+ return 1.0 / (1.0 + distance);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java
new file mode 100644
index 0000000..990e9ea
--- /dev/null
+++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java
@@ -0,0 +1,67 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.common.Weighting;
+import org.apache.mahout.cf.taste.model.DataModel;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * An implementation of a "similarity" based on the Euclidean "distance" between two users X and Y. Thinking
+ * of items as dimensions and preferences as points along those dimensions, a distance is computed using all
+ * items (dimensions) where both users have expressed a preference for that item. This is simply the square
+ * root of the sum of the squares of differences in position (preference) along each dimension.</p>
+ *
+ * <p>The similarity could be computed as 1 / (1 + distance / sqrt(n)), so the resulting values are in the range (0,1].
+ * This would weight against pairs that overlap in more dimensions, which should indicate more similarity,
+ * since more dimensions offer more opportunities to be farther apart. Actually, it is computed as
+ * sqrt(n) / (1 + distance), where n is the number of dimensions, in order to help correct for this.
+ * sqrt(n) is chosen since randomly-chosen points have a distance that grows as sqrt(n).</p>
+ *
+ * <p>Note that this could cause a similarity to exceed 1; such values are capped at 1.</p>
+ *
+ * <p>Note that the distance isn't normalized in any way; it's not valid to compare similarities computed from
+ * different domains (different rating scales, for example). Within one domain, normalizing doesn't matter much as
+ * it doesn't change ordering.</p>
+ */
+public final class EuclideanDistanceSimilarity extends AbstractSimilarity {
+
+ /**
+ * @throws IllegalArgumentException if {@link DataModel} does not have preference values
+ */
+ public EuclideanDistanceSimilarity(DataModel dataModel) throws TasteException {
+ this(dataModel, Weighting.UNWEIGHTED);
+ }
+
+ /**
+ * @throws IllegalArgumentException if {@link DataModel} does not have preference values
+ */
+ public EuclideanDistanceSimilarity(DataModel dataModel, Weighting weighting) throws TasteException {
+ super(dataModel, weighting, false);
+ Preconditions.checkArgument(dataModel.hasPreferenceValues(), "DataModel doesn't have preference values");
+ }
+
+ @Override
+ double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) {
+ return 1.0 / (1.0 + Math.sqrt(sumXYdiff2) / Math.sqrt(n));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericItemSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericItemSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericItemSimilarity.java
new file mode 100644
index 0000000..d0c9b8c
--- /dev/null
+++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericItemSimilarity.java
@@ -0,0 +1,358 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+import com.google.common.collect.AbstractIterator;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.recommender.TopItems;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.ItemSimilarity;
+import org.apache.mahout.common.RandomUtils;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * A "generic" {@link ItemSimilarity} which takes a static list of precomputed item similarities and bases its
+ * responses on that alone. The values may have been precomputed offline by another process, stored in a file,
+ * and then read and fed into an instance of this class.
+ * </p>
+ *
+ * <p>
+ * This is perhaps the best {@link ItemSimilarity} to use with
+ * {@link org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender}, for now, since the point
+ * of item-based recommenders is that they can take advantage of the fact that item similarity is relatively
+ * static, can be precomputed, and then used in computation to gain a significant performance advantage.
+ * </p>
+ */
+public final class GenericItemSimilarity implements ItemSimilarity {
+
+ private static final long[] NO_IDS = new long[0];
+
+ private final FastByIDMap<FastByIDMap<Double>> similarityMaps = new FastByIDMap<>();
+ private final FastByIDMap<FastIDSet> similarItemIDsIndex = new FastByIDMap<>();
+
+ /**
+ * <p>
+ * Creates a {@link GenericItemSimilarity} from a precomputed list of {@link ItemItemSimilarity}s. Each
+ * represents the similarity between two distinct items. Since similarity is assumed to be symmetric, it is
+ * not necessary to specify similarity between item1 and item2, and item2 and item1. Both are the same. It
+ * is also not necessary to specify a similarity between any item and itself; these are assumed to be 1.0.
+ * </p>
+ *
+ * <p>
+ * Note that specifying a similarity between two items twice is not an error, but, the later value will win.
+ * </p>
+ *
+ * @param similarities
+ * set of {@link ItemItemSimilarity}s on which to base this instance
+ */
+ public GenericItemSimilarity(Iterable<ItemItemSimilarity> similarities) {
+ initSimilarityMaps(similarities.iterator());
+ }
+
+ /**
+ * <p>
+ * Like {@link #GenericItemSimilarity(Iterable)}, but will only keep the specified number of similarities
+ * from the given {@link Iterable} of similarities. It will keep those with the highest similarity -- those
+ * that are therefore most important.
+ * </p>
+ *
+ * <p>
+ * Thanks to tsmorton for suggesting this and providing part of the implementation.
+ * </p>
+ *
+ * @param similarities
+ * set of {@link ItemItemSimilarity}s on which to base this instance
+ * @param maxToKeep
+ * maximum number of similarities to keep
+ */
+ public GenericItemSimilarity(Iterable<ItemItemSimilarity> similarities, int maxToKeep) {
+ Iterable<ItemItemSimilarity> keptSimilarities =
+ TopItems.getTopItemItemSimilarities(maxToKeep, similarities.iterator());
+ initSimilarityMaps(keptSimilarities.iterator());
+ }
+
+ /**
+ * <p>
+ * Builds a list of item-item similarities given an {@link ItemSimilarity} implementation and a
+ * {@link DataModel}, rather than a list of {@link ItemItemSimilarity}s.
+ * </p>
+ *
+ * <p>
+ * It's valid to build a {@link GenericItemSimilarity} this way, but perhaps missing some of the point of an
+ * item-based recommender. Item-based recommenders use the assumption that item-item similarities are
+ * relatively fixed, and might be known already independent of user preferences. Hence it is useful to
+ * inject that information, using {@link #GenericItemSimilarity(Iterable)}.
+ * </p>
+ *
+ * @param otherSimilarity
+ * other {@link ItemSimilarity} to get similarities from
+ * @param dataModel
+ * data model to get items from
+ * @throws TasteException
+ * if an error occurs while accessing the {@link DataModel} items
+ */
+ public GenericItemSimilarity(ItemSimilarity otherSimilarity, DataModel dataModel) throws TasteException {
+ long[] itemIDs = GenericUserSimilarity.longIteratorToList(dataModel.getItemIDs());
+ initSimilarityMaps(new DataModelSimilaritiesIterator(otherSimilarity, itemIDs));
+ }
+
+ /**
+ * <p>
+ * Like {@link #GenericItemSimilarity(ItemSimilarity, DataModel)} )}, but will only keep the specified
+ * number of similarities from the given {@link DataModel}. It will keep those with the highest similarity
+ * -- those that are therefore most important.
+ * </p>
+ *
+ * <p>
+ * Thanks to tsmorton for suggesting this and providing part of the implementation.
+ * </p>
+ *
+ * @param otherSimilarity
+ * other {@link ItemSimilarity} to get similarities from
+ * @param dataModel
+ * data model to get items from
+ * @param maxToKeep
+ * maximum number of similarities to keep
+ * @throws TasteException
+ * if an error occurs while accessing the {@link DataModel} items
+ */
+ public GenericItemSimilarity(ItemSimilarity otherSimilarity,
+ DataModel dataModel,
+ int maxToKeep) throws TasteException {
+ long[] itemIDs = GenericUserSimilarity.longIteratorToList(dataModel.getItemIDs());
+ Iterator<ItemItemSimilarity> it = new DataModelSimilaritiesIterator(otherSimilarity, itemIDs);
+ Iterable<ItemItemSimilarity> keptSimilarities = TopItems.getTopItemItemSimilarities(maxToKeep, it);
+ initSimilarityMaps(keptSimilarities.iterator());
+ }
+
+ private void initSimilarityMaps(Iterator<ItemItemSimilarity> similarities) {
+ while (similarities.hasNext()) {
+ ItemItemSimilarity iic = similarities.next();
+ long similarityItemID1 = iic.getItemID1();
+ long similarityItemID2 = iic.getItemID2();
+ if (similarityItemID1 != similarityItemID2) {
+ // Order them -- first key should be the "smaller" one
+ long itemID1;
+ long itemID2;
+ if (similarityItemID1 < similarityItemID2) {
+ itemID1 = similarityItemID1;
+ itemID2 = similarityItemID2;
+ } else {
+ itemID1 = similarityItemID2;
+ itemID2 = similarityItemID1;
+ }
+ FastByIDMap<Double> map = similarityMaps.get(itemID1);
+ if (map == null) {
+ map = new FastByIDMap<>();
+ similarityMaps.put(itemID1, map);
+ }
+ map.put(itemID2, iic.getValue());
+
+ doIndex(itemID1, itemID2);
+ doIndex(itemID2, itemID1);
+ }
+ // else similarity between item and itself already assumed to be 1.0
+ }
+ }
+
+ private void doIndex(long fromItemID, long toItemID) {
+ FastIDSet similarItemIDs = similarItemIDsIndex.get(fromItemID);
+ if (similarItemIDs == null) {
+ similarItemIDs = new FastIDSet();
+ similarItemIDsIndex.put(fromItemID, similarItemIDs);
+ }
+ similarItemIDs.add(toItemID);
+ }
+
+ /**
+ * <p>
+ * Returns the similarity between two items. Note that similarity is assumed to be symmetric, that
+ * {@code itemSimilarity(item1, item2) == itemSimilarity(item2, item1)}, and that
+ * {@code itemSimilarity(item1,item1) == 1.0} for all items.
+ * </p>
+ *
+ * @param itemID1
+ * first item
+ * @param itemID2
+ * second item
+ * @return similarity between the two
+ */
+ @Override
+ public double itemSimilarity(long itemID1, long itemID2) {
+ if (itemID1 == itemID2) {
+ return 1.0;
+ }
+ long firstID;
+ long secondID;
+ if (itemID1 < itemID2) {
+ firstID = itemID1;
+ secondID = itemID2;
+ } else {
+ firstID = itemID2;
+ secondID = itemID1;
+ }
+ FastByIDMap<Double> nextMap = similarityMaps.get(firstID);
+ if (nextMap == null) {
+ return Double.NaN;
+ }
+ Double similarity = nextMap.get(secondID);
+ return similarity == null ? Double.NaN : similarity;
+ }
+
+ @Override
+ public double[] itemSimilarities(long itemID1, long[] itemID2s) {
+ int length = itemID2s.length;
+ double[] result = new double[length];
+ for (int i = 0; i < length; i++) {
+ result[i] = itemSimilarity(itemID1, itemID2s[i]);
+ }
+ return result;
+ }
+
+ @Override
+ public long[] allSimilarItemIDs(long itemID) {
+ FastIDSet similarItemIDs = similarItemIDsIndex.get(itemID);
+ return similarItemIDs != null ? similarItemIDs.toArray() : NO_IDS;
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ // Do nothing
+ }
+
+ /** Encapsulates a similarity between two items. Similarity must be in the range [-1.0,1.0]. */
+ public static final class ItemItemSimilarity implements Comparable<ItemItemSimilarity> {
+
+ private final long itemID1;
+ private final long itemID2;
+ private final double value;
+
+ /**
+ * @param itemID1
+ * first item
+ * @param itemID2
+ * second item
+ * @param value
+ * similarity between the two
+ * @throws IllegalArgumentException
+ * if value is NaN, less than -1.0 or greater than 1.0
+ */
+ public ItemItemSimilarity(long itemID1, long itemID2, double value) {
+ Preconditions.checkArgument(value >= -1.0 && value <= 1.0, "Illegal value: " + value + ". Must be: -1.0 <= value <= 1.0");
+ this.itemID1 = itemID1;
+ this.itemID2 = itemID2;
+ this.value = value;
+ }
+
+ public long getItemID1() {
+ return itemID1;
+ }
+
+ public long getItemID2() {
+ return itemID2;
+ }
+
+ public double getValue() {
+ return value;
+ }
+
+ @Override
+ public String toString() {
+ return "ItemItemSimilarity[" + itemID1 + ',' + itemID2 + ':' + value + ']';
+ }
+
+ /** Defines an ordering from highest similarity to lowest. */
+ @Override
+ public int compareTo(ItemItemSimilarity other) {
+ double otherValue = other.getValue();
+ return value > otherValue ? -1 : value < otherValue ? 1 : 0;
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (!(other instanceof ItemItemSimilarity)) {
+ return false;
+ }
+ ItemItemSimilarity otherSimilarity = (ItemItemSimilarity) other;
+ return otherSimilarity.getItemID1() == itemID1
+ && otherSimilarity.getItemID2() == itemID2
+ && otherSimilarity.getValue() == value;
+ }
+
+ @Override
+ public int hashCode() {
+ return (int) itemID1 ^ (int) itemID2 ^ RandomUtils.hashDouble(value);
+ }
+
+ }
+
+ private static final class DataModelSimilaritiesIterator extends AbstractIterator<ItemItemSimilarity> {
+
+ private final ItemSimilarity otherSimilarity;
+ private final long[] itemIDs;
+ private int i;
+ private long itemID1;
+ private int j;
+
+ private DataModelSimilaritiesIterator(ItemSimilarity otherSimilarity, long[] itemIDs) {
+ this.otherSimilarity = otherSimilarity;
+ this.itemIDs = itemIDs;
+ i = 0;
+ itemID1 = itemIDs[0];
+ j = 1;
+ }
+
+ @Override
+ protected ItemItemSimilarity computeNext() {
+ int size = itemIDs.length;
+ ItemItemSimilarity result = null;
+ while (result == null && i < size - 1) {
+ long itemID2 = itemIDs[j];
+ double similarity;
+ try {
+ similarity = otherSimilarity.itemSimilarity(itemID1, itemID2);
+ } catch (TasteException te) {
+ // ugly:
+ throw new IllegalStateException(te);
+ }
+ if (!Double.isNaN(similarity)) {
+ result = new ItemItemSimilarity(itemID1, itemID2, similarity);
+ }
+ if (++j == size) {
+ itemID1 = itemIDs[++i];
+ j = i + 1;
+ }
+ }
+ if (result == null) {
+ return endOfData();
+ } else {
+ return result;
+ }
+ }
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericUserSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericUserSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericUserSimilarity.java
new file mode 100644
index 0000000..1c221c2
--- /dev/null
+++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericUserSimilarity.java
@@ -0,0 +1,238 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+import com.google.common.collect.AbstractIterator;
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.recommender.TopItems;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+import org.apache.mahout.common.RandomUtils;
+
+import com.google.common.base.Preconditions;
+
+public final class GenericUserSimilarity implements UserSimilarity {
+
+ private final FastByIDMap<FastByIDMap<Double>> similarityMaps = new FastByIDMap<>();
+
+ public GenericUserSimilarity(Iterable<UserUserSimilarity> similarities) {
+ initSimilarityMaps(similarities.iterator());
+ }
+
+ public GenericUserSimilarity(Iterable<UserUserSimilarity> similarities, int maxToKeep) {
+ Iterable<UserUserSimilarity> keptSimilarities =
+ TopItems.getTopUserUserSimilarities(maxToKeep, similarities.iterator());
+ initSimilarityMaps(keptSimilarities.iterator());
+ }
+
+ public GenericUserSimilarity(UserSimilarity otherSimilarity, DataModel dataModel) throws TasteException {
+ long[] userIDs = longIteratorToList(dataModel.getUserIDs());
+ initSimilarityMaps(new DataModelSimilaritiesIterator(otherSimilarity, userIDs));
+ }
+
+ public GenericUserSimilarity(UserSimilarity otherSimilarity,
+ DataModel dataModel,
+ int maxToKeep) throws TasteException {
+ long[] userIDs = longIteratorToList(dataModel.getUserIDs());
+ Iterator<UserUserSimilarity> it = new DataModelSimilaritiesIterator(otherSimilarity, userIDs);
+ Iterable<UserUserSimilarity> keptSimilarities = TopItems.getTopUserUserSimilarities(maxToKeep, it);
+ initSimilarityMaps(keptSimilarities.iterator());
+ }
+
+ static long[] longIteratorToList(LongPrimitiveIterator iterator) {
+ long[] result = new long[5];
+ int size = 0;
+ while (iterator.hasNext()) {
+ if (size == result.length) {
+ long[] newResult = new long[result.length << 1];
+ System.arraycopy(result, 0, newResult, 0, result.length);
+ result = newResult;
+ }
+ result[size++] = iterator.next();
+ }
+ if (size != result.length) {
+ long[] newResult = new long[size];
+ System.arraycopy(result, 0, newResult, 0, size);
+ result = newResult;
+ }
+ return result;
+ }
+
+ private void initSimilarityMaps(Iterator<UserUserSimilarity> similarities) {
+ while (similarities.hasNext()) {
+ UserUserSimilarity uuc = similarities.next();
+ long similarityUser1 = uuc.getUserID1();
+ long similarityUser2 = uuc.getUserID2();
+ if (similarityUser1 != similarityUser2) {
+ // Order them -- first key should be the "smaller" one
+ long user1;
+ long user2;
+ if (similarityUser1 < similarityUser2) {
+ user1 = similarityUser1;
+ user2 = similarityUser2;
+ } else {
+ user1 = similarityUser2;
+ user2 = similarityUser1;
+ }
+ FastByIDMap<Double> map = similarityMaps.get(user1);
+ if (map == null) {
+ map = new FastByIDMap<>();
+ similarityMaps.put(user1, map);
+ }
+ map.put(user2, uuc.getValue());
+ }
+ // else similarity between user and itself already assumed to be 1.0
+ }
+ }
+
+ @Override
+ public double userSimilarity(long userID1, long userID2) {
+ if (userID1 == userID2) {
+ return 1.0;
+ }
+ long first;
+ long second;
+ if (userID1 < userID2) {
+ first = userID1;
+ second = userID2;
+ } else {
+ first = userID2;
+ second = userID1;
+ }
+ FastByIDMap<Double> nextMap = similarityMaps.get(first);
+ if (nextMap == null) {
+ return Double.NaN;
+ }
+ Double similarity = nextMap.get(second);
+ return similarity == null ? Double.NaN : similarity;
+ }
+
+ @Override
+ public void setPreferenceInferrer(PreferenceInferrer inferrer) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ // Do nothing
+ }
+
+ public static final class UserUserSimilarity implements Comparable<UserUserSimilarity> {
+
+ private final long userID1;
+ private final long userID2;
+ private final double value;
+
+ public UserUserSimilarity(long userID1, long userID2, double value) {
+ Preconditions.checkArgument(value >= -1.0 && value <= 1.0, "Illegal value: " + value + ". Must be: -1.0 <= value <= 1.0");
+ this.userID1 = userID1;
+ this.userID2 = userID2;
+ this.value = value;
+ }
+
+ public long getUserID1() {
+ return userID1;
+ }
+
+ public long getUserID2() {
+ return userID2;
+ }
+
+ public double getValue() {
+ return value;
+ }
+
+ @Override
+ public String toString() {
+ return "UserUserSimilarity[" + userID1 + ',' + userID2 + ':' + value + ']';
+ }
+
+ /** Defines an ordering from highest similarity to lowest. */
+ @Override
+ public int compareTo(UserUserSimilarity other) {
+ double otherValue = other.getValue();
+ return value > otherValue ? -1 : value < otherValue ? 1 : 0;
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (!(other instanceof UserUserSimilarity)) {
+ return false;
+ }
+ UserUserSimilarity otherSimilarity = (UserUserSimilarity) other;
+ return otherSimilarity.getUserID1() == userID1
+ && otherSimilarity.getUserID2() == userID2
+ && otherSimilarity.getValue() == value;
+ }
+
+ @Override
+ public int hashCode() {
+ return (int) userID1 ^ (int) userID2 ^ RandomUtils.hashDouble(value);
+ }
+
+ }
+
+ private static final class DataModelSimilaritiesIterator extends AbstractIterator<UserUserSimilarity> {
+
+ private final UserSimilarity otherSimilarity;
+ private final long[] itemIDs;
+ private int i;
+ private long itemID1;
+ private int j;
+
+ private DataModelSimilaritiesIterator(UserSimilarity otherSimilarity, long[] itemIDs) {
+ this.otherSimilarity = otherSimilarity;
+ this.itemIDs = itemIDs;
+ i = 0;
+ itemID1 = itemIDs[0];
+ j = 1;
+ }
+
+ @Override
+ protected UserUserSimilarity computeNext() {
+ int size = itemIDs.length;
+ while (i < size - 1) {
+ long itemID2 = itemIDs[j];
+ double similarity;
+ try {
+ similarity = otherSimilarity.userSimilarity(itemID1, itemID2);
+ } catch (TasteException te) {
+ // ugly:
+ throw new IllegalStateException(te);
+ }
+ if (!Double.isNaN(similarity)) {
+ return new UserUserSimilarity(itemID1, itemID2, similarity);
+ }
+ if (++j == size) {
+ itemID1 = itemIDs[++i];
+ j = i + 1;
+ }
+ }
+ return endOfData();
+ }
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java
new file mode 100644
index 0000000..3084c8f
--- /dev/null
+++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java
@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+import org.apache.mahout.math.stats.LogLikelihood;
+
+/**
+ * See <a href="http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.14.5962">
+ * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.14.5962</a> and
+ * <a href="http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html">
+ * http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html</a>.
+ */
+public final class LogLikelihoodSimilarity extends AbstractItemSimilarity implements UserSimilarity {
+
+ public LogLikelihoodSimilarity(DataModel dataModel) {
+ super(dataModel);
+ }
+
+ /**
+ * @throws UnsupportedOperationException
+ */
+ @Override
+ public void setPreferenceInferrer(PreferenceInferrer inferrer) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public double userSimilarity(long userID1, long userID2) throws TasteException {
+
+ DataModel dataModel = getDataModel();
+ FastIDSet prefs1 = dataModel.getItemIDsFromUser(userID1);
+ FastIDSet prefs2 = dataModel.getItemIDsFromUser(userID2);
+
+ long prefs1Size = prefs1.size();
+ long prefs2Size = prefs2.size();
+ long intersectionSize =
+ prefs1Size < prefs2Size ? prefs2.intersectionSize(prefs1) : prefs1.intersectionSize(prefs2);
+ if (intersectionSize == 0) {
+ return Double.NaN;
+ }
+ long numItems = dataModel.getNumItems();
+ double logLikelihood =
+ LogLikelihood.logLikelihoodRatio(intersectionSize,
+ prefs2Size - intersectionSize,
+ prefs1Size - intersectionSize,
+ numItems - prefs1Size - prefs2Size + intersectionSize);
+ return 1.0 - 1.0 / (1.0 + logLikelihood);
+ }
+
+ @Override
+ public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
+ DataModel dataModel = getDataModel();
+ long preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1);
+ long numUsers = dataModel.getNumUsers();
+ return doItemSimilarity(itemID1, itemID2, preferring1, numUsers);
+ }
+
+ @Override
+ public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
+ DataModel dataModel = getDataModel();
+ long preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1);
+ long numUsers = dataModel.getNumUsers();
+ int length = itemID2s.length;
+ double[] result = new double[length];
+ for (int i = 0; i < length; i++) {
+ result[i] = doItemSimilarity(itemID1, itemID2s[i], preferring1, numUsers);
+ }
+ return result;
+ }
+
+ private double doItemSimilarity(long itemID1, long itemID2, long preferring1, long numUsers) throws TasteException {
+ DataModel dataModel = getDataModel();
+ long preferring1and2 = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2);
+ if (preferring1and2 == 0) {
+ return Double.NaN;
+ }
+ long preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2);
+ double logLikelihood =
+ LogLikelihood.logLikelihoodRatio(preferring1and2,
+ preferring2 - preferring1and2,
+ preferring1 - preferring1and2,
+ numUsers - preferring1 - preferring2 + preferring1and2);
+ return 1.0 - 1.0 / (1.0 + logLikelihood);
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ alreadyRefreshed = RefreshHelper.buildRefreshed(alreadyRefreshed);
+ RefreshHelper.maybeRefresh(alreadyRefreshed, getDataModel());
+ }
+
+ @Override
+ public String toString() {
+ return "LogLikelihoodSimilarity[dataModel:" + getDataModel() + ']';
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LongPairMatchPredicate.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LongPairMatchPredicate.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LongPairMatchPredicate.java
new file mode 100644
index 0000000..48dc4e0
--- /dev/null
+++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LongPairMatchPredicate.java
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import org.apache.mahout.cf.taste.impl.common.Cache;
+import org.apache.mahout.common.LongPair;
+
+/**
+ * A {@link Cache.MatchPredicate} which will match an ID against either element of a
+ * {@link LongPair}.
+ */
+final class LongPairMatchPredicate implements Cache.MatchPredicate<LongPair> {
+
+ private final long id;
+
+ LongPairMatchPredicate(long id) {
+ this.id = id;
+ }
+
+ @Override
+ public boolean matches(LongPair pair) {
+ return pair.getFirst() == id || pair.getSecond() == id;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java
new file mode 100644
index 0000000..8ea1660
--- /dev/null
+++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.common.Weighting;
+import org.apache.mahout.cf.taste.model.DataModel;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * An implementation of the Pearson correlation. For users X and Y, the following values are calculated:
+ * </p>
+ *
+ * <ul>
+ * <li>sumX2: sum of the square of all X's preference values</li>
+ * <li>sumY2: sum of the square of all Y's preference values</li>
+ * <li>sumXY: sum of the product of X and Y's preference value for all items for which both X and Y express a
+ * preference</li>
+ * </ul>
+ *
+ * <p>
+ * The correlation is then:
+ *
+ * <p>
+ * {@code sumXY / sqrt(sumX2 * sumY2)}
+ * </p>
+ *
+ * <p>
+ * Note that this correlation "centers" its data, shifts the user's preference values so that each of their
+ * means is 0. This is necessary to achieve expected behavior on all data sets.
+ * </p>
+ *
+ * <p>
+ * This correlation implementation is equivalent to the cosine similarity since the data it receives
+ * is assumed to be centered -- mean is 0. The correlation may be interpreted as the cosine of the angle
+ * between the two vectors defined by the users' preference values.
+ * </p>
+ *
+ * <p>
+ * For cosine similarity on uncentered data, see {@link UncenteredCosineSimilarity}.
+ * </p>
+ */
+public final class PearsonCorrelationSimilarity extends AbstractSimilarity {
+
+ /**
+ * @throws IllegalArgumentException if {@link DataModel} does not have preference values
+ */
+ public PearsonCorrelationSimilarity(DataModel dataModel) throws TasteException {
+ this(dataModel, Weighting.UNWEIGHTED);
+ }
+
+ /**
+ * @throws IllegalArgumentException if {@link DataModel} does not have preference values
+ */
+ public PearsonCorrelationSimilarity(DataModel dataModel, Weighting weighting) throws TasteException {
+ super(dataModel, weighting, true);
+ Preconditions.checkArgument(dataModel.hasPreferenceValues(), "DataModel doesn't have preference values");
+ }
+
+ @Override
+ double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) {
+ if (n == 0) {
+ return Double.NaN;
+ }
+ // Note that sum of X and sum of Y don't appear here since they are assumed to be 0;
+ // the data is assumed to be centered.
+ double denominator = Math.sqrt(sumX2) * Math.sqrt(sumY2);
+ if (denominator == 0.0) {
+ // One or both parties has -all- the same ratings;
+ // can't really say much similarity under this measure
+ return Double.NaN;
+ }
+ return sumXY / denominator;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/SpearmanCorrelationSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/SpearmanCorrelationSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/SpearmanCorrelationSimilarity.java
new file mode 100644
index 0000000..1116368
--- /dev/null
+++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/SpearmanCorrelationSimilarity.java
@@ -0,0 +1,135 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * Like {@link PearsonCorrelationSimilarity}, but compares relative ranking of preference values instead of
+ * preference values themselves. That is, each user's preferences are sorted and then assign a rank as their
+ * preference value, with 1 being assigned to the least preferred item.
+ * </p>
+ */
+public final class SpearmanCorrelationSimilarity implements UserSimilarity {
+
+ private final DataModel dataModel;
+
+ public SpearmanCorrelationSimilarity(DataModel dataModel) {
+ this.dataModel = Preconditions.checkNotNull(dataModel);
+ }
+
+ @Override
+ public double userSimilarity(long userID1, long userID2) throws TasteException {
+ PreferenceArray xPrefs = dataModel.getPreferencesFromUser(userID1);
+ PreferenceArray yPrefs = dataModel.getPreferencesFromUser(userID2);
+ int xLength = xPrefs.length();
+ int yLength = yPrefs.length();
+
+ if (xLength <= 1 || yLength <= 1) {
+ return Double.NaN;
+ }
+
+ // Copy prefs since we need to modify pref values to ranks
+ xPrefs = xPrefs.clone();
+ yPrefs = yPrefs.clone();
+
+ // First sort by values from low to high
+ xPrefs.sortByValue();
+ yPrefs.sortByValue();
+
+ // Assign ranks from low to high
+ float nextRank = 1.0f;
+ for (int i = 0; i < xLength; i++) {
+ // ... but only for items that are common to both pref arrays
+ if (yPrefs.hasPrefWithItemID(xPrefs.getItemID(i))) {
+ xPrefs.setValue(i, nextRank);
+ nextRank += 1.0f;
+ }
+ // Other values are bogus but don't matter
+ }
+ nextRank = 1.0f;
+ for (int i = 0; i < yLength; i++) {
+ if (xPrefs.hasPrefWithItemID(yPrefs.getItemID(i))) {
+ yPrefs.setValue(i, nextRank);
+ nextRank += 1.0f;
+ }
+ }
+
+ xPrefs.sortByItem();
+ yPrefs.sortByItem();
+
+ long xIndex = xPrefs.getItemID(0);
+ long yIndex = yPrefs.getItemID(0);
+ int xPrefIndex = 0;
+ int yPrefIndex = 0;
+
+ double sumXYRankDiff2 = 0.0;
+ int count = 0;
+
+ while (true) {
+ int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 1 : 0;
+ if (compare == 0) {
+ double diff = xPrefs.getValue(xPrefIndex) - yPrefs.getValue(yPrefIndex);
+ sumXYRankDiff2 += diff * diff;
+ count++;
+ }
+ if (compare <= 0) {
+ if (++xPrefIndex >= xLength) {
+ break;
+ }
+ xIndex = xPrefs.getItemID(xPrefIndex);
+ }
+ if (compare >= 0) {
+ if (++yPrefIndex >= yLength) {
+ break;
+ }
+ yIndex = yPrefs.getItemID(yPrefIndex);
+ }
+ }
+
+ if (count <= 1) {
+ return Double.NaN;
+ }
+
+ // When ranks are unique, this formula actually gives the Pearson correlation
+ return 1.0 - 6.0 * sumXYRankDiff2 / (count * (count * count - 1));
+ }
+
+ @Override
+ public void setPreferenceInferrer(PreferenceInferrer inferrer) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ alreadyRefreshed = RefreshHelper.buildRefreshed(alreadyRefreshed);
+ RefreshHelper.maybeRefresh(alreadyRefreshed, dataModel);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/TanimotoCoefficientSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/TanimotoCoefficientSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/TanimotoCoefficientSimilarity.java
new file mode 100644
index 0000000..0c3a0a4
--- /dev/null
+++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/TanimotoCoefficientSimilarity.java
@@ -0,0 +1,126 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import java.util.Collection;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
+import org.apache.mahout.cf.taste.model.DataModel;
+import org.apache.mahout.cf.taste.similarity.PreferenceInferrer;
+import org.apache.mahout.cf.taste.similarity.UserSimilarity;
+
+/**
+ * <p>
+ * An implementation of a "similarity" based on the <a
+ * href="http://en.wikipedia.org/wiki/Jaccard_index#Tanimoto_coefficient_.28extended_Jaccard_coefficient.29">
+ * Tanimoto coefficient</a>, or extended <a href="http://en.wikipedia.org/wiki/Jaccard_index">Jaccard
+ * coefficient</a>.
+ * </p>
+ *
+ * <p>
+ * This is intended for "binary" data sets where a user either expresses a generic "yes" preference for an
+ * item or has no preference. The actual preference values do not matter here, only their presence or absence.
+ * </p>
+ *
+ * <p>
+ * The value returned is in [0,1].
+ * </p>
+ */
+public final class TanimotoCoefficientSimilarity extends AbstractItemSimilarity implements UserSimilarity {
+
+ public TanimotoCoefficientSimilarity(DataModel dataModel) {
+ super(dataModel);
+ }
+
+ /**
+ * @throws UnsupportedOperationException
+ */
+ @Override
+ public void setPreferenceInferrer(PreferenceInferrer inferrer) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public double userSimilarity(long userID1, long userID2) throws TasteException {
+
+ DataModel dataModel = getDataModel();
+ FastIDSet xPrefs = dataModel.getItemIDsFromUser(userID1);
+ FastIDSet yPrefs = dataModel.getItemIDsFromUser(userID2);
+
+ int xPrefsSize = xPrefs.size();
+ int yPrefsSize = yPrefs.size();
+ if (xPrefsSize == 0 && yPrefsSize == 0) {
+ return Double.NaN;
+ }
+ if (xPrefsSize == 0 || yPrefsSize == 0) {
+ return 0.0;
+ }
+
+ int intersectionSize =
+ xPrefsSize < yPrefsSize ? yPrefs.intersectionSize(xPrefs) : xPrefs.intersectionSize(yPrefs);
+ if (intersectionSize == 0) {
+ return Double.NaN;
+ }
+
+ int unionSize = xPrefsSize + yPrefsSize - intersectionSize;
+
+ return (double) intersectionSize / (double) unionSize;
+ }
+
+ @Override
+ public double itemSimilarity(long itemID1, long itemID2) throws TasteException {
+ int preferring1 = getDataModel().getNumUsersWithPreferenceFor(itemID1);
+ return doItemSimilarity(itemID1, itemID2, preferring1);
+ }
+
+ @Override
+ public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException {
+ int preferring1 = getDataModel().getNumUsersWithPreferenceFor(itemID1);
+ int length = itemID2s.length;
+ double[] result = new double[length];
+ for (int i = 0; i < length; i++) {
+ result[i] = doItemSimilarity(itemID1, itemID2s[i], preferring1);
+ }
+ return result;
+ }
+
+ private double doItemSimilarity(long itemID1, long itemID2, int preferring1) throws TasteException {
+ DataModel dataModel = getDataModel();
+ int preferring1and2 = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2);
+ if (preferring1and2 == 0) {
+ return Double.NaN;
+ }
+ int preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2);
+ return (double) preferring1and2 / (double) (preferring1 + preferring2 - preferring1and2);
+ }
+
+ @Override
+ public void refresh(Collection<Refreshable> alreadyRefreshed) {
+ alreadyRefreshed = RefreshHelper.buildRefreshed(alreadyRefreshed);
+ RefreshHelper.maybeRefresh(alreadyRefreshed, getDataModel());
+ }
+
+ @Override
+ public String toString() {
+ return "TanimotoCoefficientSimilarity[dataModel:" + getDataModel() + ']';
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java
new file mode 100644
index 0000000..6260606
--- /dev/null
+++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity;
+
+import org.apache.mahout.cf.taste.common.TasteException;
+import org.apache.mahout.cf.taste.common.Weighting;
+import org.apache.mahout.cf.taste.model.DataModel;
+
+import com.google.common.base.Preconditions;
+
+/**
+ * <p>
+ * An implementation of the cosine similarity. The result is the cosine of the angle formed between
+ * the two preference vectors.
+ * </p>
+ *
+ * <p>
+ * Note that this similarity does not "center" its data, shifts the user's preference values so that each of their
+ * means is 0. For this behavior, use {@link PearsonCorrelationSimilarity}, which actually is mathematically
+ * equivalent for centered data.
+ * </p>
+ */
+public final class UncenteredCosineSimilarity extends AbstractSimilarity {
+
+ /**
+ * @throws IllegalArgumentException if {@link DataModel} does not have preference values
+ */
+ public UncenteredCosineSimilarity(DataModel dataModel) throws TasteException {
+ this(dataModel, Weighting.UNWEIGHTED);
+ }
+
+ /**
+ * @throws IllegalArgumentException if {@link DataModel} does not have preference values
+ */
+ public UncenteredCosineSimilarity(DataModel dataModel, Weighting weighting) throws TasteException {
+ super(dataModel, weighting, false);
+ Preconditions.checkArgument(dataModel.hasPreferenceValues(), "DataModel doesn't have preference values");
+ }
+
+ @Override
+ double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) {
+ if (n == 0) {
+ return Double.NaN;
+ }
+ double denominator = Math.sqrt(sumX2) * Math.sqrt(sumY2);
+ if (denominator == 0.0) {
+ // One or both parties has -all- the same ratings;
+ // can't really say much similarity under this measure
+ return Double.NaN;
+ }
+ return sumXY / denominator;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterable.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterable.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterable.java
new file mode 100644
index 0000000..1ae45c2
--- /dev/null
+++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterable.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity.file;
+
+import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Iterator;
+
+/**
+ * {@link Iterable} to be able to read a file linewise into a {@link GenericItemSimilarity}
+ */
+final class FileItemItemSimilarityIterable implements Iterable<GenericItemSimilarity.ItemItemSimilarity> {
+
+ private final File similaritiesFile;
+
+ FileItemItemSimilarityIterable(File similaritiesFile) {
+ this.similaritiesFile = similaritiesFile;
+ }
+
+ @Override
+ public Iterator<GenericItemSimilarity.ItemItemSimilarity> iterator() {
+ try {
+ return new FileItemItemSimilarityIterator(similaritiesFile);
+ } catch (IOException ioe) {
+ throw new IllegalStateException("Can't read " + similaritiesFile, ioe);
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterator.java
----------------------------------------------------------------------
diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterator.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterator.java
new file mode 100644
index 0000000..c071159
--- /dev/null
+++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterator.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.similarity.file;
+
+import com.google.common.base.Function;
+import com.google.common.collect.ForwardingIterator;
+import com.google.common.collect.Iterators;
+import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity;
+import org.apache.mahout.common.iterator.FileLineIterator;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.regex.Pattern;
+
+/**
+ * a simple iterator using a {@link FileLineIterator} internally, parsing each
+ * line into an {@link GenericItemSimilarity.ItemItemSimilarity}.
+ */
+final class FileItemItemSimilarityIterator extends ForwardingIterator<GenericItemSimilarity.ItemItemSimilarity> {
+
+ private static final Pattern SEPARATOR = Pattern.compile("[,\t]");
+
+ private final Iterator<GenericItemSimilarity.ItemItemSimilarity> delegate;
+
+ FileItemItemSimilarityIterator(File similaritiesFile) throws IOException {
+ delegate = Iterators.transform(
+ new FileLineIterator(similaritiesFile),
+ new Function<String, GenericItemSimilarity.ItemItemSimilarity>() {
+ @Override
+ public GenericItemSimilarity.ItemItemSimilarity apply(String from) {
+ String[] tokens = SEPARATOR.split(from);
+ return new GenericItemSimilarity.ItemItemSimilarity(Long.parseLong(tokens[0]),
+ Long.parseLong(tokens[1]),
+ Double.parseDouble(tokens[2]));
+ }
+ });
+ }
+
+ @Override
+ protected Iterator<GenericItemSimilarity.ItemItemSimilarity> delegate() {
+ return delegate;
+ }
+
+}