You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2009/12/03 13:54:01 UTC

svn commit: r886766 - in /lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl: common/ model/file/ recommender/slopeone/ recommender/slopeone/file/

Author: srowen
Date: Thu Dec  3 12:54:00 2009
New Revision: 886766

URL: http://svn.apache.org/viewvc?rev=886766&view=rev
Log:
Added FileDiffStorage

Added:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverage.java
      - copied, changed from r885376, lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/InvertedRunningAverage.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverageAndStdDev.java
      - copied, changed from r885376, lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/InvertedRunningAverageAndStdDev.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/file/
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/file/FileDiffStorage.java
Removed:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/InvertedRunningAverage.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/InvertedRunningAverageAndStdDev.java
Modified:
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
    lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/MemoryDiffStorage.java

Copied: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverage.java (from r885376, lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/InvertedRunningAverage.java)
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverage.java?p2=lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverage.java&p1=lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/InvertedRunningAverage.java&r1=885376&r2=886766&rev=886766&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/InvertedRunningAverage.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverage.java Thu Dec  3 12:54:00 2009
@@ -15,15 +15,13 @@
  * limitations under the License.
  */
 
-package org.apache.mahout.cf.taste.impl.recommender.slopeone;
+package org.apache.mahout.cf.taste.impl.common;
 
-import org.apache.mahout.cf.taste.impl.common.RunningAverage;
-
-final class InvertedRunningAverage implements RunningAverage {
+public final class InvertedRunningAverage implements RunningAverage {
 
   private final RunningAverage delegate;
 
-  InvertedRunningAverage(RunningAverage delegate) {
+  public InvertedRunningAverage(RunningAverage delegate) {
     this.delegate = delegate;
   }
 

Copied: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverageAndStdDev.java (from r885376, lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/InvertedRunningAverageAndStdDev.java)
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverageAndStdDev.java?p2=lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverageAndStdDev.java&p1=lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/InvertedRunningAverageAndStdDev.java&r1=885376&r2=886766&rev=886766&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/InvertedRunningAverageAndStdDev.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/common/InvertedRunningAverageAndStdDev.java Thu Dec  3 12:54:00 2009
@@ -15,15 +15,13 @@
  * limitations under the License.
  */
 
-package org.apache.mahout.cf.taste.impl.recommender.slopeone;
+package org.apache.mahout.cf.taste.impl.common;
 
-import org.apache.mahout.cf.taste.impl.common.RunningAverageAndStdDev;
-
-final class InvertedRunningAverageAndStdDev implements RunningAverageAndStdDev {
+public final class InvertedRunningAverageAndStdDev implements RunningAverageAndStdDev {
 
   private final RunningAverageAndStdDev delegate;
 
-  InvertedRunningAverageAndStdDev(RunningAverageAndStdDev delegate) {
+  public InvertedRunningAverageAndStdDev(RunningAverageAndStdDev delegate) {
     this.delegate = delegate;
   }
 

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java?rev=886766&r1=886765&r2=886766&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java Thu Dec  3 12:54:00 2009
@@ -145,7 +145,7 @@
       iterator.next();
       firstLine = iterator.peek();
     }
-    char delimiter = determineDelimiter(firstLine);
+    char delimiter = determineDelimiter(firstLine, 2);
     boolean hasPrefValues = firstLine.indexOf(delimiter, firstLine.indexOf(delimiter) + 1) >= 0;
 
     if (hasPrefValues) {
@@ -186,7 +186,7 @@
     return updateFiles;
   }
 
-  private static char determineDelimiter(String line) {
+  public static char determineDelimiter(String line, int maxDelimiters) {
     char delimiter;
     if (line.indexOf(',') >= 0) {
       delimiter = ',';
@@ -200,8 +200,8 @@
     int nextDelimiter;
     while ((nextDelimiter = line.indexOf(delimiter, lastDelimiter + 1)) >= 0) {
       delimiterCount++;
-      if (delimiterCount == 3) {
-        throw new IllegalArgumentException("More than two delimiters per line");
+      if (delimiterCount > maxDelimiters) {
+        throw new IllegalArgumentException("More than " + maxDelimiters + " delimiters per line");
       }
       if (nextDelimiter == lastDelimiter + 1) {
         // empty field

Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/MemoryDiffStorage.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/MemoryDiffStorage.java?rev=886766&r1=886765&r2=886766&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/MemoryDiffStorage.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/MemoryDiffStorage.java Thu Dec  3 12:54:00 2009
@@ -26,6 +26,8 @@
 import org.apache.mahout.cf.taste.impl.common.FastIDSet;
 import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
 import org.apache.mahout.cf.taste.impl.common.FullRunningAverageAndStdDev;
+import org.apache.mahout.cf.taste.impl.common.InvertedRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.InvertedRunningAverageAndStdDev;
 import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
 import org.apache.mahout.cf.taste.impl.common.RefreshHelper;
 import org.apache.mahout.cf.taste.impl.common.RunningAverage;
@@ -126,7 +128,13 @@
       itemID2 = temp;
     }
 
-    FastByIDMap<RunningAverage> level2Map = averageDiffs.get(itemID1);
+    FastByIDMap<RunningAverage> level2Map;
+    try {
+      buildAverageDiffsLock.readLock().lock();
+      level2Map = averageDiffs.get(itemID1);
+    } finally {
+      buildAverageDiffsLock.readLock().unlock();
+    }
     RunningAverage average = null;
     if (level2Map != null) {
       average = level2Map.get(itemID2);
@@ -200,7 +208,13 @@
 
   @Override
   public FastIDSet getRecommendableItemIDs(long userID) throws TasteException {
-    FastIDSet result = allRecommendableItemIDs.clone();
+    FastIDSet result;
+    try {
+      buildAverageDiffsLock.readLock().lock();
+      result = allRecommendableItemIDs.clone();
+    } finally {
+      buildAverageDiffsLock.readLock().unlock();
+    }
     Iterator<Long> it = result.iterator();
     while (it.hasNext()) {
       if (dataModel.getPreferenceValue(userID, it.next()) != null) {

Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/file/FileDiffStorage.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/file/FileDiffStorage.java?rev=886766&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/file/FileDiffStorage.java (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/slopeone/file/FileDiffStorage.java Thu Dec  3 12:54:00 2009
@@ -0,0 +1,394 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.impl.recommender.slopeone.file;
+
+import org.apache.mahout.cf.taste.common.Refreshable;
+import org.apache.mahout.cf.taste.impl.common.FastByIDMap;
+import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.FullRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.InvertedRunningAverage;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.RunningAverage;
+import org.apache.mahout.cf.taste.impl.model.file.FileDataModel;
+import org.apache.mahout.cf.taste.model.PreferenceArray;
+import org.apache.mahout.cf.taste.recommender.slopeone.DiffStorage;
+import org.apache.mahout.common.FileLineIterator;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+
+/**
+ * <p>{@link DiffStorage} which reads pre-computed diffs from a file and stores
+ * them in memory. The file should have one diff per line:</p>
+ *
+ * {@code itemID1,itemID2,diff}
+ *
+ * <p>Commas or tabs can be delimiters. This is intended for use in conjunction
+ * with the output of
+ * {@link org.apache.mahout.cf.taste.hadoop.SlopeOneDiffsToAveragesJob}.</p>
+ *
+ * <p>Empty lines and lines beginning with '#' are skipped. The file is read lazily, on the
+ * first call that needs the diffs, and is re-read by {@link #refresh(Collection)} when its
+ * modification time has advanced by more than a minute.</p>
+ */
+public final class FileDiffStorage implements DiffStorage {
+
+  private static final Logger log = LoggerFactory.getLogger(FileDiffStorage.class);
+
+  /** How far the file's modification time must advance before a refresh reloads it. */
+  private static final long MIN_RELOAD_INTERVAL_MS = 60 * 1000L; // 1 minute?
+  /** Lines beginning with this character are treated as comments and skipped. */
+  private static final char COMMENT_CHAR = '#';
+
+  private final File dataFile;
+  /** Modification time of the data file seen at construction or at the last refresh reload. */
+  private long lastModified;
+  /** True once the first lazy load has been attempted; see {@link #checkLoaded()}. */
+  private boolean loaded;
+  private final long maxEntries;
+  /** Average diffs, keyed first by the smaller item ID of a pair and then by the larger. */
+  private final FastByIDMap<FastByIDMap<RunningAverage>> averageDiffs;
+  private final FastIDSet allRecommendableItemIDs;
+  /** Write lock is taken while rebuilding the structures above; read lock while using them. */
+  private final ReadWriteLock buildAverageDiffsLock;
+
+  /**
+   * Creates a storage backed by the given file. Nothing is read here; the diffs are loaded
+   * on first use.
+   *
+   * @param dataFile diffs file
+   * @param maxEntries maximum number of diffs to store
+   * @throws FileNotFoundException if data file does not exist or is a directory
+   * @throws IllegalArgumentException if dataFile is null or maxEntries is not positive
+   */
+  public FileDiffStorage(File dataFile, long maxEntries) throws FileNotFoundException {
+    if (dataFile == null) {
+      throw new IllegalArgumentException("dataFile is null");
+    }
+    if (!dataFile.exists() || dataFile.isDirectory()) {
+      throw new FileNotFoundException(dataFile.toString());
+    }
+    if (maxEntries <= 0L) {
+      throw new IllegalArgumentException("maxEntries must be positive");
+    }
+
+    log.info("Creating FileDiffStorage for file " + dataFile);
+
+    this.dataFile = dataFile.getAbsoluteFile();
+    this.lastModified = dataFile.lastModified();
+    this.maxEntries = maxEntries;
+    this.averageDiffs = new FastByIDMap<FastByIDMap<RunningAverage>>();
+    this.allRecommendableItemIDs = new FastIDSet();
+    this.buildAverageDiffsLock = new ReentrantReadWriteLock();
+  }
+
+  /**
+   * Clears the in-memory diffs and reloads them from the data file. Returns without doing
+   * anything if the write lock cannot be acquired immediately. An {@link IOException} is
+   * logged rather than rethrown.
+   */
+  private void buildDiffs() {
+    if (!buildAverageDiffsLock.writeLock().tryLock()) {
+      return;
+    }
+    try {
+
+      averageDiffs.clear();
+      allRecommendableItemIDs.clear();
+
+      FileLineIterator iterator = new FileLineIterator(dataFile, false);
+      // Skip leading empty and comment lines. Checking hasNext() first means a file with no
+      // data lines at all simply loads as empty instead of failing on a missing first line.
+      String firstLine = null;
+      while (firstLine == null && iterator.hasNext()) {
+        String candidate = iterator.peek();
+        if (candidate.length() == 0 || candidate.charAt(0) == COMMENT_CHAR) {
+          iterator.next();
+        } else {
+          firstLine = candidate;
+        }
+      }
+
+      if (firstLine != null) {
+        char delimiter = FileDataModel.determineDelimiter(firstLine, 2);
+        long averageCount = 0L;
+        while (iterator.hasNext()) {
+          averageCount = processLine(iterator.next(), delimiter, averageCount);
+        }
+      }
+
+      pruneInconsequentialDiffs();
+      updateAllRecommendableItems();
+
+    } catch (IOException ioe) {
+      log.warn("Exception while reloading", ioe);
+    } finally {
+      buildAverageDiffsLock.writeLock().unlock();
+    }
+  }
+
+  /**
+   * Parses one line of the form {@code itemID1,itemID2,diff} and adds the diff to the running
+   * average kept for that item pair. Empty lines and comment lines are ignored.
+   *
+   * @param line line to parse
+   * @param delimiter character separating the three fields
+   * @param averageCount number of item-pair averages created so far
+   * @return number of item-pair averages created so far, including any created by this call
+   * @throws IllegalArgumentException if the line has fewer than two delimiters
+   */
+  private long processLine(String line, char delimiter, long averageCount) {
+
+    if (line.length() == 0 || line.charAt(0) == COMMENT_CHAR) {
+      return averageCount;
+    }
+
+    int delimiterOne = line.indexOf(delimiter);
+    if (delimiterOne < 0) {
+      throw new IllegalArgumentException("Bad line: " + line);
+    }
+    int delimiterTwo = line.indexOf(delimiter, delimiterOne + 1);
+    if (delimiterTwo < 0) {
+      throw new IllegalArgumentException("Bad line: " + line);
+    }
+
+    long itemID1 = Long.parseLong(line.substring(0, delimiterOne));
+    long itemID2 = Long.parseLong(line.substring(delimiterOne + 1, delimiterTwo));
+    double diff = Double.parseDouble(line.substring(delimiterTwo + 1));
+
+    // Pairs are always stored under the smaller item ID first.
+    // NOTE(review): the diff is not negated when the IDs are swapped - confirm that is intended
+    if (itemID1 > itemID2) {
+      long temp = itemID1;
+      itemID1 = itemID2;
+      itemID2 = temp;
+    }
+
+    FastByIDMap<RunningAverage> level1Map = averageDiffs.get(itemID1);
+    if (level1Map == null) {
+      level1Map = new FastByIDMap<RunningAverage>();
+      averageDiffs.put(itemID1, level1Map);
+    }
+    RunningAverage average = level1Map.get(itemID2);
+    // A new pair is only admitted while under the maxEntries cap; known pairs always update
+    if (average == null && averageCount < maxEntries) {
+      average = new FullRunningAverage();
+      level1Map.put(itemID2, average);
+      averageCount++;
+    }
+    if (average != null) {
+      average.addDatum(diff);
+    }
+
+    allRecommendableItemIDs.add(itemID1);
+    allRecommendableItemIDs.add(itemID2);
+
+    return averageCount;
+  }
+
+  /**
+   * Removes every average backed by at most one data point, drops per-item maps left empty
+   * by that, and rehashes the maps that remain.
+   */
+  private void pruneInconsequentialDiffs() {
+    // Go back and prune inconsequential diffs. "Inconsequential" means, here, only represented by one
+    // data point, so possibly unreliable
+    Iterator<Map.Entry<Long, FastByIDMap<RunningAverage>>> it1 = averageDiffs.entrySet().iterator();
+    while (it1.hasNext()) {
+      FastByIDMap<RunningAverage> map = it1.next().getValue();
+      Iterator<Map.Entry<Long, RunningAverage>> it2 = map.entrySet().iterator();
+      while (it2.hasNext()) {
+        RunningAverage average = it2.next().getValue();
+        if (average.getCount() <= 1) {
+          it2.remove();
+        }
+      }
+      if (map.isEmpty()) {
+        it1.remove();
+      } else {
+        map.rehash();
+      }
+    }
+    averageDiffs.rehash();
+  }
+
+  /** Adds every item ID still present in {@link #averageDiffs} to the recommendable set. */
+  private void updateAllRecommendableItems() {
+    for (Map.Entry<Long, FastByIDMap<RunningAverage>> entry : averageDiffs.entrySet()) {
+      allRecommendableItemIDs.add(entry.getKey());
+      LongPrimitiveIterator it = entry.getValue().keySetIterator();
+      while (it.hasNext()) {
+        allRecommendableItemIDs.add(it.next());
+      }
+    }
+    allRecommendableItemIDs.rehash();
+  }
+
+  /**
+   * Triggers the initial load on first use. The flag is set as soon as {@link #buildDiffs()}
+   * returns normally, even if that call skipped the load or only logged an I/O failure.
+   */
+  private void checkLoaded() {
+    if (!loaded) {
+      buildDiffs();
+      loaded = true;
+    }
+  }
+
+  /**
+   * Looks up the stored average diff for a pair of items, in either order.
+   *
+   * @return the stored average, wrapped in an {@link InvertedRunningAverage} when itemID1 is
+   *  the larger of the two IDs, or null if nothing is stored for the pair
+   */
+  @Override
+  public RunningAverage getDiff(long itemID1, long itemID2) {
+    checkLoaded();
+
+    boolean inverted = false;
+    if (itemID1 > itemID2) {
+      inverted = true;
+      long temp = itemID1;
+      itemID1 = itemID2;
+      itemID2 = temp;
+    }
+
+    FastByIDMap<RunningAverage> level2Map;
+    // Acquire before entering try so that unlock() only runs when the lock is actually held
+    buildAverageDiffsLock.readLock().lock();
+    try {
+      level2Map = averageDiffs.get(itemID1);
+    } finally {
+      buildAverageDiffsLock.readLock().unlock();
+    }
+    RunningAverage average = null;
+    if (level2Map != null) {
+      average = level2Map.get(itemID2);
+    }
+    if (inverted) {
+      if (average == null) {
+        return null;
+      }
+      return new InvertedRunningAverage(average);
+    } else {
+      return average;
+    }
+  }
+
+  /**
+   * Looks up the diff between each item in the given preferences and the given item.
+   *
+   * @return one {@link #getDiff(long, long)} result per preference, in order; entries may be null
+   */
+  @Override
+  public RunningAverage[] getDiffs(long userID, long itemID, PreferenceArray prefs) {
+    checkLoaded();
+    buildAverageDiffsLock.readLock().lock();
+    try {
+      int size = prefs.length();
+      RunningAverage[] result = new RunningAverage[size];
+      for (int i = 0; i < size; i++) {
+        result[i] = getDiff(prefs.getItemID(i), itemID);
+      }
+      return result;
+    } finally {
+      buildAverageDiffsLock.readLock().unlock();
+    }
+  }
+
+  /** Not supported by this implementation: always returns null. */
+  @Override
+  public RunningAverage getAverageItemPref(long itemID) {
+    checkLoaded();
+    return null; // TODO can't do this without a DataModel
+  }
+
+  /**
+   * Applies a preference change for one item to every stored average involving that item.
+   * The delta's sign is flipped depending on whether the item is the first or second ID of a pair.
+   *
+   * @param itemID item whose preference changed
+   * @param prefDelta amount by which the preference changed
+   * @param remove if true the datum is removed from each average rather than changed
+   */
+  @Override
+  public void updateItemPref(long itemID, float prefDelta, boolean remove) {
+    checkLoaded();
+    buildAverageDiffsLock.readLock().lock();
+    try {
+      for (Map.Entry<Long, FastByIDMap<RunningAverage>> entry : averageDiffs.entrySet()) {
+        boolean matchesItemID1 = itemID == entry.getKey();
+        for (Map.Entry<Long, RunningAverage> entry2 : entry.getValue().entrySet()) {
+          RunningAverage average = entry2.getValue();
+          if (matchesItemID1) {
+            if (remove) {
+              average.removeDatum(prefDelta);
+            } else {
+              average.changeDatum(-prefDelta);
+            }
+          } else if (itemID == entry2.getKey()) {
+            if (remove) {
+              average.removeDatum(-prefDelta);
+            } else {
+              average.changeDatum(prefDelta);
+            }
+          }
+        }
+      }
+      // No per-item average preference is kept by this class (getAverageItemPref returns null),
+      // so there is nothing further to update here.
+    } finally {
+      buildAverageDiffsLock.readLock().unlock();
+    }
+  }
+
+  /** Returns a copy of the set of all item IDs seen in the diffs file; userID is not used. */
+  @Override
+  public FastIDSet getRecommendableItemIDs(long userID) {
+    checkLoaded();
+    buildAverageDiffsLock.readLock().lock();
+    try {
+      return allRecommendableItemIDs.clone();
+    } finally {
+      buildAverageDiffsLock.readLock().unlock();
+    }
+  }
+
+  /**
+   * Reloads the diffs if the data file's modification time is more than
+   * {@link #MIN_RELOAD_INTERVAL_MS} past the last recorded one. The argument is not used.
+   */
+  @Override
+  public void refresh(Collection<Refreshable> alreadyRefreshed) {
+    long mostRecentModification = dataFile.lastModified();
+    if (mostRecentModification > lastModified + MIN_RELOAD_INTERVAL_MS) {
+      log.debug("File has changed; reloading...");
+      lastModified = mostRecentModification;
+      buildDiffs();
+    }
+  }
+
+}