You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ss...@apache.org on 2011/09/09 13:49:49 UTC

svn commit: r1167115 - in /mahout/trunk/core/src: main/java/org/apache/mahout/cf/taste/hadoop/item/ main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/ test/java/org/apache/mahou...

Author: ssc
Date: Fri Sep  9 11:49:49 2011
New Revision: 1167115

URL: http://svn.apache.org/viewvc?rev=1167115&view=rev
Log:
MAHOUT-767 Improve RowSimilarityJob performance, threshold integration

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/MostSimilarItemPairsMapper.java
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/MostSimilarItemPairsReducer.java
    mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java
    mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java?rev=1167115&r1=1167114&r2=1167115&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java Fri Sep  9 11:49:49 2011
@@ -62,9 +62,9 @@ import java.util.regex.Pattern;
  * <p>Command line arguments specific to this class are:</p>
  *
  * <ol>
- * <li>-Dmapred.input.dir=(path): Directory containing one or more text files with the preference data</li>
- * <li>-Dmapred.output.dir=(path): output path where recommender output should go</li>
- * <li>--similarityClassname (classname): Name of distributed similarity class to instantiate or a predefined similarity
+ * <li>--input (path): Directory containing one or more text files with the preference data</li>
+ * <li>--output (path): output path where recommender output should go</li>
+ * <li>--similarityClassname (classname): Name of vector similarity class to instantiate or a predefined similarity
  *  from {@link org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasure}</li>
  * <li>--usersFile (path): only compute recommendations for user IDs contained in this file (optional)</li>
  * <li>--itemsFile (path): only include item IDs from this file in the recommendations (optional)</li>
@@ -72,10 +72,12 @@ import java.util.regex.Pattern;
  * recommendations for that user (optional)</li>
  * <li>--numRecommendations (integer): Number of recommendations to compute per user (10)</li>
  * <li>--booleanData (boolean): Treat input data as having no pref values (false)</li>
- * <li>--maxPrefsPerUser (integer): Maximum number of preferences considered per user in
- *  final recommendation phase (10)</li>
+ * <li>--maxPrefsPerUser (integer): Maximum number of preferences considered per user in the final recommendation phase (10)</li>
  * <li>--maxSimilaritiesPerItem (integer): Maximum number of similarities considered per item (100)</li>
- * <li>--maxCooccurrencesPerItem (integer): Maximum number of cooccurrences considered per item (100)</li>
+ * <li>--minPrefsPerUser (integer): ignore users with fewer preferences than this in the similarity computation (1)</li>
+ * <li>--maxPrefsPerUserInItemSimilarity (integer): max number of preferences to consider per user in the item similarity computation phase,
+ * users with more preferences will be sampled down (1000)</li>
+ * <li>--threshold (double): discard item pairs with a similarity value below this</li>
  * </ol>
  *
  * <p>General command line options are documented in {@link AbstractJob}.</p>
@@ -115,6 +117,7 @@ public final class RecommenderJob extend
         DEFAULT_MAX_PREFS_PER_USER + ")", String.valueOf(DEFAULT_MAX_PREFS_PER_USER));
     addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, " +
         "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')');
+    addOption("threshold", "tr", "discard item pairs with a similarity value below this", false);
 
     Map<String,String> parsedArgs = parseArguments(args);
     if (parsedArgs == null) {
@@ -132,6 +135,9 @@ public final class RecommenderJob extend
     int maxPrefsPerUserInItemSimilarity = Integer.parseInt(parsedArgs.get("--maxPrefsPerUserInItemSimilarity"));
     int maxSimilaritiesPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
     String similarityClassname = parsedArgs.get("--similarityClassname");
+    double threshold = parsedArgs.containsKey("--threshold") ?
+        Double.parseDouble(parsedArgs.get("--threshold")) : RowSimilarityJob.NO_THRESHOLD;
+
 
     Path prepPath = getTempPath("preparePreferenceMatrix");
     Path similarityMatrixPath = getTempPath("similarityMatrix");
@@ -172,7 +178,9 @@ public final class RecommenderJob extend
         "--output", similarityMatrixPath.toString(),
         "--numberOfColumns", String.valueOf(numberOfUsers),
         "--similarityClassname", similarityClassname,
-        "--maxSimilaritiesPerRow", String.valueOf(maxSimilaritiesPerItem + 1),
+        "--maxSimilaritiesPerRow", String.valueOf(maxSimilaritiesPerItem),
+        "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE),
+        "--threshold", String.valueOf(threshold),
         "--tempDir", getTempPath().toString() });
     }
 

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java?rev=1167115&r1=1167114&r2=1167115&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java Fri Sep  9 11:49:49 2011
@@ -17,23 +17,33 @@
 
 package org.apache.mahout.cf.taste.hadoop.similarity.item;
 
+import java.io.IOException;
+import java.util.Iterator;
 import java.util.Map;
 import java.util.concurrent.atomic.AtomicInteger;
 
+import com.google.common.base.Preconditions;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 
 import org.apache.hadoop.io.DoubleWritable;
+import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.Reducer;
 import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
 import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.cf.taste.common.TopK;
 import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;
 import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
 import org.apache.mahout.cf.taste.hadoop.preparation.PreparePreferenceMatrixJob;
 import org.apache.mahout.common.AbstractJob;
+import org.apache.mahout.math.Vector;
+import org.apache.mahout.math.VectorWritable;
 import org.apache.mahout.math.hadoop.similarity.cooccurrence.RowSimilarityJob;
 import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasures;
+import org.apache.mahout.math.map.OpenIntLongHashMap;
 
 /**
  * <p>Distributed precomputation of the item-item-similarities for Itembased Collaborative Filtering</p>
@@ -95,6 +105,7 @@ public final class ItemSimilarityJob ext
     addOption("minPrefsPerUser", "mp", "ignore users with less preferences than this "
         + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
     addOption("booleanData", "b", "Treat input as without pref values", String.valueOf(Boolean.FALSE));
+    addOption("threshold", "tr", "discard item pairs with a similarity value below this", false);
 
     Map<String,String> parsedArgs = parseArguments(args);
     if (parsedArgs == null) {
@@ -107,50 +118,102 @@ public final class ItemSimilarityJob ext
     int minPrefsPerUser = Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
     boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));
 
+    double threshold = parsedArgs.containsKey("--threshold") ?
+        Double.parseDouble(parsedArgs.get("--threshold")) : RowSimilarityJob.NO_THRESHOLD;
+
     Path similarityMatrixPath = getTempPath("similarityMatrix");
     Path prepPath = getTempPath("prepareRatingMatrix");
 
     AtomicInteger currentPhase = new AtomicInteger();
 
-    ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[]{
-        "--input", getInputPath().toString(),
-        "--output", prepPath.toString(),
-        "--maxPrefsPerUser", String.valueOf(maxPrefsPerUser),
-        "--minPrefsPerUser", String.valueOf(minPrefsPerUser),
-        "--booleanData", String.valueOf(booleanData),
-        "--tempDir", getTempPath().toString()});
-
-    int numberOfUsers = TasteHadoopUtils.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS), getConf());
-
-    /* Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
-     * new DistributedRowMatrix(...).rowSimilarity(...) */
-    ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] {
-        "--input", new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(),
-        "--output", similarityMatrixPath.toString(),
-        "--numberOfColumns", String.valueOf(numberOfUsers),
-        "--similarityClassname", similarityClassName,
-        "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem + 1),
-        "--tempDir", getTempPath().toString() });
+    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+      ToolRunner.run(getConf(), new PreparePreferenceMatrixJob(), new String[]{
+          "--input", getInputPath().toString(),
+          "--output", prepPath.toString(),
+          "--maxPrefsPerUser", String.valueOf(maxPrefsPerUser),
+          "--minPrefsPerUser", String.valueOf(minPrefsPerUser),
+          "--booleanData", String.valueOf(booleanData),
+          "--tempDir", getTempPath().toString() });
+    }
 
     if (shouldRunNextPhase(parsedArgs, currentPhase)) {
-      Job mostSimilarItems = prepareJob(similarityMatrixPath,
-                                  getOutputPath(),
-                                  SequenceFileInputFormat.class,
-                                  MostSimilarItemPairsMapper.class,
-                                  EntityEntityWritable.class,
-                                  DoubleWritable.class,
-                                  MostSimilarItemPairsReducer.class,
-                                  EntityEntityWritable.class,
-                                  DoubleWritable.class,
-                                  TextOutputFormat.class);
+      int numberOfUsers = TasteHadoopUtils.readInt(new Path(prepPath, PreparePreferenceMatrixJob.NUM_USERS),
+          getConf());
+
+      ToolRunner.run(getConf(), new RowSimilarityJob(), new String[] {
+          "--input", new Path(prepPath, PreparePreferenceMatrixJob.RATING_MATRIX).toString(),
+          "--output", similarityMatrixPath.toString(),
+          "--numberOfColumns", String.valueOf(numberOfUsers),
+          "--similarityClassname", similarityClassName,
+          "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem),
+          "--excludeSelfSimilarity", String.valueOf(Boolean.TRUE),
+          "--threshold", String.valueOf(threshold),
+          "--tempDir", getTempPath().toString() });
+    }
+
+    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+      Job mostSimilarItems = prepareJob(similarityMatrixPath, getOutputPath(), SequenceFileInputFormat.class,
+          MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class,
+          MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class, TextOutputFormat.class);
       Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
       mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR,
           new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
       mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
-      mostSimilarItems.setCombinerClass(MostSimilarItemPairsReducer.class);
       mostSimilarItems.waitForCompletion(true);
     }
 
     return 0;
   }
+  /** Emits up to maxSimilarItemsPerItem (itemID pair, similarity) records per row of the similarity matrix. */
+  public static class MostSimilarItemPairsMapper
+      extends Mapper<IntWritable,VectorWritable,EntityEntityWritable,DoubleWritable> {
+    // Translates internal item indexes (row keys and vector positions) to item IDs; loaded in setup().
+    private OpenIntLongHashMap indexItemIDMap;
+    private int maxSimilarItemsPerItem;
+    /** Loads the limit and the index-to-itemID map from the job configuration; fails if the limit is not positive. */
+    @Override
+    protected void setup(Context ctx) {
+      Configuration conf = ctx.getConfiguration();
+      maxSimilarItemsPerItem = conf.getInt(ItemSimilarityJob.MAX_SIMILARITIES_PER_ITEM, -1);
+      indexItemIDMap = TasteHadoopUtils.readItemIDIndexMap(conf.get(ItemSimilarityJob.ITEM_ID_INDEX_PATH_STR), conf);
+
+      Preconditions.checkArgument(maxSimilarItemsPerItem > 0, "maxSimilarItemsPerItem was not correctly set!");
+    }
+    /** Writes the row's retained similar items as ID pairs, always with the smaller item ID first. */
+    @Override
+    protected void map(IntWritable itemIDIndexWritable, VectorWritable similarityVector, Context ctx)
+      throws IOException, InterruptedException {
+
+      int itemIDIndex = itemIDIndexWritable.get();
+      // Bounded to maxSimilarItemsPerItem entries; presumably keeps the highest similarities (see TopK).
+      TopK<SimilarItem> topKMostSimilarItems =
+          new TopK<SimilarItem>(maxSimilarItemsPerItem, SimilarItem.COMPARE_BY_SIMILARITY);
+      // Only non-zero similarities are offered; the vector position is mapped to the other item's ID.
+      Iterator<Vector.Element> similarityVectorIterator = similarityVector.get().iterateNonZero();
+
+      while (similarityVectorIterator.hasNext()) {
+        Vector.Element element = similarityVectorIterator.next();
+        topKMostSimilarItems.offer(new SimilarItem(indexItemIDMap.get(element.index()), element.get()));
+      }
+      // Smaller ID goes first in the key; presumably so (a,b) and (b,a) reach the reducer under the same key.
+      long itemID = indexItemIDMap.get(itemIDIndex);
+      for (SimilarItem similarItem : topKMostSimilarItems.retrieve()) {
+        long otherItemID = similarItem.getItemID();
+        if (itemID < otherItemID) {
+          ctx.write(new EntityEntityWritable(itemID, otherItemID), new DoubleWritable(similarItem.getSimilarity()));
+        } else {
+          ctx.write(new EntityEntityWritable(otherItemID, itemID), new DoubleWritable(similarItem.getSimilarity()));
+        }
+      }
+    }
+  }
+  /** Collapses duplicate item pairs: writes each pair once with the first similarity value received. */
+  static class MostSimilarItemPairsReducer
+      extends Reducer<EntityEntityWritable,DoubleWritable,EntityEntityWritable,DoubleWritable> {
+    @Override
+    protected void reduce(EntityEntityWritable pair, Iterable<DoubleWritable> values, Context ctx)
+        throws IOException, InterruptedException {
+      ctx.write(pair, values.iterator().next()); // later values are dropped; presumably they are equal
+    }
+  }
 }

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java?rev=1167115&r1=1167114&r2=1167115&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/math/hadoop/similarity/cooccurrence/RowSimilarityJob.java Fri Sep  9 11:49:49 2011
@@ -35,7 +35,6 @@ import org.apache.mahout.math.Vector;
 import org.apache.mahout.math.VectorWritable;
 import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasures;
 import org.apache.mahout.math.hadoop.similarity.cooccurrence.measures.VectorSimilarityMeasure;
-import org.apache.mahout.math.map.OpenIntDoubleHashMap;
 import org.apache.mahout.math.map.OpenIntIntHashMap;
 
 import java.io.IOException;
@@ -47,18 +46,19 @@ import java.util.concurrent.atomic.Atomi
 
 public class RowSimilarityJob extends AbstractJob {
 
+  public static final double NO_THRESHOLD = Double.MIN_VALUE;
+
   static final String SIMILARITY_CLASSNAME = RowSimilarityJob.class + ".distributedSimilarityClassname";
   static final String NUMBER_OF_COLUMNS = RowSimilarityJob.class + ".numberOfColumns";
   static final String MAX_SIMILARITIES_PER_ROW = RowSimilarityJob.class + ".maxSimilaritiesPerRow";
   static final String EXCLUDE_SELF_SIMILARITY = RowSimilarityJob.class + ".excludeSelfSimilarity";
-  static final String THRESHOLD = RowSimilarityJob.class + ".threshold";
 
+  static final String THRESHOLD = RowSimilarityJob.class + ".threshold";
   static final String NORMS_PATH = RowSimilarityJob.class + ".normsPath";
   static final String MAXVALUES_PATH = RowSimilarityJob.class + ".maxWeightsPath";
-  static final String NUM_NON_ZERO_ENTRIES_PATH = RowSimilarityJob.class + ".nonZeroEntriesPath";
 
+  static final String NUM_NON_ZERO_ENTRIES_PATH = RowSimilarityJob.class + ".nonZeroEntriesPath";
   private static final int DEFAULT_MAX_SIMILARITIES_PER_ROW = 100;
-  private static final double NO_THRESHOLD = Double.MIN_VALUE;
 
   private static final int NORM_VECTOR_MARKER = Integer.MIN_VALUE;
   private static final int MAXVALUE_VECTOR_MARKER = Integer.MIN_VALUE + 1;
@@ -70,6 +70,7 @@ public class RowSimilarityJob extends Ab
     ToolRunner.run(new RowSimilarityJob(), args);
   }
 
+
   @Override
   public int run(String[] args) throws Exception {
 
@@ -81,7 +82,7 @@ public class RowSimilarityJob extends Ab
     addOption("maxSimilaritiesPerRow", "m", "Number of maximum similarities per row (default: "
         + DEFAULT_MAX_SIMILARITIES_PER_ROW + ')', String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));
     addOption("excludeSelfSimilarity", "ess", "compute similarity of rows to themselves?", String.valueOf(false));
-    addOption("threshold", "tr", "drop row pairs with a similarity value below this");
+    addOption("threshold", "tr", "discard row pairs with a similarity value below this", false);
 
     Map<String,String> parsedArgs = parseArguments(args);
     if (parsedArgs == null) {

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java?rev=1167115&r1=1167114&r2=1167115&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java Fri Sep  9 11:49:49 2011
@@ -47,7 +47,7 @@ import org.junit.Test;
 public final class ItemSimilarityJobTest extends TasteTestCase {
 
   /**
-   * Tests {@link MostSimilarItemPairsMapper}
+   * Tests {@link ItemSimilarityJob.MostSimilarItemPairsMapper}
    */
   @Test
   public void testMostSimilarItemsPairsMapper() throws Exception {
@@ -66,10 +66,9 @@ public final class ItemSimilarityJobTest
 
     Vector vector = new RandomAccessSparseVector(Integer.MAX_VALUE);
     vector.set(12, 0.2);
-    vector.set(34, 1.0);
     vector.set(56, 0.9);
 
-    MostSimilarItemPairsMapper mapper = new MostSimilarItemPairsMapper();
+    ItemSimilarityJob.MostSimilarItemPairsMapper mapper = new ItemSimilarityJob.MostSimilarItemPairsMapper();
     setField(mapper, "indexItemIDMap", indexItemIDMap);
     setField(mapper, "maxSimilarItemsPerItem", 1);
 
@@ -79,7 +78,7 @@ public final class ItemSimilarityJobTest
   }
 
   /**
-   * Tests {@link MostSimilarItemPairsReducer}
+   * Tests {@link ItemSimilarityJob.MostSimilarItemPairsReducer}
    */
   @Test
   public void testMostSimilarItemPairsReducer() throws Exception {
@@ -90,7 +89,7 @@ public final class ItemSimilarityJobTest
 
     EasyMock.replay(context);
 
-    new MostSimilarItemPairsReducer().reduce(new EntityEntityWritable(123L, 456L),
+    new ItemSimilarityJob.MostSimilarItemPairsReducer().reduce(new EntityEntityWritable(123L, 456L),
         Arrays.asList(new DoubleWritable(0.5), new DoubleWritable(0.5)), context);
 
     EasyMock.verify(context);