You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ss...@apache.org on 2013/01/18 11:57:36 UTC

svn commit: r1435084 - in /mahout/trunk/core/src: main/java/org/apache/mahout/cf/taste/hadoop/item/ main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ test/java/org/apache/mahout/cf/taste/hadoop/item/

Author: ssc
Date: Fri Jan 18 10:57:36 2013
New Revision: 1435084

URL: http://svn.apache.org/viewvc?rev=1435084&view=rev
Log:
MAHOUT-609 Add an option to make RecommenderJob write out its computed item similarities

Modified:
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
    mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
    mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJobTest.java

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java?rev=1435084&r1=1435083&r2=1435084&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java Fri Jan 18 10:57:36 2013
@@ -19,6 +19,7 @@ package org.apache.mahout.cf.taste.hadoo
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.DoubleWritable;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.JobContext;
 import org.apache.hadoop.mapreduce.Mapper;
@@ -27,8 +28,10 @@ import org.apache.hadoop.mapreduce.lib.i
 import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
 import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
 import org.apache.hadoop.util.ToolRunner;
+import org.apache.mahout.cf.taste.hadoop.EntityEntityWritable;
 import org.apache.mahout.cf.taste.hadoop.RecommendedItemsWritable;
 import org.apache.mahout.cf.taste.hadoop.preparation.PreparePreferenceMatrixJob;
+import org.apache.mahout.cf.taste.hadoop.similarity.item.ItemSimilarityJob;
 import org.apache.mahout.common.AbstractJob;
 import org.apache.mahout.common.HadoopUtil;
 import org.apache.mahout.common.iterator.sequencefile.PathType;
@@ -119,6 +122,8 @@ public final class RecommenderJob extend
     addOption("similarityClassname", "s", "Name of distributed similarity measures class to instantiate, " 
             + "alternatively use one of the predefined similarities (" + VectorSimilarityMeasures.list() + ')', true);
     addOption("threshold", "tr", "discard item pairs with a similarity value below this", false);
+    addOption("outputPathForSimilarityMatrix", "opfsm", "write the item similarity matrix to this path (optional)",
+        false);
 
     Map<String, List<String>> parsedArgs = parseArguments(args);
     if (parsedArgs == null) {
@@ -186,6 +191,22 @@ public final class RecommenderJob extend
         "--threshold", String.valueOf(threshold),
         "--tempDir", getTempPath().toString(),
       });
+
+      // write out the similarity matrix if the user specified that behavior
+      if (hasOption("outputPathForSimilarityMatrix")) {
+        Path outputPathForSimilarityMatrix = new Path(getOption("outputPathForSimilarityMatrix"));
+
+        Job outputSimilarityMatrix = prepareJob(similarityMatrixPath, outputPathForSimilarityMatrix,
+            SequenceFileInputFormat.class, ItemSimilarityJob.MostSimilarItemPairsMapper.class,
+            EntityEntityWritable.class, DoubleWritable.class, ItemSimilarityJob.MostSimilarItemPairsReducer.class,
+            EntityEntityWritable.class, DoubleWritable.class, TextOutputFormat.class);
+
+        Configuration mostSimilarItemsConf = outputSimilarityMatrix.getConfiguration();
+        mostSimilarItemsConf.set(ItemSimilarityJob.ITEM_ID_INDEX_PATH_STR,
+            new Path(prepPath, PreparePreferenceMatrixJob.ITEMID_INDEX).toString());
+        mostSimilarItemsConf.setInt(ItemSimilarityJob.MAX_SIMILARITIES_PER_ITEM, maxSimilaritiesPerItem);
+        outputSimilarityMatrix.waitForCompletion(true);
+      }
     }
 
     //start the multiplication of the co-occurrence matrix by the user vectors

Modified: mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java?rev=1435084&r1=1435083&r2=1435084&view=diff
==============================================================================
--- mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java (original)
+++ mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java Fri Jan 18 10:57:36 2013
@@ -80,8 +80,8 @@ import org.apache.mahout.math.map.OpenIn
  */
 public final class ItemSimilarityJob extends AbstractJob {
 
-  static final String ITEM_ID_INDEX_PATH_STR = ItemSimilarityJob.class.getName() + ".itemIDIndexPathStr";
-  static final String MAX_SIMILARITIES_PER_ITEM = ItemSimilarityJob.class.getName() + ".maxSimilarItemsPerItem";
+  public static final String ITEM_ID_INDEX_PATH_STR = ItemSimilarityJob.class.getName() + ".itemIDIndexPathStr";
+  public static final String MAX_SIMILARITIES_PER_ITEM = ItemSimilarityJob.class.getName() + ".maxSimilarItemsPerItem";
 
   private static final int DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM = 100;
   private static final int DEFAULT_MAX_PREFS_PER_USER = 1000;
@@ -215,7 +215,7 @@ public final class ItemSimilarityJob ext
     }
   }
 
-  static class MostSimilarItemPairsReducer
+  public static class MostSimilarItemPairsReducer
       extends Reducer<EntityEntityWritable,DoubleWritable,EntityEntityWritable,DoubleWritable> {
     @Override
     protected void reduce(EntityEntityWritable pair, Iterable<DoubleWritable> values, Context ctx)

Modified: mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJobTest.java
URL: http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJobTest.java?rev=1435084&r1=1435083&r2=1435084&view=diff
==============================================================================
--- mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJobTest.java (original)
+++ mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJobTest.java Fri Jan 18 10:57:36 2013
@@ -42,6 +42,7 @@ import org.apache.mahout.cf.taste.impl.T
 import org.apache.mahout.cf.taste.impl.common.FastIDSet;
 import org.apache.mahout.cf.taste.impl.recommender.GenericRecommendedItem;
 import org.apache.mahout.cf.taste.recommender.RecommendedItem;
+import org.apache.mahout.common.Pair;
 import org.apache.mahout.common.iterator.FileLineIterable;
 import org.apache.mahout.math.RandomAccessSparseVector;
 import org.apache.mahout.math.VarIntWritable;
@@ -704,6 +705,8 @@ public class RecommenderJobTest extends 
     File inputFile = getTestTempFile("prefs.txt");
     File outputDir = getTestTempDir("output");
     outputDir.delete();
+    File similaritiesOutputDir = getTestTempDir("outputSimilarities");
+    similaritiesOutputDir.delete();
     File tmpDir = getTestTempDir("tmp");
 
     writeLines(inputFile,
@@ -728,10 +731,10 @@ public class RecommenderJobTest extends 
     recommenderJob.setConf(conf);
 
     recommenderJob.run(new String[] { "--tempDir", tmpDir.getAbsolutePath(), "--similarityClassname",
-       TanimotoCoefficientSimilarity.class.getName(), "--numRecommendations", "4" });
+       TanimotoCoefficientSimilarity.class.getName(), "--numRecommendations", "4",
+        "--outputPathForSimilarityMatrix", similaritiesOutputDir.getAbsolutePath() });
 
     Map<Long,List<RecommendedItem>> recommendations = readRecommendations(new File(outputDir, "part-r-00000"));
-
     assertEquals(4, recommendations.size());
 
     for (Entry<Long,List<RecommendedItem>> entry : recommendations.entrySet()) {
@@ -767,6 +770,16 @@ public class RecommenderJobTest extends 
         assertEquals(3.5, item2.getValue(), 0.05);
       }
     }
+
+    Map<Pair<Long, Long>, Double> similarities = readSimilarities(new File(similaritiesOutputDir, "part-r-00000"));
+    assertEquals(6, similarities.size());
+
+    assertEquals(0.25, similarities.get(new Pair<Long, Long>(1l, 2l)), EPSILON);
+    assertEquals(0.6666666666666666, similarities.get(new Pair<Long, Long>(1l, 3l)), EPSILON);
+    assertEquals(0.5, similarities.get(new Pair<Long, Long>(1l, 4l)), EPSILON);
+    assertEquals(0.3333333333333333, similarities.get(new Pair<Long, Long>(2l, 3l)), EPSILON);
+    assertEquals(0.25, similarities.get(new Pair<Long, Long>(2l, 4l)), EPSILON);
+    assertEquals(0.25, similarities.get(new Pair<Long, Long>(3l, 4l)), EPSILON);
   }
 
   /**
@@ -880,11 +893,19 @@ public class RecommenderJobTest extends 
      assertEquals(3.5, recommendedItem.getValue(), 0.05);
    }
 
+  static Map<Pair<Long,Long>, Double> readSimilarities(File file) throws IOException {
+    Map<Pair<Long,Long>, Double> similarities = Maps.newHashMap();
+    for (String line : new FileLineIterable(file)) {
+      String[] parts = line.split("\t");
+      similarities.put(new Pair<Long,Long>(Long.parseLong(parts[0]), Long.parseLong(parts[1])),
+          Double.parseDouble(parts[2]));
+    }
+    return similarities;
+  }
 
   static Map<Long,List<RecommendedItem>> readRecommendations(File file) throws IOException {
     Map<Long,List<RecommendedItem>> recommendations = Maps.newHashMap();
-    Iterable<String> lineIterable = new FileLineIterable(file);
-    for (String line : lineIterable) {
+    for (String line : new FileLineIterable(file)) {
 
       String[] keyValue = line.split("\t");
       long userID = Long.parseLong(keyValue[0]);