You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2009/10/20 14:19:19 UTC
svn commit: r827046 -
/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/RecommenderJob.java
Author: srowen
Date: Tue Oct 20 12:19:19 2009
New Revision: 827046
URL: http://svn.apache.org/viewvc?rev=827046&view=rev
Log:
Committing my Hadoop 0.20.x changes even though it still runs into a Hadoop bug
Modified:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/RecommenderJob.java
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/RecommenderJob.java
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/RecommenderJob.java?rev=827046&r1=827045&r2=827046&view=diff
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/RecommenderJob.java (original)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/RecommenderJob.java Tue Oct 20 12:19:19 2009
@@ -32,10 +32,8 @@
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.log4j.Logger;
import org.apache.mahout.cf.taste.recommender.Recommender;
@@ -47,21 +45,68 @@
/**
* <p>This class configures and runs a {@link RecommenderMapper} using Hadoop.</p>
*
- * <p>Command line arguments are:</p> <ol> <li>Fully-qualified class name of {@link Recommender} to use to make
- * recommendations. Note that it must have a constructor which takes a {@link org.apache.mahout.cf.taste.model.DataModel}
- * argument.</li> <li>Number of recommendations to compute per user</li> <li>Location of a text file containing user IDs
- * for which recommendations should be computed, one per line</li> <li>Location of a data model file containing
- * preference data, suitable for use with {@link org.apache.mahout.cf.taste.impl.model.file.FileDataModel}</li>
- * <li>Output path where reducer output should go</li> </ol>
+ * <p>Command line arguments are:</p>
*
- * <p>Example:</p>
+ * <ol>
+ * <li>Fully-qualified class name of {@link Recommender} to use to make
+ * recommendations. Note that it must have a constructor which takes a
+ * {@link org.apache.mahout.cf.taste.model.DataModel} argument.</li>
+ * <li>Number of recommendations to compute per user</li>
+ * <li>Location of a text file containing user IDs
+ * for which recommendations should be computed, one per line</li>
+ * <li>Location of a data model file containing preference data,
+ * suitable for use with {@link org.apache.mahout.cf.taste.impl.model.file.FileDataModel}</li>
+ * <li>Output path where reducer output should go</li>
+ * </ol>
+ *
+ * <p>Example arguments:</p>
*
* <p><code>org.apache.mahout.cf.taste.impl.recommender.slopeone.SlopeOneRecommender 10 path/to/users.txt
- * path/to/data.csv path/to/reducerOutputDir 5</code></p>
+ * path/to/data.csv path/to/reducerOutputDir</code></p>
+ *
+ * <p>
+ * Set up Hadoop in a pseudo-distributed manner: http://hadoop.apache.org/common/docs/current/quickstart.html
+ * You can stop at the point where it instructs you to copy files into HDFS. Instead, proceed as follow.</p>
+ *
+ * {@code
+ * hadoop fs -mkdir input
+ * hadoop fs -mkdir output
+ * }
+ *
+ * <p>We need to massage the BX input a little bit and also create a file of user IDs:</p>
+ *
+ * {@code
+ * tail +2 BX-Book-Ratings.csv | tr -cd '[:digit:];\n' | tr ';' ',' | grep -v ',,' > input.csv
+ * # Mac users: put "export LC_ALL=C;" at the front of this command. You may want to "unset LC_ALL" after.
+ * cut -d, -f1 input.csv | uniq > users.txt
+ * }
+ *
+ * <p>Now we put the file in input/ and prepare output/:</p>
+ *
+ * {@code
+ * hadoop fs -put input.csv input/input.csv
+ * hadoop fs -put users.txt input/users.txt
+ * hadoop fs -mkdir output/
+ * }
+ *
+ * <p>Now build Mahout code using your IDE, or Maven. Note where the compiled classes go. If you built with
+ * Maven, it'll be like (Mahout directory)/core/target/classes/. Prepare a .jar file for Hadoop:</p>
+ *
+ * {@code
+ * jar cvf recommender.jar -C (classes directory) .
+ * }
+ *
+ * <p>And launch:</p>
+ *
+ * {@code
+ * hadoop jar recommender.jar org.apache.mahout.cf.taste.hadoop.RecommenderJob \
+ * org.apache.mahout.cf.taste.impl.recommender.slopeone.SlopeOneRecommender \
+ * 10 input/users.txt input/input.csv recommender.jar output
+ * }
*/
public final class RecommenderJob extends Job {
- /** Logger for this class. */
- private static Logger LOG = Logger.getLogger(SlopeOneDiffsToAveragesJob.class);
+
+ private static final Logger log = Logger.getLogger(RecommenderJob.class);
public RecommenderJob(Configuration jobConf) throws IOException {
super(jobConf);
@@ -87,6 +132,11 @@
Option dataModelFileOpt = obuilder.withLongName("dataModelFile").withRequired(true)
.withShortName("m").withArgument(abuilder.withName("dataModelFile").withMinimum(1)
.withMaximum(1).create()).withDescription("File containing data model.").create();
+
+ Option jarFileOpt = obuilder.withLongName("jarFile").withRequired(true)
+ .withShortName("m").withArgument(abuilder.withName("jarFile").withMinimum(1)
+ .withMaximum(1).create()).withDescription("Implementation jar.").create();
+
Option outputOpt = DefaultOptionCreator.outputOption(obuilder, abuilder).create();
Option helpOpt = DefaultOptionCreator.helpOption(obuilder);
@@ -108,13 +158,14 @@
int recommendationsPerUser = Integer.parseInt(cmdLine.getValue(userRecommendOpt).toString());
String userIDFile = cmdLine.getValue(userIDFileOpt).toString();
String dataModelFile = cmdLine.getValue(dataModelFileOpt).toString();
+ String jarFile = cmdLine.getValue(jarFileOpt).toString();
String outputPath = cmdLine.getValue(outputOpt).toString();
Configuration jobConf =
- buildJobConf(recommendClassName, recommendationsPerUser, userIDFile, dataModelFile, outputPath);
+ buildJobConf(recommendClassName, recommendationsPerUser, userIDFile, dataModelFile, jarFile, outputPath);
Job job = new RecommenderJob(jobConf);
job.waitForCompletion(true);
} catch (OptionException e) {
- LOG.error(e.getMessage());
+ log.error(e.getMessage());
CommandLineUtil.printHelp(group);
}
}
@@ -123,6 +174,7 @@
int recommendationsPerUser,
String userIDFile,
String dataModelFile,
+ String jarFile,
String outputPath) throws IOException {
Configuration jobConf = new Configuration();
@@ -135,6 +187,8 @@
fs.delete(outputPathPath, true);
}
+ jobConf.set("mapred.jar", jarFile);
+
jobConf.set(RecommenderMapper.RECOMMENDER_CLASS_NAME, recommendClassName);
jobConf.set(RecommenderMapper.RECOMMENDATIONS_PER_USER, String.valueOf(recommendationsPerUser));
jobConf.set(RecommenderMapper.DATA_MODEL_FILE, dataModelFile);
@@ -150,7 +204,7 @@
jobConf.setClass("mapred.output.key.class", LongWritable.class, Object.class);
jobConf.setClass("mapred.output.value.class", RecommendedItemsWritable.class, Object.class);
- jobConf.setClass("mapred.output.format.class", TextOutputFormat.class, OutputFormat.class);
+ //jobConf.setClass("mapred.output.format.class", TextOutputFormat.class, OutputFormat.class);
jobConf.set("mapred.output.dir", StringUtils.escapeString(outputPathPath.toString()));
return jobConf;