You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sr...@apache.org on 2009/12/24 12:58:36 UTC
svn commit: r893734 - in
/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/cooccurence:
prepare.pl run.sh
Author: srowen
Date: Thu Dec 24 11:58:35 2009
New Revision: 893734
URL: http://svn.apache.org/viewvc?rev=893734&view=rev
Log:
Forgot these as part of MAHOUT-103
Added:
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/cooccurence/prepare.pl
lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/cooccurence/run.sh
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/cooccurence/prepare.pl
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/cooccurence/prepare.pl?rev=893734&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/cooccurence/prepare.pl (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/cooccurence/prepare.pl Thu Dec 24 11:58:35 2009
@@ -0,0 +1,88 @@
+#!/usr/bin/perl -w
+
+# Unpacks the Netflix Prize training_set.tar archive and converts the
+# per-movie rating files into tab-separated user-history files.
+use strict;
+use warnings;
+
+my $counter    = 0;    # number of files converted so far (progress reporting)
+my $totalFiles = 0;    # total number of files to convert; set by start()
+my $step       = 1;    # next whole-percent progress milestone to announce
+$| = 1;                # unbuffer STDOUT so progress output appears immediately
+# extract($file): untar $file into the current directory.
+# Prints a status line and returns the raw $? from tar (0 on success).
+sub extract($) {
+    my $file = shift;
+    print "Now extracting: [$file]\n";
+    # List-form system() bypasses the shell, so a filename containing
+    # spaces or metacharacters cannot be mis-split or injected.
+    system( 'tar', 'xf', $file );
+    my $ret = $?;
+    if ( $ret == 0 ) {
+        print "[$file] extracted successfully.\n";
+    }
+    else {
+        print "Failed to extract file: [$file]\n";
+    }
+    return $ret;
+}
+
+# convert($id, @files): merge the given per-movie rating files into a single
+# "User_history_$id.dat".  Each input file begins with a "movieId:" line
+# followed by "user,rating,date" lines; each output record becomes
+# "user\tmovieId\trating\tdate".  Updates the file-scoped $counter/$step
+# progress state as files are consumed.
+sub convert(@) {
+    my $id          = shift;
+    my $outFileName = "User_history_$id.dat";
+
+    # Three-arg open with lexical filehandles replaces the bareword
+    # local-glob handles; same files, same modes.
+    open my $out, '>', $outFileName
+      or die "Cannot open [$outFileName] for writing.\n";
+    for my $file (@_) {
+        open my $in, '<', "training_set/$file"
+          or die "Cannot open [training_set/$file] for reading.\n";
+        my $movieId = <$in>;
+        # First line is "movieId:" -- drop the trailing ":" and newline.
+        # NOTE(review): assumes Unix line endings; verify input files.
+        $movieId = substr $movieId, 0, length($movieId) - 2;
+        while ( my $userDetails = <$in> ) {
+            # "user,rating,date" -> "user\tmovieId\trating\tdate".
+            # The original /e modifier was needless: the replacement is a
+            # plain interpolated string, so a straight substitution suffices.
+            $userDetails =~ s/,(\d),/\t$movieId\t$1\t/g;
+            print {$out} $userDetails;
+        }
+        close $in;
+        $counter++;
+        if ( ( ( $counter * 100.0 ) / $totalFiles ) >= $step ) {
+            print "\nTotal Completed: $step %";
+            $step++;
+        }
+    }
+    # Buffered write errors only surface at close, so check it.
+    close $out or die "Cannot close [$outFileName]: $!\n";
+}
+
+# start($parts, @files): split @files into $parts roughly equal chunks and
+# convert each chunk into its own User_history_<uid>.dat file.
+sub start($@) {
+    my $parts    = shift;
+    my @allFiles = @_;
+    $totalFiles = @allFiles;
+    my $partSize = ( $totalFiles / $parts );
+    $partSize = $partSize < 1 ? 1 : $partSize;
+    my $uid = 0;
+    print "Total files to be converted: [$totalFiles]\n";
+    print "Starting data conversion ...";
+    for ( my $start = 0 ; $start < $totalFiles ; $start += $partSize ) {
+        # Clamp the final chunk: the original slice could run past the end
+        # of @allFiles, handing undef entries to convert(), which would then
+        # die trying to open "training_set/" with an undefined name.
+        my $end = $start + $partSize - 1;
+        $end = $#allFiles if $end > $#allFiles;
+        convert( $uid, @allFiles[ $start .. $end ] );
+        $uid++;
+    }
+}
+
+# main(): unpack training_set.tar, convert every ratings file, then remove
+# the unpacked directory on success.
+sub main {
+    if ( extract("training_set.tar") == 0 ) {
+        opendir my $dir, "training_set"
+          or die "Cannot open directory [training_set]: $!\n";
+        # Keep only real entries: skip ".", ".." and any hidden files.
+        my @files = grep { ( substr $_, 0, 1 ) ne '.' } readdir $dir;
+        closedir $dir;
+        start( 10, @files );
+        # NOTE(review): $? reflects the last system()/backtick call (the tar
+        # above), NOT whether start() succeeded -- start()/convert() die() on
+        # failure instead, so reaching this point means conversion completed.
+        if ( $? == 0 ) {
+            print("\nCompleted!\n");
+            # List-form system() avoids shelling out with an interpolated
+            # command string.
+            system( 'rm', '-rf', 'training_set' );
+        }
+        else {
+            print("Data Conversion failed\n");
+        }
+    }
+}
+
+main();
Added: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/cooccurence/run.sh
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/cooccurence/run.sh?rev=893734&view=auto
==============================================================================
--- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/cooccurence/run.sh (added)
+++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/cooccurence/run.sh Thu Dec 24 11:58:35 2009
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+TASK_OPTS=-Dmapred.job.queue.name=grideng\ -Dmapred.map.tasks.speculative.execution=true\ -Dmapred.reduce.tasks.speculative.execution=true
+#JAVA_OPTS=-Dmapred.child.java.opts=-Xmx1280m\ -server\ -Djava.net.preferIPv4Stack=true
+NETFLIX_IN_DIR=netflix-data
+NETFLIX_OUT_DIR=netflix-out
+BIGRAMS_OUT_DIR=$NETFLIX_OUT_DIR/bigrams
+SIMILAR_MOVIES_OUT_DIR=$NETFLIX_OUT_DIR/similarMovies
+USER_MOVIES_JOINED_DIR=$NETFLIX_OUT_DIR/user_movies_joined
+RECOMMENDATIONS_DIR=$NETFLIX_OUT_DIR/recommendations
+MAX_RECOMMENDATIONS=50
+MAX_SIMILAR_MOVIES=100
+REDUCERS=200
+
+echo "Generating bigrams for movie similarity computation ..."
+hadoop dfs -rmr $BIGRAMS_OUT_DIR
+hadoop jar mahout-core-0.3-SNAPSHOT.jar org.apache.mahout.cf.taste.hadoop.cooccurence.ItemBigramGenerator -Dmapred.child.java.opts=-Xmx1280m\ -server\ -Djava.net.preferIPv4Stack=true $TASK_OPTS $NETFLIX_IN_DIR $BIGRAMS_OUT_DIR $REDUCERS
+echo "Done."
+echo "Computing co-occurrence based movie similarity scores ..."
+hadoop dfs -rmr $SIMILAR_MOVIES_OUT_DIR
+hadoop jar mahout-core-0.3-SNAPSHOT.jar org.apache.mahout.cf.taste.hadoop.cooccurence.ItemSimilarityEstimator -Dmapred.child.java.opts=-Xmx1280m\ -server\ -Djava.net.preferIPv4Stack=true $TASK_OPTS $BIGRAMS_OUT_DIR $SIMILAR_MOVIES_OUT_DIR $MAX_SIMILAR_MOVIES $REDUCERS
+echo "Done."
+echo "Joining User history with similar items ..."
+hadoop dfs -rmr $USER_MOVIES_JOINED_DIR
+hadoop jar mahout-core-0.3-SNAPSHOT.jar org.apache.mahout.cf.taste.hadoop.cooccurence.UserItemJoiner -Dmapred.child.java.opts=-Xmx1280m\ -server\ -Djava.net.preferIPv4Stack=true $TASK_OPTS $NETFLIX_IN_DIR $SIMILAR_MOVIES_OUT_DIR $USER_MOVIES_JOINED_DIR $REDUCERS
+echo "Done."
+
+echo "Generating recommendations now ..."
+hadoop dfs -rmr $RECOMMENDATIONS_DIR
+hadoop jar mahout-core-0.3-SNAPSHOT.jar org.apache.mahout.cf.taste.hadoop.cooccurence.UserItemRecommender -Dmapred.child.java.opts=-Xmx1280m\ -server\ -Djava.net.preferIPv4Stack=true $TASK_OPTS $USER_MOVIES_JOINED_DIR $RECOMMENDATIONS_DIR $MAX_RECOMMENDATIONS $REDUCERS
+