You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ro...@apache.org on 2012/06/04 17:54:18 UTC

svn commit: r1346021 - in /mahout/trunk: bin/mahout examples/bin/asf-email-examples.sh

Author: robinanil
Date: Mon Jun  4 15:54:18 2012
New Revision: 1346021

URL: http://svn.apache.org/viewvc?rev=1346021&view=rev
Log:
MAHOUT-1006 Fixes to run the ASF email classification examples with Naive Bayes

Modified:
    mahout/trunk/bin/mahout
    mahout/trunk/examples/bin/asf-email-examples.sh

Modified: mahout/trunk/bin/mahout
URL: http://svn.apache.org/viewvc/mahout/trunk/bin/mahout?rev=1346021&r1=1346020&r2=1346021&view=diff
==============================================================================
--- mahout/trunk/bin/mahout (original)
+++ mahout/trunk/bin/mahout Mon Jun  4 15:54:18 2012
@@ -184,6 +184,7 @@ fi
 
 MAHOUT_OPTS="$MAHOUT_OPTS -Dhadoop.log.dir=$MAHOUT_LOG_DIR"
 MAHOUT_OPTS="$MAHOUT_OPTS -Dhadoop.log.file=$MAHOUT_LOGFILE"
+MAHOUT_OPTS="$MAHOUT_OPTS -Dmapred.min.split.size=256MB"
 
 if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
   MAHOUT_OPTS="$MAHOUT_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH"

Modified: mahout/trunk/examples/bin/asf-email-examples.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/asf-email-examples.sh?rev=1346021&r1=1346020&r2=1346021&view=diff
==============================================================================
--- mahout/trunk/examples/bin/asf-email-examples.sh (original)
+++ mahout/trunk/examples/bin/asf-email-examples.sh Mon Jun  4 15:54:18 2012
@@ -58,7 +58,6 @@ START_PATH=`pwd`
 MAHOUT="../../bin/mahout"
 ASF_ARCHIVES=$1
 OUT=$2
-export MAHOUT_HEAPSIZE=2048
 
 algorithm=( recommender clustering classification clean )
 if [ -n "$3" ]; then
@@ -182,10 +181,10 @@ elif [ "x$alg" == "xclassification" ]; t
   # the label is the project_name_mailing_list, as in tomcat.apache.org_dev
   #Convert to vectors
   if [ "x$classAlg" == "xstandard" ] || [ "x$classAlg" == "xcomplementary" ]; then
+	set -x
     CLASS="$OUT/classification/bayesian"
     MAIL_OUT="$CLASS/seq-files"
     SEQ2SP="$CLASS/seq2sparse"
-    SEQ2SPLABEL="$CLASS/labeled"
     SPLIT="$CLASS/splits"
     TRAIN="$SPLIT/train"
     TEST="$SPLIT/test"
@@ -193,19 +192,16 @@ elif [ "x$alg" == "xclassification" ]; t
     LABEL="$SPLIT/labels"
     if ! fileExists "$MAIL_OUT/chunk-0"; then
       echo "Converting Mail files to Sequence Files"
-      $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset "UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT
+      $MAHOUT org.apache.mahout.text.SequenceFilesFromMailArchives --charset "UTF-8" --subject --body --input $ASF_ARCHIVES --output $MAIL_OUT -chunk 128
     fi
     if ! fileExists "$SEQ2SP/dictionary.file-0"; then
       echo "Converting the files to sparse vectors"
-      $MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight TFIDF --namedVector --maxDFPercent 90 --minSupport 2 --analyzerName org.apache.mahout.text.MailArchivesClusteringAnalyzer
-      #We need to modify the vectors to have a better label
-      echo "Converting vector labels"
-      $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver --input "$SEQ2SP/tfidf-vectors" --output $SEQ2SPLABEL --overwrite --maxItemsPerLabel 1000
+      $MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight TFIDF --namedVector -lnorm --maxDFPercent 90 --minSupport 2 --analyzerName org.apache.mahout.text.MailArchivesClusteringAnalyzer -chunk 1000
     fi
     if ! fileExists "$TRAIN/part-m-00000"; then
       #setup train/test files
       echo "Creating training and test inputs"
-      $MAHOUT split --input $SEQ2SPLABEL --trainingOutput $TRAIN --testOutput $TEST --randomSelectionPct 20 --overwrite --sequenceFiles
+      $MAHOUT split --input $SEQ2SP/tfidf-vectors --trainingOutput $TRAIN --testOutput $TEST --randomSelectionPct 20 --overwrite --sequenceFiles -xm sequential
     fi
     MODEL="$CLASS/model"
     if [ "x$classAlg" == "xstandard" ]; then