You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by gs...@apache.org on 2011/10/10 18:36:35 UTC
svn commit: r1181061 - /mahout/trunk/examples/bin/build-asf-email.sh
Author: gsingers
Date: Mon Oct 10 16:36:35 2011
New Revision: 1181061
URL: http://svn.apache.org/viewvc?rev=1181061&view=rev
Log:
MAHOUT-798: restrict the number of items per label to avoid overtraining
Modified:
mahout/trunk/examples/bin/build-asf-email.sh
Modified: mahout/trunk/examples/bin/build-asf-email.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-asf-email.sh?rev=1181061&r1=1181060&r2=1181061&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-asf-email.sh (original)
+++ mahout/trunk/examples/bin/build-asf-email.sh Mon Oct 10 16:36:35 2011
@@ -126,7 +126,7 @@ elif [ "x$alg" == "xclassification" ]; t
$MAHOUT seq2sparse --input $MAIL_OUT --output $SEQ2SP --norm 2 --weight TFIDF --namedVector --maxDFPercent 90 --minSupport 2 --analyzerName org.apache.mahout.text.MailArchivesClusteringAnalyzer
#We need to modify the vectors to have a better label
echo "Converting vector labels"
- $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver --input "$SEQ2SP/tfidf-vectors" --output $SEQ2SPLABEL --overwrite
+ $MAHOUT org.apache.mahout.classifier.email.PrepEmailVectorsDriver --input "$SEQ2SP/tfidf-vectors" --output $SEQ2SPLABEL --overwrite --maxItemsPerLabel 1000
fi
if [ "x$OVER" == "xover" ] || [ ! -e "$TRAIN/part-m-00000" ]; then
#setup train/test files