You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@mahout.apache.org by Erdem Sahin <es...@nyu.edu> on 2015/04/28 01:57:20 UTC
trainnb labelindex not found error - help requested
Hi Mahout users,
I'm trying to run the classify-20-newsgroups.sh script
<https://github.com/apache/mahout/blob/master/examples/bin/classify-20newsgroups.sh>
and
it fails with a FileNotFoundException when it gets to the "trainnb"
command. All prior steps run successfully. I'm trying algo 1 or algo 2.
I have modified the script slightly so that it reads my input data instead
of the canonical data set. I've created a "wifidata" folder on the local FS
which has the following structure:
wifidata/havewifi
wifidata/nowifi
and within havewifi and nowifi, there exist files with text file names and
text content.These eventually get copied to HDFS.
I'm not clear if the "labelindex" file, which cannot be found, is supposed
to be created by trainnb or by a prior step.
Please see the details of the modified script and the error below. Any help
would be appreciated.
Thanks and best regards,
Erdem Sahin
Script:
if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
echo "This script runs SGD and Bayes classifiers over the classic 20 News
Groups."
exit
fi
SCRIPT_PATH=${0%/*}
if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
cd $SCRIPT_PATH
fi
START_PATH=`pwd`
# Set commands for dfs
source ${START_PATH}/set-dfs-commands.sh
WORK_DIR=/tmp/mahout-work-${USER}
algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark
naivebayes-Spark sgd clean)
if [ -n "$1" ]; then
choice=$1
else
echo "Please select a number to choose the corresponding task to run"
echo "1. ${algorithm[0]}"
echo "2. ${algorithm[1]}"
echo "3. ${algorithm[2]}"
echo "4. ${algorithm[3]}"
echo "5. ${algorithm[4]}"
echo "6. ${algorithm[5]}-- cleans up the work area in $WORK_DIR"
read -p "Enter your choice : " choice
fi
echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
alg=${algorithm[$choice-1]}
# Spark specific check and work
if [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ];
then
if [ "$MASTER" == "" ] ; then
echo "Please set your MASTER env variable to point to your Spark Master
URL. exiting..."
exit 1
fi
if [ "$MAHOUT_LOCAL" != "" ] ; then
echo "Options 3 and 4 can not run in MAHOUT_LOCAL mode. exiting..."
exit 1
fi
fi
#echo $START_PATH
cd $START_PATH
cd ../..
set -e
if ( [ "x$alg" == "xnaivebayes-MapReduce" ] || [ "x$alg" ==
"xcnaivebayes-MapReduce" ] || [ "x$alg" == "xnaivebayes-Spark" ] || [
"x$alg" == "xcnaivebayes-Spark" ] ); then
c=""
if [ "x$alg" == "xcnaivebayes-MapReduce" -o "x$alg" ==
"xnaivebayes-Spark" ]; then
c=" -c"
fi
set -x
echo "Preparing 20newsgroups data"
rm -rf ${WORK_DIR}/20news-all
mkdir ${WORK_DIR}/20news-all
cp -R $START_PATH/wifidata/* ${WORK_DIR}/20news-all
echo "Copying 20newsgroups data to HDFS"
set +e
$DFSRM ${WORK_DIR}/20news-all
$DFS -mkdir ${WORK_DIR}
$DFS -mkdir ${WORK_DIR}/20news-all
set -e
if [ $HVERSION -eq "1" ] ; then
echo "Copying 20newsgroups data to Hadoop 1 HDFS"
$DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/20news-all
elif [ $HVERSION -eq "2" ] ; then
echo "Copying 20newsgroups data to Hadoop 2 HDFS"
$DFS -put ${WORK_DIR}/20news-all ${WORK_DIR}/
fi
echo "Creating sequence files from 20newsgroups data"
/usr/bin/mahout seqdirectory \
-i ${WORK_DIR}/20news-all \
-o ${WORK_DIR}/20news-seq -ow
echo "Converting sequence files to vectors"
/usr/bin/mahout seq2sparse \
-i ${WORK_DIR}/20news-seq \
-o ${WORK_DIR}/20news-vectors -lnorm -nv -wt tfidf
echo "Creating training and holdout set with a random 80-20 split of the
generated vector dataset"
/usr/bin/mahout split \
-i ${WORK_DIR}/20news-vectors/tfidf-vectors \
--trainingOutput ${WORK_DIR}/20news-train-vectors \
--testOutput ${WORK_DIR}/20news-test-vectors \
--randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential
if [ "x$alg" == "xnaivebayes-MapReduce" -o "x$alg" ==
"xcnaivebayes-MapReduce" ]; then
echo "Training Naive Bayes model"
/usr/bin/mahout trainnb \
-i ${WORK_DIR}/20news-train-vectors \
-o ${WORK_DIR}/model \
-li ${WORK_DIR}/labelindex \
-ow $c
echo "Self testing on training set"
/usr/bin/mahout testnb \
-i ${WORK_DIR}/20news-train-vectors\
-m ${WORK_DIR}/model \
-l ${WORK_DIR}/labelindex \
-ow -o ${WORK_DIR}/20news-testing $c
echo "Testing on holdout set"
/usr/bin/mahout testnb \
-i ${WORK_DIR}/20news-test-vectors\
-m ${WORK_DIR}/model \
-l ${WORK_DIR}/labelindex \
-ow -o ${WORK_DIR}/20news-testing $c
elif [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" ==
"xcnaivebayes-Spark" ]; then
echo "Training Naive Bayes model"
/usr/bin/mahout spark-trainnb \
-i ${WORK_DIR}/20news-train-vectors \
-o ${WORK_DIR}/spark-model $c -ow -ma $MASTER
echo "Self testing on training set"
/usr/bin/mahout spark-testnb \
-i ${WORK_DIR}/20news-train-vectors\
-m ${WORK_DIR}/spark-model $c -ma $MASTER
echo "Testing on holdout set"
/usr/bin/mahout spark-testnb \
-i ${WORK_DIR}/20news-test-vectors\
-m ${WORK_DIR}/spark-model $c -ma $MASTER
fi
elif [ "x$alg" == "xsgd" ]; then
if [ ! -e "/tmp/news-group.model" ]; then
echo "Training on ${WORK_DIR}/20news-bydate/20news-bydate-train/"
/usr/bin/mahout org.apache.mahout.classifier.sgd.TrainNewsGroups
${WORK_DIR}/20news-bydate/20news-bydate-train/
fi
echo "Testing on ${WORK_DIR}/20news-bydate/20news-bydate-test/ with
model: /tmp/news-group.model"
/usr/bin/mahout org.apache.mahout.classifier.sgd.TestNewsGroups --input
${WORK_DIR}/20news-bydate/20news-bydate-test/ --model /tmp/news-group.model
elif [ "x$alg" == "xclean" ]; then
rm -rf $WORK_DIR
rm -rf /tmp/news-group.model
$DFSRM $WORK_DIR
fi
# Remove the work directory
#
Error mesage:
$ /usr/bin/mahout trainnb \
> -i ${WORK_DIR}/20news-train-vectors \
> -o ${WORK_DIR}/model \
> -li ${WORK_DIR}/labelindex \
> -ow
MAHOUT_LOCAL is not set; adding HADOOP_CONF_DIR to classpath.
Running on hadoop, using /usr/lib/hadoop/bin/hadoop and
HADOOP_CONF_DIR=/etc/hadoop/conf
MAHOUT-JOB: /usr/lib/mahout/mahout-examples-0.9-cdh5.3.0-job.jar
15/04/27 16:41:27 WARN driver.MahoutDriver: No trainnb.props found on
classpath, will use command-line arguments only
15/04/27 16:41:28 INFO common.AbstractJob: Command line arguments:
{--alphaI=[1.0], --endPhase=[2147483647],
--input=[/tmp/mahout-work-cloudera/20news-train-vectors],
--labelIndex=[/tmp/mahout-work-cloudera/labelindex],
--output=[/tmp/mahout-work-cloudera/model], --overwrite=null,
--startPhase=[0], --tempDir=[temp]}
15/04/27 16:41:36 INFO common.HadoopUtil: Deleting temp
15/04/27 16:41:36 INFO Configuration.deprecation: mapred.input.dir is
deprecated. Instead, use mapreduce.input.fileinputformat.inputdir
15/04/27 16:41:36 INFO Configuration.deprecation:
mapred.compress.map.output is deprecated. Instead, use
mapreduce.map.output.compress
15/04/27 16:41:36 INFO Configuration.deprecation: mapred.output.dir is
deprecated. Instead, use mapreduce.output.fileoutputformat.outputdir
15/04/27 16:41:36 INFO client.RMProxy: Connecting to ResourceManager at /
0.0.0.0:8032
15/04/27 16:41:44 INFO mapreduce.JobSubmitter: Cleaning up the staging area
/tmp/hadoop-yarn/staging/cloudera/.staging/job_1430097605337_0028
15/04/27 16:41:44 WARN security.UserGroupInformation:
PriviledgedActionException as:cloudera (auth:SIMPLE)
cause:java.io.FileNotFoundException: File does not exist:
/tmp/mahout-work-cloudera/labelindex
Exception in thread "main" java.io.FileNotFoundException: File does not
exist: /tmp/mahout-work-cloudera/labelindex
at
org.apache.hadoop.hdfs.DistributedFileSystem$17.doCall(DistributedFileSystem.java:1093)
at
org.apache.hadoop.hdfs.DistributedFileSystem$17.doCall(DistributedFileSystem.java:1085)
at
org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
at
org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1085)
at
org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager.getFileStatus(ClientDistributedCacheManager.java:288)
at
org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager.getFileStatus(ClientDistributedCacheManager.java:224)
at
org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager.determineTimestamps(ClientDistributedCacheManager.java:93)
at
org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager.determineTimestampsAndCacheVisibilities(ClientDistributedCacheManager.java:57)
at
org.apache.hadoop.mapreduce.JobSubmitter.copyAndConfigureFiles(JobSubmitter.java:267)
at
org.apache.hadoop.mapreduce.JobSubmitter.copyAndConfigureFiles(JobSubmitter.java:388)
at
org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:481)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1295)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1292)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1642)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:1292)
at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:1313)
at
org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob.run(TrainNaiveBayesJob.java:114)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
at
org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob.main(TrainNaiveBayesJob.java:64)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at
org.apache.hadoop.util.ProgramDriver$ProgramDescription.invoke(ProgramDriver.java:72)
at org.apache.hadoop.util.ProgramDriver.run(ProgramDriver.java:145)
at org.apache.hadoop.util.ProgramDriver.driver(ProgramDriver.java:153)
at org.apache.mahout.driver.MahoutDriver.main(MahoutDriver.java:195)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.hadoop.util.RunJar.main(RunJar.java:212)