You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@mahout.apache.org by Erdem Sahin <es...@nyu.edu> on 2015/04/28 01:57:20 UTC
trainnb labelindex not found error - help requested
Hi Mahout users,
I'm trying to run the classify-20-newsgroups.sh script
<https://github.com/apache/mahout/blob/master/examples/bin/classify-20newsgroups.sh>
and
it fails with a FileNotFoundException when it gets to the "trainnb"
command. All prior steps run successfully. I'm trying algo 1 or algo 2.
I have modified the script slightly so that it reads my input data instead
of the canonical data set. I've created a "wifidata" folder on the local FS
which has the following structure:
wifidata/havewifi
wifidata/nowifi
and within havewifi and nowifi, there exist files with text file names and
text content. These eventually get copied to HDFS.
I'm not clear if the "labelindex" file, which cannot be found, is supposed
to be created by trainnb or by a prior step.
Please see the details of the modified script and the error below. Any help
would be appreciated.
Thanks and best regards,
Erdem Sahin
Script:
# Print usage and exit when the user asks for help.
# (The usage string had been split across two lines by e-mail wrapping,
# which put a literal newline inside the quoted message; rejoined here.)
if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
  echo "This script runs SGD and Bayes classifiers over the classic 20 News Groups."
  exit
fi
# Resolve the directory this script lives in and cd into it, so relative
# paths (e.g. set-dfs-commands.sh, the wifidata folder) work no matter
# where the script is invoked from.
SCRIPT_PATH=${0%/*}
if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
  # Quote the path and abort if the cd fails, rather than silently
  # continuing in the wrong directory.
  cd "$SCRIPT_PATH" || exit 1
fi
START_PATH=$(pwd)
# Set commands for dfs: this helper defines $DFS, $DFSRM and $HVERSION
# used further down.
source "${START_PATH}/set-dfs-commands.sh"
WORK_DIR=/tmp/mahout-work-${USER}
# Menu of runnable tasks; the user's 1-based choice maps to index choice-1.
# (The array literal was rejoined after being wrapped by e-mail formatting.)
algorithm=( cnaivebayes-MapReduce naivebayes-MapReduce cnaivebayes-Spark naivebayes-Spark sgd clean )
if [ -n "$1" ]; then
  choice=$1
else
  echo "Please select a number to choose the corresponding task to run"
  echo "1. ${algorithm[0]}"
  echo "2. ${algorithm[1]}"
  echo "3. ${algorithm[2]}"
  echo "4. ${algorithm[3]}"
  echo "5. ${algorithm[4]}"
  echo "6. ${algorithm[5]}-- cleans up the work area in $WORK_DIR"
  read -p "Enter your choice : " choice
fi
# Reject anything other than 1-6 up front, instead of indexing the array
# with an empty or garbage value (empty choice would evaluate to index -1).
case "$choice" in
  [1-6]) ;;
  *) echo "Invalid choice: '$choice'. exiting..." >&2; exit 1 ;;
esac
echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
alg=${algorithm[$choice-1]}
# Spark-specific preconditions: the Spark jobs need a Spark master URL in
# $MASTER and cannot run in Mahout's local mode.
# ([[ ]] with || replaces the deprecated, ambiguous `[ ... -o ... ]` form;
# wrapped message strings rejoined onto single lines.)
if [[ "$alg" == "naivebayes-Spark" || "$alg" == "cnaivebayes-Spark" ]]; then
  if [[ -z "$MASTER" ]]; then
    echo "Please set your MASTER env variable to point to your Spark Master URL. exiting..."
    exit 1
  fi
  if [[ -n "$MAHOUT_LOCAL" ]]; then
    echo "Options 3 and 4 can not run in MAHOUT_LOCAL mode. exiting..."
    exit 1
  fi
fi
#echo $START_PATH
# Move to the project root (two levels above examples/bin); abort on failure
# so later relative paths cannot silently operate in the wrong directory.
cd "$START_PATH" || exit 1
cd ../.. || exit 1
set -e
if [ "x$alg" == "xnaivebayes-MapReduce" ] || [ "x$alg" == "xcnaivebayes-MapReduce" ] || \
   [ "x$alg" == "xnaivebayes-Spark" ]   || [ "x$alg" == "xcnaivebayes-Spark" ]; then
  c=""
  # The -c flag selects the *complementary* naive-bayes variants.
  # FIX: the pasted script tested for "naivebayes-Spark" here, which would
  # train plain NB with -c and complementary Spark NB without it; the
  # canonical classify-20newsgroups.sh tests for cnaivebayes-Spark.
  if [ "x$alg" == "xcnaivebayes-MapReduce" -o "x$alg" == "xcnaivebayes-Spark" ]; then
    c=" -c"
  fi
  set -x
  echo "Preparing 20newsgroups data"
  rm -rf "${WORK_DIR}/20news-all"
  # FIX: -p creates ${WORK_DIR} too; plain mkdir aborts under `set -e`
  # when the work directory does not exist yet.
  mkdir -p "${WORK_DIR}/20news-all"
  cp -R "$START_PATH"/wifidata/* "${WORK_DIR}/20news-all"
  echo "Copying 20newsgroups data to HDFS"
  # HDFS cleanup/creation is best-effort: the paths may not exist yet.
  set +e
  $DFSRM "${WORK_DIR}/20news-all"
  $DFS -mkdir "${WORK_DIR}"
  $DFS -mkdir "${WORK_DIR}/20news-all"
  set -e
  if [ "$HVERSION" -eq 1 ]; then
    echo "Copying 20newsgroups data to Hadoop 1 HDFS"
    $DFS -put "${WORK_DIR}/20news-all" "${WORK_DIR}/20news-all"
  elif [ "$HVERSION" -eq 2 ]; then
    echo "Copying 20newsgroups data to Hadoop 2 HDFS"
    $DFS -put "${WORK_DIR}/20news-all" "${WORK_DIR}/"
  fi
  echo "Creating sequence files from 20newsgroups data"
  /usr/bin/mahout seqdirectory \
    -i "${WORK_DIR}/20news-all" \
    -o "${WORK_DIR}/20news-seq" -ow
  echo "Converting sequence files to vectors"
  /usr/bin/mahout seq2sparse \
    -i "${WORK_DIR}/20news-seq" \
    -o "${WORK_DIR}/20news-vectors" -lnorm -nv -wt tfidf
  # NOTE: --randomSelectionPct 40 produces a 60-40 train/holdout split, not
  # the 80-20 the original message claimed; the message is corrected to
  # match the flag actually passed.
  echo "Creating training and holdout set with a random 60-40 split of the generated vector dataset"
  /usr/bin/mahout split \
    -i "${WORK_DIR}/20news-vectors/tfidf-vectors" \
    --trainingOutput "${WORK_DIR}/20news-train-vectors" \
    --testOutput "${WORK_DIR}/20news-test-vectors" \
    --randomSelectionPct 40 --overwrite --sequenceFiles -xm sequential
  if [ "x$alg" == "xnaivebayes-MapReduce" -o "x$alg" == "xcnaivebayes-MapReduce" ]; then
    echo "Training Naive Bayes model"
    # FIX (the reported FileNotFoundException): -el tells trainnb to
    # *extract* the label index from the training vectors and write it to
    # the -li path. Without -el, trainnb expects ${WORK_DIR}/labelindex to
    # already exist on HDFS and fails with
    # "File does not exist: /tmp/mahout-work-.../labelindex".
    # ($c is intentionally unquoted: it is either empty or " -c" and must
    # word-split into a separate argument.)
    /usr/bin/mahout trainnb \
      -i "${WORK_DIR}/20news-train-vectors" \
      -el \
      -o "${WORK_DIR}/model" \
      -li "${WORK_DIR}/labelindex" \
      -ow $c
    echo "Self testing on training set"
    /usr/bin/mahout testnb \
      -i "${WORK_DIR}/20news-train-vectors" \
      -m "${WORK_DIR}/model" \
      -l "${WORK_DIR}/labelindex" \
      -ow -o "${WORK_DIR}/20news-testing" $c
    echo "Testing on holdout set"
    /usr/bin/mahout testnb \
      -i "${WORK_DIR}/20news-test-vectors" \
      -m "${WORK_DIR}/model" \
      -l "${WORK_DIR}/labelindex" \
      -ow -o "${WORK_DIR}/20news-testing" $c
  elif [ "x$alg" == "xnaivebayes-Spark" -o "x$alg" == "xcnaivebayes-Spark" ]; then
    echo "Training Naive Bayes model"
    /usr/bin/mahout spark-trainnb \
      -i "${WORK_DIR}/20news-train-vectors" \
      -o "${WORK_DIR}/spark-model" $c -ow -ma "$MASTER"
    echo "Self testing on training set"
    /usr/bin/mahout spark-testnb \
      -i "${WORK_DIR}/20news-train-vectors" \
      -m "${WORK_DIR}/spark-model" $c -ma "$MASTER"
    echo "Testing on holdout set"
    /usr/bin/mahout spark-testnb \
      -i "${WORK_DIR}/20news-test-vectors" \
      -m "${WORK_DIR}/spark-model" $c -ma "$MASTER"
  fi
elif [ "x$alg" == "xsgd" ]; then
  # SGD path: trains once, then reuses the cached model file.
  if [ ! -e "/tmp/news-group.model" ]; then
    echo "Training on ${WORK_DIR}/20news-bydate/20news-bydate-train/"
    /usr/bin/mahout org.apache.mahout.classifier.sgd.TrainNewsGroups \
      "${WORK_DIR}/20news-bydate/20news-bydate-train/"
  fi
  echo "Testing on ${WORK_DIR}/20news-bydate/20news-bydate-test/ with model: /tmp/news-group.model"
  /usr/bin/mahout org.apache.mahout.classifier.sgd.TestNewsGroups \
    --input "${WORK_DIR}/20news-bydate/20news-bydate-test/" --model /tmp/news-group.model
elif [ "x$alg" == "xclean" ]; then
  # Remove local work area, cached SGD model, and the HDFS work directory.
  rm -rf "$WORK_DIR"
  rm -rf /tmp/news-group.model
  $DFSRM "$WORK_DIR"
fi
# Remove the work directory
#
Error message:
$ /usr/bin/mahout trainnb \
> -i ${WORK_DIR}/20news-train-vectors \
> -o ${WORK_DIR}/model \
> -li ${WORK_DIR}/labelindex \
> -ow
MAHOUT_LOCAL is not set; adding HADOOP_CONF_DIR to classpath.
Running on hadoop, using /usr/lib/hadoop/bin/hadoop and
HADOOP_CONF_DIR=/etc/hadoop/conf
MAHOUT-JOB: /usr/lib/mahout/mahout-examples-0.9-cdh5.3.0-job.jar
15/04/27 16:41:27 WARN driver.MahoutDriver: No trainnb.props found on
classpath, will use command-line arguments only
15/04/27 16:41:28 INFO common.AbstractJob: Command line arguments:
{--alphaI=[1.0], --endPhase=[2147483647],
--input=[/tmp/mahout-work-cloudera/20news-train-vectors],
--labelIndex=[/tmp/mahout-work-cloudera/labelindex],
--output=[/tmp/mahout-work-cloudera/model], --overwrite=null,
--startPhase=[0], --tempDir=[temp]}
15/04/27 16:41:36 INFO common.HadoopUtil: Deleting temp
15/04/27 16:41:36 INFO Configuration.deprecation: mapred.input.dir is
deprecated. Instead, use mapreduce.input.fileinputformat.inputdir
15/04/27 16:41:36 INFO Configuration.deprecation:
mapred.compress.map.output is deprecated. Instead, use
mapreduce.map.output.compress
15/04/27 16:41:36 INFO Configuration.deprecation: mapred.output.dir is
deprecated. Instead, use mapreduce.output.fileoutputformat.outputdir
15/04/27 16:41:36 INFO client.RMProxy: Connecting to ResourceManager at /
0.0.0.0:8032
15/04/27 16:41:44 INFO mapreduce.JobSubmitter: Cleaning up the staging area
/tmp/hadoop-yarn/staging/cloudera/.staging/job_1430097605337_0028
15/04/27 16:41:44 WARN security.UserGroupInformation:
PriviledgedActionException as:cloudera (auth:SIMPLE)
cause:java.io.FileNotFoundException: File does not exist:
/tmp/mahout-work-cloudera/labelindex
Exception in thread "main" java.io.FileNotFoundException: File does not
exist: /tmp/mahout-work-cloudera/labelindex
at
org.apache.hadoop.hdfs.DistributedFileSystem$17.doCall(DistributedFileSystem.java:1093)
at
org.apache.hadoop.hdfs.DistributedFileSystem$17.doCall(DistributedFileSystem.java:1085)
at
org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
at
org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1085)
at
org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager.getFileStatus(ClientDistributedCacheManager.java:288)
at
org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager.getFileStatus(ClientDistributedCacheManager.java:224)
at
org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager.determineTimestamps(ClientDistributedCacheManager.java:93)
at
org.apache.hadoop.mapreduce.filecache.ClientDistributedCacheManager.determineTimestampsAndCacheVisibilities(ClientDistributedCacheManager.java:57)
at
org.apache.hadoop.mapreduce.JobSubmitter.copyAndConfigureFiles(JobSubmitter.java:267)
at
org.apache.hadoop.mapreduce.JobSubmitter.copyAndConfigureFiles(JobSubmitter.java:388)
at
org.apache.hadoop.mapreduce.JobSubmitter.submitJobInternal(JobSubmitter.java:481)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1295)
at org.apache.hadoop.mapreduce.Job$10.run(Job.java:1292)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:415)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1642)
at org.apache.hadoop.mapreduce.Job.submit(Job.java:1292)
at org.apache.hadoop.mapreduce.Job.waitForCompletion(Job.java:1313)
at
org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob.run(TrainNaiveBayesJob.java:114)
at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:70)
at
org.apache.mahout.classifier.naivebayes.training.TrainNaiveBayesJob.main(TrainNaiveBayesJob.java:64)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at
org.apache.hadoop.util.ProgramDriver$ProgramDescription.invoke(ProgramDriver.java:72)
at org.apache.hadoop.util.ProgramDriver.run(ProgramDriver.java:145)
at org.apache.hadoop.util.ProgramDriver.driver(ProgramDriver.java:153)
at org.apache.mahout.driver.MahoutDriver.main(MahoutDriver.java:195)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.hadoop.util.RunJar.main(RunJar.java:212)