You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by dr...@apache.org on 2011/05/23 14:27:00 UTC

svn commit: r1126470 - /mahout/trunk/examples/bin/build-reuters.sh

Author: drew
Date: Mon May 23 12:27:00 2011
New Revision: 1126470

URL: http://svn.apache.org/viewvc?rev=1126470&view=rev
Log:
Fix for build-reuters.sh per MAHOUT-694

Modified:
    mahout/trunk/examples/bin/build-reuters.sh

Modified: mahout/trunk/examples/bin/build-reuters.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/build-reuters.sh?rev=1126470&r1=1126469&r2=1126470&view=diff
==============================================================================
--- mahout/trunk/examples/bin/build-reuters.sh (original)
+++ mahout/trunk/examples/bin/build-reuters.sh Mon May 23 12:27:00 2011
@@ -21,7 +21,18 @@
 #
 # To run:  change into the mahout directory and type:
 #  examples/bin/build-reuters.sh
-# or run from the examples/bin directory directly.
+
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then 
+  cd $SCRIPT_PATH
+fi
+
+MAHOUT="../../bin/mahout"
+
+if [ ! -e $MAHOUT ]; then
+  echo "Can't find mahout driver in $MAHOUT, cwd `pwd`, exiting.."
+  exit 1
+fi
 
 if [ "$1" = "-ni" ]; then
   clustertype=kmeans
@@ -37,60 +48,80 @@ else
   clustertype=${algorithm[$choice-1]} 
 fi
 
-cd examples/bin/
-mkdir -p work
-if [ ! -e work/reuters-out ]; then
-  if [ ! -e work/reuters-sgm ]; then
-    if [ ! -f work/reuters21578.tar.gz ]; then
-      echo "Downloading Reuters-21578"
-      curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz  -o work/reuters21578.tar.gz
+mkdir -p mahout-work
+
+if [ ! -e mahout-work/reuters-out-seqdir ]; then
+    if [ ! -e mahout-work/reuters-out ]; then
+	if [ ! -e mahout-work/reuters-sgm ]; then
+	    if [ ! -f mahout-work/reuters21578.tar.gz ]; then
+		echo "Downloading Reuters-21578"
+		curl http://kdd.ics.uci.edu/databases/reuters21578/reuters21578.tar.gz \
+                     -o mahout-work/reuters21578.tar.gz
+	    fi
+	    mkdir -p mahout-work/reuters-sgm
+	    echo "Extracting..."
+	    cd mahout-work/reuters-sgm && tar xzf ../reuters21578.tar.gz && cd .. && cd ..
+	fi
+	
+	$MAHOUT org.apache.lucene.benchmark.utils.ExtractReuters \
+	    mahout-work/reuters-sgm \
+	    mahout-work/reuters-out 
     fi
-    mkdir -p work/reuters-sgm
-    echo "Extracting..."
-    cd work/reuters-sgm && tar xzf ../reuters21578.tar.gz && cd .. && cd ..
-  fi
+
+    MAHOUT_LOCAL=true $MAHOUT seqdirectory \
+        -i mahout-work/reuters-out \
+        -o mahout-work/reuters-out-seqdir \
+        -c UTF-8 -chunk 5
 fi
-cd ../..
 
-./bin/mahout org.apache.lucene.benchmark.utils.ExtractReuters \
-  ./examples/bin/work/reuters-sgm/ \
-  ./examples/bin/work/reuters-out \
-&& \
-./bin/mahout seqdirectory \
-  -i ./examples/bin/work/reuters-out/ \
-  -o ./examples/bin/work/reuters-out-seqdir \
-  -c UTF-8 -chunk 5
+# We know reuters-out-seqdir exists on a local disk at
+# this point. If we're running in clustered mode,
+# copy it up to HDFS.
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+    HADOOP="$HADOOP_HOME/bin/hadoop"
+    if [ ! -e $HADOOP ]; then 
+	echo "Can't find hadoop in $HADOOP, exiting"
+	exit 1
+    fi
+
+    set +e
+    $HADOOP dfs -rmr \
+        mahout-work/reuters-out-seqdir
+    set -e
+    $HADOOP dfs -put \
+        mahout-work/reuters-out-seqdir \
+        mahout-work/reuters-out-seqdir
+fi
 
 if [ "x$clustertype" == "xkmeans" ]; then
-  ./bin/mahout seq2sparse \
-    -i ./examples/bin/work/reuters-out-seqdir/ \
-    -o ./examples/bin/work/reuters-out-seqdir-sparse \
+  $MAHOUT seq2sparse \
+    -i mahout-work/reuters-out-seqdir/ \
+    -o mahout-work/reuters-out-seqdir-sparse-kmeans \
   && \
-  ./bin/mahout kmeans \
-    -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf-vectors/ \
-    -c ./examples/bin/work/clusters \
-    -o ./examples/bin/work/reuters-kmeans \
+  $MAHOUT kmeans \
+    -i mahout-work/reuters-out-seqdir-sparse-kmeans/tfidf-vectors/ \
+    -c mahout-work/reuters-kmeans-clusters \
+    -o mahout-work/reuters-kmeans \
     -x 10 -k 20 -ow \
   && \
-  ./bin/mahout clusterdump \
-    -s examples/bin/work/reuters-kmeans/clusters-10 \
-    -d examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 \
+  $MAHOUT clusterdump \
+    -s mahout-work/reuters-kmeans/clusters-10 \
+    -d mahout-work/reuters-out-seqdir-sparse-kmeans/dictionary.file-0 \
     -dt sequencefile -b 100 -n 20
 elif [ "x$clustertype" == "xlda" ]; then
-  ./bin/mahout seq2sparse \
-    -i ./examples/bin/work/reuters-out-seqdir/ \
-    -o ./examples/bin/work/reuters-out-seqdir-sparse \
+  $MAHOUT seq2sparse \
+    -i mahout-work/reuters-out-seqdir/ \
+    -o mahout-work/reuters-out-seqdir-sparse-lda \
     -wt tf -seq -nr 3 \
   && \
-  ./bin/mahout lda \
-    -i ./examples/bin/work/reuters-out-seqdir-sparse/tf-vectors \
-    -o ./examples/bin/work/reuters-lda -k 20 -v 50000 -ow -x 20 \
+  $MAHOUT lda \
+    -i mahout-work/reuters-out-seqdir-sparse-lda/tf-vectors \
+    -o mahout-work/reuters-lda -k 20 -v 50000 -ow -x 20 \
   && \
-  ./bin/mahout ldatopics \
-    -i ./examples/bin/work/reuters-lda/state-20 \
-    -d ./examples/bin/work/reuters-out-seqdir-sparse/dictionary.file-0 \
+  $MAHOUT ldatopics \
+    -i mahout-work/reuters-lda/state-20 \
+    -d mahout-work/reuters-out-seqdir-sparse-lda/dictionary.file-0 \
     -dt sequencefile
 else 
   echo "unknown cluster type: $clustertype";
 fi 
-