You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sm...@apache.org on 2013/06/08 18:27:34 UTC
svn commit: r1491007 - /mahout/trunk/examples/bin/cluster-reuters.sh

Author: smarthi
Date: Sat Jun  8 16:27:34 2013
New Revision: 1491007

URL: http://svn.apache.org/r1491007
Log:
MAHOUT-1026: Add LDA (CVB implementation) to the cluster-reuters.sh example script

Modified:
    mahout/trunk/examples/bin/cluster-reuters.sh

Modified: mahout/trunk/examples/bin/cluster-reuters.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/cluster-reuters.sh?rev=1491007&r1=1491006&r2=1491007&view=diff
==============================================================================
--- mahout/trunk/examples/bin/cluster-reuters.sh (original)
+++ mahout/trunk/examples/bin/cluster-reuters.sh Sat Jun  8 16:27:34 2013
@@ -39,7 +39,7 @@ if [ ! -e $MAHOUT ]; then
   exit 1
 fi
 
-algorithm=( kmeans fuzzykmeans dirichlet minhash)
+algorithm=( kmeans fuzzykmeans dirichlet lda minhash)
 if [ -n "$1" ]; then
   choice=$1
 else
@@ -47,7 +47,8 @@ else
   echo "1. ${algorithm[0]} clustering"
   echo "2. ${algorithm[1]} clustering"
   echo "3. ${algorithm[2]} clustering"
-  echo "4. ${algorithm[3]} clustering"
+  echo "4. ${algorithm[3]} clustering" 
+  echo "5. ${algorithm[4]} clustering"
   read -p "Enter your choice : " choice
 fi
 
@@ -159,6 +160,32 @@ elif [ "x$clustertype" == "xdirichlet" ]
     -dt sequencefile -b 100 -n 20 -sp 0 \
     && \
   cat ${WORK_DIR}/reuters-dirichlet/clusterdump
+elif [ "x$clustertype" == "xlda" ]; then
+  $MAHOUT seq2sparse \
+    -i ${WORK_DIR}/reuters-out-seqdir/ \
+    -o ${WORK_DIR}/reuters-out-seqdir-sparse-lda -ow --maxDFPercent 85 --namedVector \
+  && \
+  $MAHOUT rowid \
+    -i ${WORK_DIR}/reuters-out-seqdir-sparse-lda/tfidf-vectors \
+    -o ${WORK_DIR}/reuters-out-matrix \
+  && \
+  rm -rf ${WORK_DIR}/reuters-lda ${WORK_DIR}/reuters-lda-topics ${WORK_DIR}/reuters-lda-model \
+  && \
+  $MAHOUT cvb \
+    -i ${WORK_DIR}/reuters-out-matrix/matrix \
+    -o ${WORK_DIR}/reuters-lda -k 20 -ow -x 20 \
+    -dict ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
+    -dt ${WORK_DIR}/reuters-lda-topics \
+    -mt ${WORK_DIR}/reuters-lda-model \
+  && \
+  $MAHOUT vectordump \
+    -i ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
+    -o ${WORK_DIR}/reuters-lda/vectordump \
+    -vs 10 -p true \
+    -d ${WORK_DIR}/reuters-out-seqdir-sparse-lda/dictionary.file-* \
+    -dt sequencefile -sort ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
+    && \
+  cat ${WORK_DIR}/reuters-lda/vectordump
 elif [ "x$clustertype" == "xminhash" ]; then
   $MAHOUT seq2sparse \
     -i ${WORK_DIR}/reuters-out-seqdir/ \