You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by sm...@apache.org on 2013/11/28 00:27:11 UTC

svn commit: r1546232 - /mahout/trunk/examples/bin/cluster-reuters.sh

Author: smarthi
Date: Wed Nov 27 23:27:11 2013
New Revision: 1546232

URL: http://svn.apache.org/r1546232
Log:
MAHOUT-1347: Add Streaming K-Means clustering algorithm to examples/bin/cluster-reuters.sh

Modified:
    mahout/trunk/examples/bin/cluster-reuters.sh

Modified: mahout/trunk/examples/bin/cluster-reuters.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/cluster-reuters.sh?rev=1546232&r1=1546231&r2=1546232&view=diff
==============================================================================
--- mahout/trunk/examples/bin/cluster-reuters.sh (original)
+++ mahout/trunk/examples/bin/cluster-reuters.sh Wed Nov 27 23:27:11 2013
@@ -39,14 +39,15 @@ if [ ! -e $MAHOUT ]; then
   exit 1
 fi
 
-algorithm=( kmeans fuzzykmeans lda)
+algorithm=( kmeans fuzzykmeans lda streamingkmeans)
 if [ -n "$1" ]; then
   choice=$1
 else
   echo "Please select a number to choose the corresponding clustering algorithm"
   echo "1. ${algorithm[0]} clustering"
   echo "2. ${algorithm[1]} clustering"
-  echo "3. ${algorithm[2]} clustering" 
+  echo "3. ${algorithm[2]} clustering"
+  echo "4. ${algorithm[3]} clustering"
   read -p "Enter your choice : " choice
 fi
 
@@ -169,6 +170,23 @@ elif [ "x$clustertype" == "xlda" ]; then
     -dt sequencefile -sort ${WORK_DIR}/reuters-lda-topics/part-m-00000 \
     && \
   cat ${WORK_DIR}/reuters-lda/vectordump
+elif [ "x$clustertype" == "xstreamingkmeans" ]; then
+  $MAHOUT seq2sparse \
+    -i ${WORK_DIR}/reuters-out-seqdir/ \
+    -o ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans -ow --maxDFPercent 85 --namedVector \
+  && \
+  rm -rf ${WORK_DIR}/reuters-streamingkmeans \
+  && \
+  $MAHOUT streamingkmeans \
+    -i ${WORK_DIR}/reuters-out-seqdir-sparse-streamingkmeans/tfidf-vectors/ \
+    --tempDir ${WORK_DIR}/tmp \
+    -o ${WORK_DIR}/reuters-streamingkmeans \
+    -sc org.apache.mahout.math.neighborhood.FastProjectionSearch \
+    -dm org.apache.mahout.common.distance.SquaredEuclideanDistanceMeasure \
+    -k 20 -km 200 -rskm \
+  && \
+  $MAHOUT seqdumper \
+    -i ${WORK_DIR}/reuters-streamingkmeans
 else 
   echo "unknown cluster type: $clustertype"
 fi