You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by je...@apache.org on 2010/05/10 16:55:20 UTC

svn commit: r942766 - /lucene/mahout/trunk/examples/bin/build-reuters.sh

Author: jeastman
Date: Mon May 10 14:55:19 2010
New Revision: 942766

URL: http://svn.apache.org/viewvc?rev=942766&view=rev
Log:
Replacing obsolete build-reuters.sh with one that seems to be working

Modified:
    lucene/mahout/trunk/examples/bin/build-reuters.sh

Modified: lucene/mahout/trunk/examples/bin/build-reuters.sh
URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/bin/build-reuters.sh?rev=942766&r1=942765&r2=942766&view=diff
==============================================================================
--- lucene/mahout/trunk/examples/bin/build-reuters.sh (original)
+++ lucene/mahout/trunk/examples/bin/build-reuters.sh Mon May 10 14:55:19 2010
@@ -20,8 +20,9 @@
 #
 # To run:  change into the mahout/examples directory (the parent of the one containing this file) and type:
 #  bin/build-reuters.sh
-#
-#
+#!/bin/sh
+
+cd examples/bin/
 mkdir -p work
 if [ ! -e work/reuters-out ]; then
   if [ ! -e work/reuters-sgm ]; then
@@ -33,28 +34,10 @@ if [ ! -e work/reuters-out ]; then
     echo "Extracting..."
     cd work/reuters-sgm && tar xzf ../reuters21578.tar.gz && cd .. && cd ..
   fi
-  echo "Converting to plain text."
-  mvn -e -q exec:java  -Dexec.mainClass="org.apache.lucene.benchmark.utils.ExtractReuters" -Dexec.args="work/reuters-sgm work/reuters-out" || exit
-fi
-# Create index
-if [ ! -e work/index ]; then
-  echo "Creating index";
-  mvn -e exec:java -Dexec.classpathScope="test" -Dexec.mainClass="org.apache.lucene.benchmark.byTask.Benchmark" -Dexec.args="bin/lda.algorithm" || ( rm -rf work/index && exit )
-fi
-if [ ! -e work/vectors ]; then
-  echo "Creating vectors from index"
-  cd ../core
-  mvn -q install -DskipTests=true
-  cd ../utils/
-  mvn -q compile
-  mvn -e exec:java -Dexec.mainClass="org.apache.mahout.utils.vectors.lucene.Driver" \
-    -Dexec.args="--dir ../examples/work/index/ --field body --dictOut ../examples/work/dict.txt \
-    --output ../examples/work/vectors --minDF 100 --maxDFPercent 97" || exit
-  cd ../core/
 fi
-echo "Running LDA"
-rm -rf ../examples/work/lda
-MAVEN_OPTS="-Xmx2G -ea" mvn -e exec:java -Dexec.mainClass=org.apache.mahout.clustering.lda.LDADriver -Dexec.args="-i ../examples/work/vectors -o ../examples/work/lda/\
-  -k 20 -v 10000 --maxIter 40"
-echo "Writing top words for each topic to to examples/work/topics/"
-mvn -q exec:java -Dexec.mainClass="org.apache.mahout.clustering.lda.LDAPrintTopics" -Dexec.args="-i `ls -1dtr ../examples/work/lda/state-* | tail -1` -d ../examples/work/dict.txt -o ../examples/work/topics/ -w 100"
+
+cd ../..
+./bin/mahout org.apache.lucene.benchmark.utils.ExtractReuters ./examples/bin/work/reuters-sgm/ ./examples/bin/work/reuters-out/
+./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o ./examples/bin/work/reuters-out-seqdir -c UTF-8
+./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o ./examples/bin/work/reuters-out-seqdir-sparse
+./bin/mahout kmeans -i ./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors/ -c ./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -k 20 -w



Re: svn commit: r942766 - /lucene/mahout/trunk/examples/bin/build-reuters.sh

Posted by Robin Anil <ro...@gmail.com>.
Please remove the k-means step. Otherwise the script will launch clustering
as soon as the Reuters processing is done.


On Mon, May 10, 2010 at 8:25 PM, <je...@apache.org> wrote:

> Author: jeastman
> Date: Mon May 10 14:55:19 2010
> New Revision: 942766
>
> URL: http://svn.apache.org/viewvc?rev=942766&view=rev
> Log:
> Replacing obsolete build-reuters.sh with one that seems to be working
>
> Modified:
>    lucene/mahout/trunk/examples/bin/build-reuters.sh
>
> Modified: lucene/mahout/trunk/examples/bin/build-reuters.sh
> URL:
> http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/bin/build-reuters.sh?rev=942766&r1=942765&r2=942766&view=diff
>
> ==============================================================================
> --- lucene/mahout/trunk/examples/bin/build-reuters.sh (original)
> +++ lucene/mahout/trunk/examples/bin/build-reuters.sh Mon May 10 14:55:19
> 2010
> @@ -20,8 +20,9 @@
>  #
>  # To run:  change into the mahout/examples directory (the parent of the
> one containing this file) and type:
>  #  bin/build-reuters.sh
> -#
> -#
> +#!/bin/sh
> +
> +cd examples/bin/
>  mkdir -p work
>  if [ ! -e work/reuters-out ]; then
>   if [ ! -e work/reuters-sgm ]; then
> @@ -33,28 +34,10 @@ if [ ! -e work/reuters-out ]; then
>     echo "Extracting..."
>     cd work/reuters-sgm && tar xzf ../reuters21578.tar.gz && cd .. && cd ..
>   fi
> -  echo "Converting to plain text."
> -  mvn -e -q exec:java
>  -Dexec.mainClass="org.apache.lucene.benchmark.utils.ExtractReuters"
> -Dexec.args="work/reuters-sgm work/reuters-out" || exit
> -fi
> -# Create index
> -if [ ! -e work/index ]; then
> -  echo "Creating index";
> -  mvn -e exec:java -Dexec.classpathScope="test"
> -Dexec.mainClass="org.apache.lucene.benchmark.byTask.Benchmark"
> -Dexec.args="bin/lda.algorithm" || ( rm -rf work/index && exit )
> -fi
> -if [ ! -e work/vectors ]; then
> -  echo "Creating vectors from index"
> -  cd ../core
> -  mvn -q install -DskipTests=true
> -  cd ../utils/
> -  mvn -q compile
> -  mvn -e exec:java
> -Dexec.mainClass="org.apache.mahout.utils.vectors.lucene.Driver" \
> -    -Dexec.args="--dir ../examples/work/index/ --field body --dictOut
> ../examples/work/dict.txt \
> -    --output ../examples/work/vectors --minDF 100 --maxDFPercent 97" ||
> exit
> -  cd ../core/
>  fi
> -echo "Running LDA"
> -rm -rf ../examples/work/lda
> -MAVEN_OPTS="-Xmx2G -ea" mvn -e exec:java
> -Dexec.mainClass=org.apache.mahout.clustering.lda.LDADriver -Dexec.args="-i
> ../examples/work/vectors -o ../examples/work/lda/\
> -  -k 20 -v 10000 --maxIter 40"
> -echo "Writing top words for each topic to to examples/work/topics/"
> -mvn -q exec:java
> -Dexec.mainClass="org.apache.mahout.clustering.lda.LDAPrintTopics"
> -Dexec.args="-i `ls -1dtr ../examples/work/lda/state-* | tail -1` -d
> ../examples/work/dict.txt -o ../examples/work/topics/ -w 100"
> +
> +cd ../..
> +./bin/mahout org.apache.lucene.benchmark.utils.ExtractReuters
> ./examples/bin/work/reuters-sgm/ ./examples/bin/work/reuters-out/
> +./bin/mahout seqdirectory -i ./examples/bin/work/reuters-out/ -o
> ./examples/bin/work/reuters-out-seqdir -c UTF-8
> +./bin/mahout seq2sparse -i ./examples/bin/work/reuters-out-seqdir/ -o
> ./examples/bin/work/reuters-out-seqdir-sparse
> +./bin/mahout kmeans -i
> ./examples/bin/work/reuters-out-seqdir-sparse/tfidf/vectors/ -c
> ./examples/bin/work/clusters -o ./examples/bin/work/reuters-kmeans -k 20 -w
>
>
>