You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2015/03/30 22:22:11 UTC
mahout git commit: MAHOUT-1559: Clean up wikipedia classifier example
closes apache/mahout#90
Repository: mahout
Updated Branches:
refs/heads/master 91c1626df -> d5d8de185
MAHOUT-1559: Clean up wikipedia classifier example closes apache/mahout#90
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/d5d8de18
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/d5d8de18
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/d5d8de18
Branch: refs/heads/master
Commit: d5d8de1857d60b7b53b9baf0af6e7aea26bbde19
Parents: 91c1626
Author: Andrew Palumbo <ap...@apache.org>
Authored: Mon Mar 30 16:21:25 2015 -0400
Committer: Andrew Palumbo <ap...@apache.org>
Committed: Mon Mar 30 16:21:25 2015 -0400
----------------------------------------------------------------------
CHANGELOG | 2 +
examples/bin/classify-wiki.sh | 153 -------------------------
examples/bin/classify-wikipedia.sh | 194 ++++++++++++++++++++++++++++++++
3 files changed, 196 insertions(+), 153 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mahout/blob/d5d8de18/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index 2660638..f4c31fa 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,8 @@ Mahout Change Log
Release 0.10.0 - unreleased
+ MAHOUT-1559: Add documentation for and clean up the wikipedia classifier example (apalumbo)
+
MAHOUT-1598: extend seq2sparse to handle multiple text blocks of same document (Wolfgang Buchnere via akm)
MAHOUT-1659: Remove deprecated Lanczos solver from spectral clustering in mr-legacy (Shannon Quinn)
http://git-wip-us.apache.org/repos/asf/mahout/blob/d5d8de18/examples/bin/classify-wiki.sh
----------------------------------------------------------------------
diff --git a/examples/bin/classify-wiki.sh b/examples/bin/classify-wiki.sh
deleted file mode 100755
index 596dd7b..0000000
--- a/examples/bin/classify-wiki.sh
+++ /dev/null
@@ -1,153 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# Downloads a (partial) wikipedia dump, trains and tests a classifier.
-#
-# To run: change into the mahout directory and type:
-# examples/bin/classify-wiki.sh
-
-if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
- echo "This script Bayes and CBayes classifiers over the last wikipedia dump."
- exit
-fi
-
-# ensure that MAHOUT_HOME is set
-if [[ -z "$MAHOUT_HOME" ]]; then
- echo "Please set MAHOUT_HOME."
- exit
-fi
-
-SCRIPT_PATH=${0%/*}
-if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
- cd $SCRIPT_PATH
-fi
-START_PATH=`pwd`
-
-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
- HADOOP="$HADOOP_HOME/bin/hadoop"
- if [ ! -e $HADOOP ]; then
- echo "Can't find hadoop in $HADOOP, exiting"
- exit 1
- fi
-fi
-
-WORK_DIR=/tmp/mahout-work-${USER}
-algorithm=( CBayes BinaryCBayes clean)
-if [ -n "$1" ]; then
- choice=$1
-else
- echo "Please select a number to choose the corresponding task to run"
- echo "1. ${algorithm[0]}"
- echo "2. ${algorithm[1]}"
- echo "3. ${algorithm[2]} -- cleans up the work area in $WORK_DIR"
- read -p "Enter your choice : " choice
-fi
-
-echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
-alg=${algorithm[$choice-1]}
-
-if [ "x$alg" != "xclean" ]; then
- echo "creating work directory at ${WORK_DIR}"
-
- mkdir -p ${WORK_DIR}
- if [ ! -e ${WORK_DIR}/wikixml ]; then
- mkdir -p ${WORK_DIR}/wikixml
- echo "Downloading wikipedia XML dump"
- ########################################################
- # Datasets: uncomment and run "clean" to change dataset
- ########################################################
- ########## partial small 42.5M zipped
- #curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
- ########## partial larger 256M zipped
- curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles10.xml-p000925001p001325000.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
- ######### full wikipedia dump: 10G zipped
- #curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
- ########################################################
-
- echo "Extracting..."
-
- cd ${WORK_DIR}/wikixml && bunzip2 enwiki-latest-pages-articles.xml.bz2 && cd .. && cd ..
- fi
-
-echo $START_PATH
-
-set -e
-
-if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then
-
- set -x
- echo "Preparing wikipedia data"
- rm -rf ${WORK_DIR}/wiki
- mkdir ${WORK_DIR}/wiki
-
- if [ "x$alg" == "xCBayes" ] ; then
- # use a list of 10 countries as categories
- cp $MAHOUT_HOME/examples/src/test/resources/country10.txt ${WORK_DIR}/country.txt
- chmod 666 ${WORK_DIR}/country.txt
- fi
-
- if [ "x$alg" == "xBinaryCBayes" ] ; then
- # use United States and United Kingdom as categories
- cp $MAHOUT_HOME/examples/src/test/resources/country2.txt ${WORK_DIR}/country.txt
- chmod 666 ${WORK_DIR}/country.txt
- fi
-
- if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
- echo "Copying wikipedia data to HDFS"
- set +e
- $HADOOP dfs -rmr ${WORK_DIR}/wikixml
- set -e
- $HADOOP dfs -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml
- fi
-
- echo "Creating sequence files from wikiXML"
- $MAHOUT_HOME/bin/mahout seqwiki -c ${WORK_DIR}/country.txt -i ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml -o ${WORK_DIR}/wikipediainput
-
- # if using the 10 class problem use bigrams
- if [ "x$alg" == "xCBayes" ] ; then
- echo "Converting sequence files to vectors using bigrams"
- $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput -o ${WORK_DIR}/wikipediaVecs -wt tfidf -lnorm -nv -ow -ng 2
- fi
-
- # if using the 2 class problem try different options
- if [ "x$alg" == "xBinaryCBayes" ] ; then
- echo "Converting sequence files to vectors using 4-grams and a max Document Frequenct of 30"
- $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput -o ${WORK_DIR}/wikipediaVecs -wt tfidf -lnorm -nv -ow -ng 4 -x 30
- fi
-
- echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
- $MAHOUT_HOME/bin/mahout split -i ${WORK_DIR}/wikipediaVecs/tfidf-vectors/ --trainingOutput ${WORK_DIR}/training --testOutput ${WORK_DIR}/testing -rp 20 -ow -seq -xm sequential
-
- echo "Training Naive Bayes model"
- $MAHOUT_HOME/bin/mahout trainnb -i ${WORK_DIR}/training -el -o ${WORK_DIR}/model -li ${WORK_DIR}/labelindex -ow -c
-
- echo "Self testing on training set"
- $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/training -m ${WORK_DIR}/model -l ${WORK_DIR}/labelindex -ow -o ${WORK_DIR}/output -c
-
- echo "Testing on holdout set: Bayes"
- $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing -m ${WORK_DIR}/model -l ${WORK_DIR}/labelindex -ow -o ${WORK_DIR}/output -seq
-
- echo "Testing on holdout set: CBayes"
- $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing -m ${WORK_DIR}/model -l ${WORK_DIR}/labelindex -ow -o ${WORK_DIR}/output -c -seq
-fi
-
-elif [ "x$alg" == "xclean" ]; then
- rm -rf ${WORK_DIR}
-fi
-# Remove the work directory
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/mahout/blob/d5d8de18/examples/bin/classify-wikipedia.sh
----------------------------------------------------------------------
diff --git a/examples/bin/classify-wikipedia.sh b/examples/bin/classify-wikipedia.sh
new file mode 100755
index 0000000..0bdb9a2
--- /dev/null
+++ b/examples/bin/classify-wikipedia.sh
@@ -0,0 +1,194 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Downloads a (partial) wikipedia dump, trains and tests a classifier.
+#
+# To run: change into the mahout directory and type:
+# examples/bin/classify-wikipedia.sh
+
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+ echo "This script runs the Bayes and CBayes classifiers over the latest wikipedia dump."
+ exit
+fi
+
+# ensure that MAHOUT_HOME is set
+if [[ -z "$MAHOUT_HOME" ]]; then
+ echo "Please set MAHOUT_HOME."
+ exit
+fi
+
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
+ cd $SCRIPT_PATH
+fi
+START_PATH=`pwd`
+
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+ HADOOP="$HADOOP_HOME/bin/hadoop"
+ if [ ! -e $HADOOP ]; then
+ echo "Can't find hadoop in $HADOOP, exiting"
+ exit 1
+ fi
+fi
+
+WORK_DIR=/tmp/mahout-work-wiki
+algorithm=( CBayes BinaryCBayes clean)
+if [ -n "$1" ]; then
+ choice=$1
+else
+ echo "Please select a number to choose the corresponding task to run"
+ echo "1. ${algorithm[0]}"
+ echo "2. ${algorithm[1]}"
+ echo "3. ${algorithm[2]} -- cleans up the work area in $WORK_DIR"
+ read -p "Enter your choice : " choice
+fi
+
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
+alg=${algorithm[$choice-1]}
+
+if [ "x$alg" != "xclean" ]; then
+ echo "creating work directory at ${WORK_DIR}"
+
+ mkdir -p ${WORK_DIR}
+ if [ ! -e ${WORK_DIR}/wikixml ]; then
+ mkdir -p ${WORK_DIR}/wikixml
+ echo "Downloading wikipedia XML dump"
+ ########################################################
+ # Datasets: uncomment and run "clean" to change dataset
+ ########################################################
+ ########## partial small 42.5M zipped
+ #curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+ ########## partial larger 256M zipped
+ curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles10.xml-p000925001p001325000.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+ ######### full wikipedia dump: 10G zipped
+ #curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+ ########################################################
+
+ echo "Extracting..."
+
+ cd ${WORK_DIR}/wikixml && bunzip2 enwiki-latest-pages-articles.xml.bz2 && cd .. && cd ..
+ fi
+
+echo $START_PATH
+
+set -e
+
+if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then
+
+ set -x
+ echo "Preparing wikipedia data"
+ rm -rf ${WORK_DIR}/wiki
+ mkdir ${WORK_DIR}/wiki
+
+ if [ "x$alg" == "xCBayes" ] ; then
+ # use a list of 10 countries as categories
+ cp $MAHOUT_HOME/examples/src/test/resources/country10.txt ${WORK_DIR}/country.txt
+ chmod 666 ${WORK_DIR}/country.txt
+ fi
+
+ if [ "x$alg" == "xBinaryCBayes" ] ; then
+ # use United States and United Kingdom as categories
+ cp $MAHOUT_HOME/examples/src/test/resources/country2.txt ${WORK_DIR}/country.txt
+ chmod 666 ${WORK_DIR}/country.txt
+ fi
+
+ if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+ echo "Copying wikipedia data to HDFS"
+ set +e
+ $HADOOP dfs -rmr ${WORK_DIR}/wikixml
+ $HADOOP dfs -mkdir ${WORK_DIR}
+ set -e
+ $HADOOP dfs -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml
+ fi
+
+ echo "Creating sequence files from wikiXML"
+ $MAHOUT_HOME/bin/mahout seqwiki -c ${WORK_DIR}/country.txt \
+ -i ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml \
+ -o ${WORK_DIR}/wikipediainput
+
+ # if using the 10 class problem use bigrams
+ if [ "x$alg" == "xCBayes" ] ; then
+ echo "Converting sequence files to vectors using bigrams"
+ $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \
+ -o ${WORK_DIR}/wikipediaVecs \
+ -wt tfidf \
+ -lnorm -nv \
+ -ow -ng 2
+ fi
+
+ # if using the 2 class problem try different options
+ if [ "x$alg" == "xBinaryCBayes" ] ; then
+ echo "Converting sequence files to vectors using unigrams and a max document frequency of 30%"
+ $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \
+ -o ${WORK_DIR}/wikipediaVecs \
+ -wt tfidf \
+ -lnorm \
+ -nv \
+ -ow \
+ -ng 1 \
+ -x 30
+ fi
+
+ echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
+ $MAHOUT_HOME/bin/mahout split -i ${WORK_DIR}/wikipediaVecs/tfidf-vectors/ \
+ --trainingOutput ${WORK_DIR}/training \
+ --testOutput ${WORK_DIR}/testing \
+ -rp 20 \
+ -ow \
+ -seq \
+ -xm sequential
+
+ echo "Training Naive Bayes model"
+ $MAHOUT_HOME/bin/mahout trainnb -i ${WORK_DIR}/training \
+ -el \
+ -o ${WORK_DIR}/model \
+ -li ${WORK_DIR}/labelindex \
+ -ow \
+ -c
+
+ echo "Self testing on training set"
+ $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/training \
+ -m ${WORK_DIR}/model \
+ -l ${WORK_DIR}/labelindex \
+ -ow \
+ -o ${WORK_DIR}/output \
+ -c
+
+ echo "Testing on holdout set: Bayes"
+ $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \
+ -m ${WORK_DIR}/model \
+ -l ${WORK_DIR}/labelindex \
+ -ow \
+ -o ${WORK_DIR}/output \
+ -seq
+
+ echo "Testing on holdout set: CBayes"
+ $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \
+ -m ${WORK_DIR}/model -l \
+ ${WORK_DIR}/labelindex \
+ -ow \
+ -o ${WORK_DIR}/output \
+ -c \
+ -seq
+fi
+
+elif [ "x$alg" == "xclean" ]; then
+ rm -rf ${WORK_DIR}
+fi
+# Remove the work directory
\ No newline at end of file