You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2015/03/30 22:22:11 UTC
mahout git commit: MAHOUT-1559: Clean up wikipedia classifier example
closes apache/mahout#90
Repository: mahout
Updated Branches:
refs/heads/master 91c1626df -> d5d8de185
MAHOUT-1559: Clean up wikipedia classifier example closes apache/mahout#90
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/d5d8de18
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/d5d8de18
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/d5d8de18
Branch: refs/heads/master
Commit: d5d8de1857d60b7b53b9baf0af6e7aea26bbde19
Parents: 91c1626
Author: Andrew Palumbo <ap...@apache.org>
Authored: Mon Mar 30 16:21:25 2015 -0400
Committer: Andrew Palumbo <ap...@apache.org>
Committed: Mon Mar 30 16:21:25 2015 -0400
----------------------------------------------------------------------
CHANGELOG | 2 +
examples/bin/classify-wiki.sh | 153 -------------------------
examples/bin/classify-wikipedia.sh | 194 ++++++++++++++++++++++++++++++++
3 files changed, 196 insertions(+), 153 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mahout/blob/d5d8de18/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index 2660638..f4c31fa 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,8 @@ Mahout Change Log
Release 0.10.0 - unreleased
+ MAHOUT-1559: Add documentation for and clean up the wikipedia classifier example (apalumbo)
+
MAHOUT-1598: extend seq2sparse to handle multiple text blocks of same document (Wolfgang Buchnere via akm)
MAHOUT-1659: Remove deprecated Lanczos solver from spectral clustering in mr-legacy (Shannon Quinn)
http://git-wip-us.apache.org/repos/asf/mahout/blob/d5d8de18/examples/bin/classify-wiki.sh
----------------------------------------------------------------------
diff --git a/examples/bin/classify-wiki.sh b/examples/bin/classify-wiki.sh
deleted file mode 100755
index 596dd7b..0000000
--- a/examples/bin/classify-wiki.sh
+++ /dev/null
@@ -1,153 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-#
-# Downloads a (partial) wikipedia dump, trains and tests a classifier.
-#
-# To run: change into the mahout directory and type:
-# examples/bin/classify-wiki.sh
-
-if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
- echo "This script Bayes and CBayes classifiers over the last wikipedia dump."
- exit
-fi
-
-# ensure that MAHOUT_HOME is set
-if [[ -z "$MAHOUT_HOME" ]]; then
- echo "Please set MAHOUT_HOME."
- exit
-fi
-
-SCRIPT_PATH=${0%/*}
-if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
- cd $SCRIPT_PATH
-fi
-START_PATH=`pwd`
-
-if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
- HADOOP="$HADOOP_HOME/bin/hadoop"
- if [ ! -e $HADOOP ]; then
- echo "Can't find hadoop in $HADOOP, exiting"
- exit 1
- fi
-fi
-
-WORK_DIR=/tmp/mahout-work-${USER}
-algorithm=( CBayes BinaryCBayes clean)
-if [ -n "$1" ]; then
- choice=$1
-else
- echo "Please select a number to choose the corresponding task to run"
- echo "1. ${algorithm[0]}"
- echo "2. ${algorithm[1]}"
- echo "3. ${algorithm[2]} -- cleans up the work area in $WORK_DIR"
- read -p "Enter your choice : " choice
-fi
-
-echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
-alg=${algorithm[$choice-1]}
-
-if [ "x$alg" != "xclean" ]; then
- echo "creating work directory at ${WORK_DIR}"
-
- mkdir -p ${WORK_DIR}
- if [ ! -e ${WORK_DIR}/wikixml ]; then
- mkdir -p ${WORK_DIR}/wikixml
- echo "Downloading wikipedia XML dump"
- ########################################################
- # Datasets: uncomment and run "clean" to change dataset
- ########################################################
- ########## partial small 42.5M zipped
- #curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
- ########## partial larger 256M zipped
- curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles10.xml-p000925001p001325000.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
- ######### full wikipedia dump: 10G zipped
- #curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
- ########################################################
-
- echo "Extracting..."
-
- cd ${WORK_DIR}/wikixml && bunzip2 enwiki-latest-pages-articles.xml.bz2 && cd .. && cd ..
- fi
-
-echo $START_PATH
-
-set -e
-
-if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then
-
- set -x
- echo "Preparing wikipedia data"
- rm -rf ${WORK_DIR}/wiki
- mkdir ${WORK_DIR}/wiki
-
- if [ "x$alg" == "xCBayes" ] ; then
- # use a list of 10 countries as categories
- cp $MAHOUT_HOME/examples/src/test/resources/country10.txt ${WORK_DIR}/country.txt
- chmod 666 ${WORK_DIR}/country.txt
- fi
-
- if [ "x$alg" == "xBinaryCBayes" ] ; then
- # use United States and United Kingdom as categories
- cp $MAHOUT_HOME/examples/src/test/resources/country2.txt ${WORK_DIR}/country.txt
- chmod 666 ${WORK_DIR}/country.txt
- fi
-
- if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
- echo "Copying wikipedia data to HDFS"
- set +e
- $HADOOP dfs -rmr ${WORK_DIR}/wikixml
- set -e
- $HADOOP dfs -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml
- fi
-
- echo "Creating sequence files from wikiXML"
- $MAHOUT_HOME/bin/mahout seqwiki -c ${WORK_DIR}/country.txt -i ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml -o ${WORK_DIR}/wikipediainput
-
- # if using the 10 class problem use bigrams
- if [ "x$alg" == "xCBayes" ] ; then
- echo "Converting sequence files to vectors using bigrams"
- $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput -o ${WORK_DIR}/wikipediaVecs -wt tfidf -lnorm -nv -ow -ng 2
- fi
-
- # if using the 2 class problem try different options
- if [ "x$alg" == "xBinaryCBayes" ] ; then
- echo "Converting sequence files to vectors using 4-grams and a max Document Frequenct of 30"
- $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput -o ${WORK_DIR}/wikipediaVecs -wt tfidf -lnorm -nv -ow -ng 4 -x 30
- fi
-
- echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
- $MAHOUT_HOME/bin/mahout split -i ${WORK_DIR}/wikipediaVecs/tfidf-vectors/ --trainingOutput ${WORK_DIR}/training --testOutput ${WORK_DIR}/testing -rp 20 -ow -seq -xm sequential
-
- echo "Training Naive Bayes model"
- $MAHOUT_HOME/bin/mahout trainnb -i ${WORK_DIR}/training -el -o ${WORK_DIR}/model -li ${WORK_DIR}/labelindex -ow -c
-
- echo "Self testing on training set"
- $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/training -m ${WORK_DIR}/model -l ${WORK_DIR}/labelindex -ow -o ${WORK_DIR}/output -c
-
- echo "Testing on holdout set: Bayes"
- $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing -m ${WORK_DIR}/model -l ${WORK_DIR}/labelindex -ow -o ${WORK_DIR}/output -seq
-
- echo "Testing on holdout set: CBayes"
- $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing -m ${WORK_DIR}/model -l ${WORK_DIR}/labelindex -ow -o ${WORK_DIR}/output -c -seq
-fi
-
-elif [ "x$alg" == "xclean" ]; then
- rm -rf ${WORK_DIR}
-fi
-# Remove the work directory
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/mahout/blob/d5d8de18/examples/bin/classify-wikipedia.sh
----------------------------------------------------------------------
diff --git a/examples/bin/classify-wikipedia.sh b/examples/bin/classify-wikipedia.sh
new file mode 100755
index 0000000..0bdb9a2
--- /dev/null
+++ b/examples/bin/classify-wikipedia.sh
@@ -0,0 +1,194 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Downloads a (partial) wikipedia dump, trains and tests a classifier.
+#
+# To run: change into the mahout directory and type:
+# examples/bin/classify-wikipedia.sh
+
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+ echo "This script runs the Bayes and CBayes classifiers over the latest wikipedia dump."
+ exit
+fi
+
+# ensure that MAHOUT_HOME is set
+if [[ -z "$MAHOUT_HOME" ]]; then
+ echo "Please set MAHOUT_HOME."
+ exit
+fi
+
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
+ cd $SCRIPT_PATH
+fi
+START_PATH=`pwd`
+
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+ HADOOP="$HADOOP_HOME/bin/hadoop"
+ if [ ! -e $HADOOP ]; then
+ echo "Can't find hadoop in $HADOOP, exiting"
+ exit 1
+ fi
+fi
+
+WORK_DIR=/tmp/mahout-work-wiki
+algorithm=( CBayes BinaryCBayes clean)
+if [ -n "$1" ]; then
+ choice=$1
+else
+ echo "Please select a number to choose the corresponding task to run"
+ echo "1. ${algorithm[0]}"
+ echo "2. ${algorithm[1]}"
+ echo "3. ${algorithm[2]} -- cleans up the work area in $WORK_DIR"
+ read -p "Enter your choice : " choice
+fi
+
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
+alg=${algorithm[$choice-1]}
+
+if [ "x$alg" != "xclean" ]; then
+ echo "creating work directory at ${WORK_DIR}"
+
+ mkdir -p ${WORK_DIR}
+ if [ ! -e ${WORK_DIR}/wikixml ]; then
+ mkdir -p ${WORK_DIR}/wikixml
+ echo "Downloading wikipedia XML dump"
+ ########################################################
+ # Datasets: uncomment and run "clean" to change dataset
+ ########################################################
+ ########## partial small 42.5M zipped
+ #curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+ ########## partial larger 256M zipped
+ curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles10.xml-p000925001p001325000.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+ ######### full wikipedia dump: 10G zipped
+ #curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+ ########################################################
+
+ echo "Extracting..."
+
+ cd ${WORK_DIR}/wikixml && bunzip2 enwiki-latest-pages-articles.xml.bz2 && cd .. && cd ..
+ fi
+
+echo $START_PATH
+
+set -e
+
+if [ "x$alg" == "xCBayes" ] || [ "x$alg" == "xBinaryCBayes" ] ; then
+
+ set -x
+ echo "Preparing wikipedia data"
+ rm -rf ${WORK_DIR}/wiki
+ mkdir ${WORK_DIR}/wiki
+
+ if [ "x$alg" == "xCBayes" ] ; then
+ # use a list of 10 countries as categories
+ cp $MAHOUT_HOME/examples/src/test/resources/country10.txt ${WORK_DIR}/country.txt
+ chmod 666 ${WORK_DIR}/country.txt
+ fi
+
+ if [ "x$alg" == "xBinaryCBayes" ] ; then
+ # use United States and United Kingdom as categories
+ cp $MAHOUT_HOME/examples/src/test/resources/country2.txt ${WORK_DIR}/country.txt
+ chmod 666 ${WORK_DIR}/country.txt
+ fi
+
+ if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+ echo "Copying wikipedia data to HDFS"
+ set +e
+ $HADOOP dfs -rmr ${WORK_DIR}/wikixml
+ $HADOOP dfs -mkdir ${WORK_DIR}
+ set -e
+ $HADOOP dfs -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml
+ fi
+
+ echo "Creating sequence files from wikiXML"
+ $MAHOUT_HOME/bin/mahout seqwiki -c ${WORK_DIR}/country.txt \
+ -i ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml \
+ -o ${WORK_DIR}/wikipediainput
+
+ # if using the 10 class problem use bigrams
+ if [ "x$alg" == "xCBayes" ] ; then
+ echo "Converting sequence files to vectors using bigrams"
+ $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \
+ -o ${WORK_DIR}/wikipediaVecs \
+ -wt tfidf \
+ -lnorm -nv \
+ -ow -ng 2
+ fi
+
+ # if using the 2 class problem try different options
+ if [ "x$alg" == "xBinaryCBayes" ] ; then
+ echo "Converting sequence files to vectors using unigrams and a max document frequency of 30%"
+ $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput \
+ -o ${WORK_DIR}/wikipediaVecs \
+ -wt tfidf \
+ -lnorm \
+ -nv \
+ -ow \
+ -ng 1 \
+ -x 30
+ fi
+
+ echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
+ $MAHOUT_HOME/bin/mahout split -i ${WORK_DIR}/wikipediaVecs/tfidf-vectors/ \
+ --trainingOutput ${WORK_DIR}/training \
+ --testOutput ${WORK_DIR}/testing \
+ -rp 20 \
+ -ow \
+ -seq \
+ -xm sequential
+
+ echo "Training Naive Bayes model"
+ $MAHOUT_HOME/bin/mahout trainnb -i ${WORK_DIR}/training \
+ -el \
+ -o ${WORK_DIR}/model \
+ -li ${WORK_DIR}/labelindex \
+ -ow \
+ -c
+
+ echo "Self testing on training set"
+ $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/training \
+ -m ${WORK_DIR}/model \
+ -l ${WORK_DIR}/labelindex \
+ -ow \
+ -o ${WORK_DIR}/output \
+ -c
+
+ echo "Testing on holdout set: Bayes"
+ $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \
+ -m ${WORK_DIR}/model \
+ -l ${WORK_DIR}/labelindex \
+ -ow \
+ -o ${WORK_DIR}/output \
+ -seq
+
+ echo "Testing on holdout set: CBayes"
+ $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing \
+ -m ${WORK_DIR}/model -l \
+ ${WORK_DIR}/labelindex \
+ -ow \
+ -o ${WORK_DIR}/output \
+ -c \
+ -seq
+fi
+
+elif [ "x$alg" == "xclean" ]; then
+ rm -rf ${WORK_DIR}
+fi
+# Remove the work directory
\ No newline at end of file