You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ss...@apache.org on 2014/05/18 16:33:56 UTC
svn commit: r1595627 - in /mahout/trunk: ./ examples/bin/
examples/src/test/resources/
integration/src/main/java/org/apache/mahout/text/
integration/src/main/java/org/apache/mahout/text/wikipedia/
Author: ssc
Date: Sun May 18 14:33:55 2014
New Revision: 1595627
URL: http://svn.apache.org/r1595627
Log:
MAHOUT-1527 Fix wikipedia classifier example
Added:
mahout/trunk/examples/bin/classify-wiki.sh (with props)
mahout/trunk/examples/src/test/resources/country10.txt
Modified:
mahout/trunk/CHANGELOG
mahout/trunk/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java
mahout/trunk/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java
Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1595627&r1=1595626&r2=1595627&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Sun May 18 14:33:55 2014
@@ -2,6 +2,8 @@ Mahout Change Log
Release 1.0 - unreleased
+ MAHOUT-1527: Fix wikipedia classifier example (Andrew Palumbo via ssc)
+
MAHOUT-1542: Tutorial for playing with Mahout's Spark shell (ssc)
MAHOUT-1532: Add solve() function to the Scala DSL (ssc)
Added: mahout/trunk/examples/bin/classify-wiki.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/classify-wiki.sh?rev=1595627&view=auto
==============================================================================
--- mahout/trunk/examples/bin/classify-wiki.sh (added)
+++ mahout/trunk/examples/bin/classify-wiki.sh Sun May 18 14:33:55 2014
@@ -0,0 +1,133 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Downloads a (partial) wikipedia dump, trains and tests a classifier.
+#
+# To run: change into the mahout directory and type:
+# examples/bin/classify-wiki.sh
+
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+ echo "This script runs the Bayes and CBayes classifiers over the last wikipedia dump."
+ exit
+fi
+
+# ensure that MAHOUT_HOME is set
+if [[ -z "$MAHOUT_HOME" ]]; then
+ echo "Please set MAHOUT_HOME."
+ exit
+fi
+
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
+ cd $SCRIPT_PATH
+fi
+START_PATH=`pwd`
+
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+ HADOOP="$HADOOP_HOME/bin/hadoop"
+ if [ ! -e $HADOOP ]; then
+ echo "Can't find hadoop in $HADOOP, exiting"
+ exit 1
+ fi
+fi
+
+WORK_DIR=/tmp/mahout-work-wiki-${USER}
+algorithm=( CBayes clean)
+if [ -n "$1" ]; then
+ choice=$1
+else
+ echo "Please select a number to choose the corresponding task to run"
+ echo "1. ${algorithm[0]}"
+ echo "2. ${algorithm[1]} -- cleans up the work area in $WORK_DIR"
+ read -p "Enter your choice : " choice
+fi
+
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
+alg=${algorithm[$choice-1]}
+
+if [ "x$alg" != "xclean" ]; then
+ echo "creating work directory at ${WORK_DIR}"
+
+ mkdir -p ${WORK_DIR}
+ if [ ! -e ${WORK_DIR}/wikixml ]; then
+ mkdir -p ${WORK_DIR}/wikixml
+ echo "Downloading wikipedia XML dump"
+ ########## partial small
+ #curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+ ########## partial larger - the default; to use the smaller dump instead, comment this line out and uncomment the line above
+ curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles10.xml-p000925001p001325000.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+
+ ######### Uncomment for full wikipedia dump: 10G zipped
+ #curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+
+ echo "Extracting..."
+
+ cd ${WORK_DIR}/wikixml && bunzip2 enwiki-latest-pages-articles.xml.bz2 && cd .. && cd ..
+ fi
+
+echo $START_PATH
+#cd $START_PATH
+#cd ../..
+
+set -e
+
+if [ "x$alg" == "xCBayes" ]; then
+
+ set -x
+ echo "Preparing wikipedia data"
+ rm -rf ${WORK_DIR}/wiki
+ mkdir ${WORK_DIR}/wiki
+ cp $MAHOUT_HOME/examples/src/test/resources/country10.txt ${WORK_DIR}/country10.txt
+ chmod 666 ${WORK_DIR}/country10.txt
+
+ if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+ echo "Copying wikipedia data to HDFS"
+ set +e
+ $HADOOP dfs -rmr ${WORK_DIR}/wikixml
+ set -e
+ $HADOOP dfs -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml
+ fi
+
+ echo "Creating sequence files from wikiXML"
+ $MAHOUT_HOME/bin/mahout seqwiki -c ${WORK_DIR}/country10.txt -i ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml -o ${WORK_DIR}/wikipediainput
+
+ echo "Converting sequence files to vectors using bi-grams"
+ $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput -o ${WORK_DIR}/wikipedidaVecs -wt tfidf -lnorm -nv -ow -ng 2
+
+ echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
+ $MAHOUT_HOME/bin/mahout split -i ${WORK_DIR}/wikipedidaVecs/tfidf-vectors/ --trainingOutput ${WORK_DIR}/training --testOutput ${WORK_DIR}/testing -rp 20 -ow -seq -xm sequential
+
+ echo "Training Naive Bayes model"
+ $MAHOUT_HOME/bin/mahout trainnb -i ${WORK_DIR}/training -el -o ${WORK_DIR}/model -li ${WORK_DIR}/labelindex -ow -c
+
+ echo "Self testing on training set"
+ $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/training -m ${WORK_DIR}/model -l ${WORK_DIR}/labelindex -ow -o ${WORK_DIR}/output -c
+
+ echo "Testing on holdout set: Bayes"
+ $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing -m ${WORK_DIR}/model -l ${WORK_DIR}/labelindex -ow -o ${WORK_DIR}/output -seq
+
+ echo "Testing on holdout set: CBayes"
+ $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing -m ${WORK_DIR}/model -l ${WORK_DIR}/labelindex -ow -o ${WORK_DIR}/output -c -seq
+fi
+elif [ "x$alg" == "xclean" ]; then
+ rm -rf ${WORK_DIR}
+ rm -rf /tmp/news-group.model
+fi
+# Remove the work directory
+#
Propchange: mahout/trunk/examples/bin/classify-wiki.sh
------------------------------------------------------------------------------
svn:executable = *
Added: mahout/trunk/examples/src/test/resources/country10.txt
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/test/resources/country10.txt?rev=1595627&view=auto
==============================================================================
--- mahout/trunk/examples/src/test/resources/country10.txt (added)
+++ mahout/trunk/examples/src/test/resources/country10.txt Sun May 18 14:33:55 2014
@@ -0,0 +1,10 @@
+Australia
+Austria
+Bahamas
+Canada
+Colombia
+Cuba
+Panama
+Pakistan
+United Kingdom
+Vietnam
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java?rev=1595627&r1=1595626&r2=1595627&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java Sun May 18 14:33:55 2014
@@ -68,7 +68,7 @@ public final class WikipediaToSequenceFi
* {@link org.apache.hadoop.io.SequenceFile}</li>
* </ol>
*/
- public static void main(String[] args) throws IOException {
+ public static void main(String[] args) throws IOException {
DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
ArgumentBuilder abuilder = new ArgumentBuilder();
GroupBuilder gbuilder = new GroupBuilder();
@@ -159,6 +159,19 @@ public final class WikipediaToSequenceFi
"org.apache.hadoop.io.serializer.JavaSerialization,"
+ "org.apache.hadoop.io.serializer.WritableSerialization");
+ Set<String> categories = Sets.newHashSet();
+ if (!catFile.isEmpty()) {
+ for (String line : new FileLineIterable(new File(catFile))) {
+ categories.add(line.trim().toLowerCase(Locale.ENGLISH));
+ }
+ }
+
+ Stringifier<Set<String>> setStringifier =
+ new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));
+
+ String categoriesStr = setStringifier.toString(categories);
+ conf.set("wikipedia.categories", categoriesStr);
+
Job job = new Job(conf);
log.info("Input: {} Out: {} Categories: {} All Files: {}", input, output, catFile, all);
job.setOutputKeyClass(Text.class);
@@ -178,24 +191,11 @@ public final class WikipediaToSequenceFi
* "BLOCK"); conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
*/
HadoopUtil.delete(conf, outPath);
-
- Set<String> categories = Sets.newHashSet();
- if (!catFile.isEmpty()) {
- for (String line : new FileLineIterable(new File(catFile))) {
- categories.add(line.trim().toLowerCase(Locale.ENGLISH));
- }
- }
-
- Stringifier<Set<String>> setStringifier =
- new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));
-
- String categoriesStr = setStringifier.toString(categories);
-
- conf.set("wikipedia.categories", categoriesStr);
-
+
boolean succeeded = job.waitForCompletion(true);
if (!succeeded) {
throw new IllegalStateException("Job failed!");
}
+
}
}
Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java?rev=1595627&r1=1595626&r2=1595627&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java Sun May 18 14:33:55 2014
@@ -82,23 +82,27 @@ public class WikipediaMapper extends Map
return;
}
}
- document = StringEscapeUtils.unescapeHtml4(document);
- context.write(new Text(SPACE_NON_ALPHA_PATTERN.matcher(title).replaceAll("_")), new Text(document));
+ String catMatch = findMatchingCategory(document);
+ document = StringEscapeUtils.unescapeHtml4(document);
+ // write out in Bayes input style: key: /Category/document_name
+
+ String category = "/" + catMatch.toLowerCase(Locale.ENGLISH) + "/" +
+ SPACE_NON_ALPHA_PATTERN.matcher(title).replaceAll("_");
+
+ context.write(new Text(category), new Text(document));
}
@Override
protected void setup(Context context) throws IOException, InterruptedException {
super.setup(context);
Configuration conf = context.getConfiguration();
- if (inputCategories == null) {
- Set<String> newCategories = Sets.newHashSet();
-
- DefaultStringifier<Set<String>> setStringifier =
+
+ Set<String> newCategories = Sets.newHashSet();
+ DefaultStringifier<Set<String>> setStringifier =
new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(newCategories));
- String categoriesStr = conf.get("wikipedia.categories", setStringifier.toString(newCategories));
- inputCategories = setStringifier.fromString(categoriesStr);
- }
+ String categoriesStr = conf.get("wikipedia.categories");
+ inputCategories = setStringifier.fromString(categoriesStr);
exactMatchOnly = conf.getBoolean("exact.match.only", false);
all = conf.getBoolean("all.files", true);
log.info("Configure: Input Categories size: {} All: {} Exact Match: {}",
@@ -127,12 +131,12 @@ public class WikipediaMapper extends Map
}
String category = document.substring(categoryIndex, endIndex).toLowerCase(Locale.ENGLISH).trim();
if (exactMatchOnly && inputCategories.contains(category)) {
- return category;
+ return category.toLowerCase(Locale.ENGLISH);
}
if (!exactMatchOnly) {
for (String inputCategory : inputCategories) {
if (category.contains(inputCategory)) { // we have an inexact match
- return inputCategory;
+ return inputCategory.toLowerCase(Locale.ENGLISH);
}
}
}