Posted to commits@mahout.apache.org by ss...@apache.org on 2014/05/18 16:33:56 UTC

svn commit: r1595627 - in /mahout/trunk: ./ examples/bin/ examples/src/test/resources/ integration/src/main/java/org/apache/mahout/text/ integration/src/main/java/org/apache/mahout/text/wikipedia/

Author: ssc
Date: Sun May 18 14:33:55 2014
New Revision: 1595627

URL: http://svn.apache.org/r1595627
Log:
MAHOUT-1527 Fix wikipedia classifier example

Added:
    mahout/trunk/examples/bin/classify-wiki.sh   (with props)
    mahout/trunk/examples/src/test/resources/country10.txt
Modified:
    mahout/trunk/CHANGELOG
    mahout/trunk/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java
    mahout/trunk/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java

Modified: mahout/trunk/CHANGELOG
URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1595627&r1=1595626&r2=1595627&view=diff
==============================================================================
--- mahout/trunk/CHANGELOG (original)
+++ mahout/trunk/CHANGELOG Sun May 18 14:33:55 2014
@@ -2,6 +2,8 @@ Mahout Change Log
 
 Release 1.0 - unreleased
 
+  MAHOUT-1527: Fix wikipedia classifier example (Andrew Palumbo via ssc)
+
   MAHOUT-1542: Tutorial for playing with Mahout's Spark shell (ssc)
 
   MAHOUT-1532: Add solve() function to the Scala DSL (ssc)

Added: mahout/trunk/examples/bin/classify-wiki.sh
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/bin/classify-wiki.sh?rev=1595627&view=auto
==============================================================================
--- mahout/trunk/examples/bin/classify-wiki.sh (added)
+++ mahout/trunk/examples/bin/classify-wiki.sh Sun May 18 14:33:55 2014
@@ -0,0 +1,133 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+# Downloads a (partial) wikipedia dump, trains and tests a classifier.
+#
+# To run:  change into the mahout directory and type:
+# examples/bin/classify-wiki.sh
+
+if [ "$1" = "--help" ] || [ "$1" = "--?" ]; then
+  echo "This script Bayes and CBayes classifiers over the last wikipedia dump."
+  exit
+fi
+
+# ensure that MAHOUT_HOME is set
+if [[ -z "$MAHOUT_HOME" ]]; then
+  echo "Please set MAHOUT_HOME."
+  exit
+fi
+
+SCRIPT_PATH=${0%/*}
+if [ "$0" != "$SCRIPT_PATH" ] && [ "$SCRIPT_PATH" != "" ]; then
+  cd $SCRIPT_PATH
+fi
+START_PATH=`pwd`
+
+if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+  HADOOP="$HADOOP_HOME/bin/hadoop"
+  if [ ! -e $HADOOP ]; then
+    echo "Can't find hadoop in $HADOOP, exiting"
+    exit 1
+  fi
+fi
+
+WORK_DIR=/tmp/mahout-work-wiki-${USER}
+algorithm=(CBayes clean)
+if [ -n "$1" ]; then
+  choice=$1
+else
+  echo "Please select a number to choose the corresponding task to run"
+  echo "1. ${algorithm[0]}"
+  echo "2. ${algorithm[1]} -- cleans up the work area in $WORK_DIR"
+  read -p "Enter your choice : " choice
+fi
+
+echo "ok. You chose $choice and we'll use ${algorithm[$choice-1]}"
+alg=${algorithm[$choice-1]}
+
+if [ "x$alg" != "xclean" ]; then
+  echo "creating work directory at ${WORK_DIR}"
+
+  mkdir -p ${WORK_DIR}
+    if [ ! -e ${WORK_DIR}/wikixml ]; then
+        mkdir -p ${WORK_DIR}/wikixml
+        echo "Downloading wikipedia XML dump"        
+        ########## partial small dump - uncomment to use
+         #curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+         ########## partial larger dump (default)
+         curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles10.xml-p000925001p001325000.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+      
+         ######### Uncomment for full wikipedia dump: 10G zipped
+         #curl http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 -o ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml.bz2
+      
+      echo "Extracting..."
+       
+      cd ${WORK_DIR}/wikixml && bunzip2 enwiki-latest-pages-articles.xml.bz2 && cd .. && cd ..
+    fi
+
+echo $START_PATH
+#cd $START_PATH
+#cd ../..
+
+set -e
+
+if [ "x$alg" == "xCBayes" ]; then
+
+  set -x
+  echo "Preparing wikipedia data"
+  rm -rf ${WORK_DIR}/wiki
+  mkdir ${WORK_DIR}/wiki
+  cp $MAHOUT_HOME/examples/src/test/resources/country10.txt ${WORK_DIR}/country10.txt
+  chmod 666 ${WORK_DIR}/country10.txt
+
+  if [ "$HADOOP_HOME" != "" ] && [ "$MAHOUT_LOCAL" == "" ] ; then
+    echo "Copying wikipedia data to HDFS"
+    set +e
+    $HADOOP dfs -rmr ${WORK_DIR}/wikixml
+    set -e
+    $HADOOP dfs -put ${WORK_DIR}/wikixml ${WORK_DIR}/wikixml
+  fi
+
+  echo "Creating sequence files from wikiXML"
+  $MAHOUT_HOME/bin/mahout seqwiki -c ${WORK_DIR}/country10.txt -i ${WORK_DIR}/wikixml/enwiki-latest-pages-articles.xml -o ${WORK_DIR}/wikipediainput
+
+  echo "Converting sequence files to vectors using bi-grams"
+  $MAHOUT_HOME/bin/mahout seq2sparse -i ${WORK_DIR}/wikipediainput -o ${WORK_DIR}/wikipediaVecs -wt tfidf -lnorm -nv -ow -ng 2
+
+  echo "Creating training and holdout set with a random 80-20 split of the generated vector dataset"
+  $MAHOUT_HOME/bin/mahout split -i ${WORK_DIR}/wikipediaVecs/tfidf-vectors/ --trainingOutput ${WORK_DIR}/training --testOutput ${WORK_DIR}/testing -rp 20 -ow -seq -xm sequential
+
+  echo "Training Naive Bayes model"
+  $MAHOUT_HOME/bin/mahout trainnb -i ${WORK_DIR}/training -el -o ${WORK_DIR}/model -li ${WORK_DIR}/labelindex -ow -c
+
+  echo "Self testing on training set"
+  $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/training -m ${WORK_DIR}/model -l ${WORK_DIR}/labelindex -ow -o ${WORK_DIR}/output -c
+
+  echo "Testing on holdout set: Bayes"
+  $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing -m ${WORK_DIR}/model -l ${WORK_DIR}/labelindex -ow -o ${WORK_DIR}/output -seq
+
+  echo "Testing on holdout set: CBayes"
+  $MAHOUT_HOME/bin/mahout testnb -i ${WORK_DIR}/testing -m ${WORK_DIR}/model -l ${WORK_DIR}/labelindex -ow -o ${WORK_DIR}/output -c -seq
+fi
+elif [ "x$alg" == "xclean" ]; then
+  rm -rf ${WORK_DIR}
+  rm -rf /tmp/news-group.model
+fi
+# Remove the work directory
+#
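
Note: after seqwiki has run, ${WORK_DIR}/wikipediainput holds a SequenceFile of Text keys and Text values, where each key takes the /category/Document_Name form produced by the modified WikipediaMapper (see the diff further down). A small reader along the following lines can be used to eyeball a few of those keys. This is a sketch only, not part of the commit: the class name is made up, and the path of one part file under ${WORK_DIR}/wikipediainput is assumed to be passed as the first argument.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.hadoop.io.Text;

    public class PeekWikiSeq {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // args[0]: one part file written by 'mahout seqwiki'
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(args[0]), conf);
        try {
          Text key = new Text();
          Text value = new Text();
          int shown = 0;
          // Keys should look like /category/Document_Name after this change.
          while (reader.next(key, value) && shown < 10) {
            System.out.println(key + "  (" + value.getLength() + " bytes of article text)");
            shown++;
          }
        } finally {
          reader.close();
        }
      }
    }

With the categories from country10.txt, keys such as /united kingdom/Some_Article_Title would be expected.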

Propchange: mahout/trunk/examples/bin/classify-wiki.sh
------------------------------------------------------------------------------
    svn:executable = *

Added: mahout/trunk/examples/src/test/resources/country10.txt
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/test/resources/country10.txt?rev=1595627&view=auto
==============================================================================
--- mahout/trunk/examples/src/test/resources/country10.txt (added)
+++ mahout/trunk/examples/src/test/resources/country10.txt Sun May 18 14:33:55 2014
@@ -0,0 +1,10 @@
+Australia
+Austria
+Bahamas
+Canada
+Colombia
+Cuba
+Panama
+Pakistan
+United Kingdom
+Vietnam

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java?rev=1595627&r1=1595626&r2=1595627&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java Sun May 18 14:33:55 2014
@@ -68,7 +68,7 @@ public final class WikipediaToSequenceFi
    * {@link org.apache.hadoop.io.SequenceFile}</li>
    * </ol>
    */
-  public static void main(String[] args) throws IOException {
+  public static void main(String[] args)  throws IOException {
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
@@ -159,6 +159,19 @@ public final class WikipediaToSequenceFi
              "org.apache.hadoop.io.serializer.JavaSerialization,"
              + "org.apache.hadoop.io.serializer.WritableSerialization");
     
+    Set<String> categories = Sets.newHashSet();
+    if (!catFile.isEmpty()) {
+      for (String line : new FileLineIterable(new File(catFile))) {
+        categories.add(line.trim().toLowerCase(Locale.ENGLISH));
+      }
+    }
+    
+    Stringifier<Set<String>> setStringifier =
+        new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));
+    
+    String categoriesStr = setStringifier.toString(categories);    
+    conf.set("wikipedia.categories", categoriesStr);
+    
     Job job = new Job(conf);
     log.info("Input: {} Out: {} Categories: {} All Files: {}", input, output, catFile, all);
     job.setOutputKeyClass(Text.class);
@@ -178,24 +191,11 @@ public final class WikipediaToSequenceFi
      * "BLOCK"); conf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
      */
     HadoopUtil.delete(conf, outPath);
-    
-    Set<String> categories = Sets.newHashSet();
-    if (!catFile.isEmpty()) {
-      for (String line : new FileLineIterable(new File(catFile))) {
-        categories.add(line.trim().toLowerCase(Locale.ENGLISH));
-      }
-    }
-    
-    Stringifier<Set<String>> setStringifier =
-        new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));
-    
-    String categoriesStr = setStringifier.toString(categories);
-    
-    conf.set("wikipedia.categories", categoriesStr);
-    
+
     boolean succeeded = job.waitForCompletion(true);
     if (!succeeded) {
       throw new IllegalStateException("Job failed!");
     }
+  
   }
 }
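
Note on the change above: Job(conf) takes its own copy of the Configuration, so the old code, which serialized the category set into conf only after the Job had been constructed, never made "wikipedia.categories" visible to the map tasks. Moving the DefaultStringifier step ahead of the Job construction is what actually fixes the classifier example. A minimal, self-contained sketch of that round trip follows; it is not part of the commit, and the class name and sample categories are invented for illustration.

    import java.util.Set;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.DefaultStringifier;
    import org.apache.hadoop.util.GenericsUtil;

    import com.google.common.collect.Sets;

    public class CategoryConfRoundTrip {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // DefaultStringifier needs a serializer that can handle java.util.HashSet.
        conf.set("io.serializations",
            "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");

        Set<String> categories = Sets.newHashSet("united kingdom", "vietnam");
        DefaultStringifier<Set<String>> stringifier =
            new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(categories));

        // Driver side: serialize the set into the configuration *before* new Job(conf).
        conf.set("wikipedia.categories", stringifier.toString(categories));

        // Task side: WikipediaMapper.setup() does the equivalent of this read-back.
        Set<String> restored = stringifier.fromString(conf.get("wikipedia.categories"));
        System.out.println(restored);
      }
    }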

Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java?rev=1595627&r1=1595626&r2=1595627&view=diff
==============================================================================
--- mahout/trunk/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java (original)
+++ mahout/trunk/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java Sun May 18 14:33:55 2014
@@ -82,23 +82,27 @@ public class WikipediaMapper extends Map
         return;
       }
     }
-    document = StringEscapeUtils.unescapeHtml4(document);
-    context.write(new Text(SPACE_NON_ALPHA_PATTERN.matcher(title).replaceAll("_")), new Text(document));
+    String catMatch = findMatchingCategory(document);
+    document = StringEscapeUtils.unescapeHtml4(document);    
+    // write out in Bayes input style: key: /Category/document_name
+
+    String category = "/" + catMatch.toLowerCase(Locale.ENGLISH) + "/" +
+        SPACE_NON_ALPHA_PATTERN.matcher(title).replaceAll("_");
+
+    context.write(new Text(category), new Text(document));
   }
 
   @Override
   protected void setup(Context context) throws IOException, InterruptedException {
     super.setup(context);
     Configuration conf = context.getConfiguration();
-    if (inputCategories == null) {
-      Set<String> newCategories = Sets.newHashSet();
-
-      DefaultStringifier<Set<String>> setStringifier =
+ 
+    Set<String> newCategories = Sets.newHashSet();
+    DefaultStringifier<Set<String>> setStringifier =
           new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(newCategories));
 
-      String categoriesStr = conf.get("wikipedia.categories", setStringifier.toString(newCategories));
-      inputCategories = setStringifier.fromString(categoriesStr);
-    }
+    String categoriesStr = conf.get("wikipedia.categories");
+    inputCategories = setStringifier.fromString(categoriesStr);
     exactMatchOnly = conf.getBoolean("exact.match.only", false);
     all = conf.getBoolean("all.files", true);
     log.info("Configure: Input Categories size: {} All: {} Exact Match: {}",
@@ -127,12 +131,12 @@ public class WikipediaMapper extends Map
       }
       String category = document.substring(categoryIndex, endIndex).toLowerCase(Locale.ENGLISH).trim();
       if (exactMatchOnly && inputCategories.contains(category)) {
-        return category;
+        return category.toLowerCase(Locale.ENGLISH);
       }
       if (!exactMatchOnly) {
         for (String inputCategory : inputCategories) {
           if (category.contains(inputCategory)) { // we have an inexact match
-            return inputCategory;
+            return inputCategory.toLowerCase(Locale.ENGLISH);
           }
         }
       }
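
The keys the mapper now emits are built from the first matching category, lower-cased, and the page title with whitespace and non-word characters collapsed to underscores. A tiny stand-alone illustration of that formatting (a sketch, not the mapper itself; the regex here approximates SPACE_NON_ALPHA_PATTERN and may not match it exactly):

    import java.util.Locale;
    import java.util.regex.Pattern;

    public class WikiKeyFormat {
      // Approximation of the mapper's SPACE_NON_ALPHA_PATTERN.
      private static final Pattern SPACE_NON_ALPHA = Pattern.compile("[\\s\\W]");

      static String key(String matchedCategory, String title) {
        return "/" + matchedCategory.toLowerCase(Locale.ENGLISH) + "/"
            + SPACE_NON_ALPHA.matcher(title).replaceAll("_");
      }

      public static void main(String[] args) {
        // Prints: /united kingdom/Tower_of_London
        System.out.println(key("United Kingdom", "Tower of London"));
      }
    }

This /label/document key layout is the "Bayes input style" referred to in the map() comment above; the trainnb -el step in classify-wiki.sh extracts the class labels from it.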