You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain text copy.
Posted to commits@mahout.apache.org by ap...@apache.org on 2014/08/29 02:14:47 UTC

git commit: MAHOUT-1608: Add option in WikipediaToSequenceFile to remove category labels from documents. Remove redundant call to findMatchingCategory. closes apache/mahout#45

Repository: mahout
Updated Branches:
  refs/heads/master 03a5bb61e -> 6dd0c92dd


MAHOUT-1608: Add option in WikipediaToSequenceFile to remove category labels from documents. Remove redundant call to findMatchingCategory.  closes apache/mahout#45


Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/6dd0c92d
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/6dd0c92d
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/6dd0c92d

Branch: refs/heads/master
Commit: 6dd0c92ddb9dad2627e9ca5e28118865d6fba159
Parents: 03a5bb6
Author: Andrew Palumbo <ap...@outlook.com>
Authored: Thu Aug 28 20:06:13 2014 -0400
Committer: Andrew Palumbo <ap...@outlook.com>
Committed: Thu Aug 28 20:12:22 2014 -0400

----------------------------------------------------------------------
 CHANGELOG                                       |  2 +
 .../mahout/text/WikipediaToSequenceFile.java    | 23 ++++++++--
 .../mahout/text/wikipedia/WikipediaMapper.java  | 44 +++++++++++++++++---
 3 files changed, 59 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/mahout/blob/6dd0c92d/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index dfccd95..310b5f0 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,8 @@ Mahout Change Log
 
 Release 1.0 - unreleased
 
+  MAHOUT-1608: Add option in WikipediaToSequenceFile to remove category labels from documents (apalumbo)
+
   MAHOUT-1604: Spark version of rowsimilarity driver and associated additions to SimilarityAnalysis.scala (pferrel)
 
   MAHOUT-1500: H2O Integration (Anand Avati via apalumbo)

http://git-wip-us.apache.org/repos/asf/mahout/blob/6dd0c92d/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java b/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java
index 7a1e40e..19f353c 100644
--- a/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java
+++ b/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java
@@ -88,12 +88,16 @@ public final class WikipediaToSequenceFile {
     
     Option allOpt = obuilder.withLongName("all")
         .withDescription("If set, Select all files. Default is false").withShortName("all").create();
-    
+
+    Option removeLabelOpt = obuilder.withLongName("removeLabels")
+        .withDescription("If set, remove [[Category:labels]] from document text after extracting label."
+          + "Default is false").withShortName("rl").create();
+
     Option helpOpt = DefaultOptionCreator.helpOption();
     
     Group group = gbuilder.withName("Options").withOption(categoriesOpt).withOption(dirInputPathOpt)
         .withOption(dirOutputPathOpt).withOption(exactMatchOpt).withOption(allOpt).withOption(helpOpt)
-        .create();
+        .withOption(removeLabelOpt).create();
     
     Parser parser = new Parser();
     parser.setGroup(group);
@@ -117,7 +121,13 @@ public final class WikipediaToSequenceFile {
       if (cmdLine.hasOption(allOpt)) {
         all = true;
       }
-      runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all);
+
+      boolean removeLabels = false;
+      if (cmdLine.hasOption(removeLabelOpt)) {
+          removeLabels = true;
+      }
+
+      runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all, removeLabels);
     } catch (OptionException e) {
       log.error("Exception", e);
       CommandLineUtil.printHelp(group);
@@ -144,17 +154,22 @@ public final class WikipediaToSequenceFile {
    *          category string
    * @param all
    *          if true select all categories
+   * @param removeLabels
+   *          if true remove Category labels from document text after extracting.
+   *
    */
   public static void runJob(String input,
                             String output,
                             String catFile,
                             boolean exactMatchOnly,
-                            boolean all) throws IOException, InterruptedException, ClassNotFoundException {
+                            boolean all,
+                            boolean removeLabels) throws IOException, InterruptedException, ClassNotFoundException {
     Configuration conf = new Configuration();
     conf.set("xmlinput.start", "<page>");
     conf.set("xmlinput.end", "</page>");
     conf.setBoolean("exact.match.only", exactMatchOnly);
     conf.setBoolean("all.files", all);
+    conf.setBoolean("remove.labels", removeLabels);
     conf.set("io.serializations",
              "org.apache.hadoop.io.serializer.JavaSerialization,"
              + "org.apache.hadoop.io.serializer.WritableSerialization");

http://git-wip-us.apache.org/repos/asf/mahout/blob/6dd0c92d/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java
index 94cb12e..d880760 100644
--- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java
+++ b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java
@@ -59,6 +59,8 @@ public class WikipediaMapper extends Mapper<LongWritable, Text, Text, Text> {
 
   private boolean all;
 
+  private boolean removeLabels;
+
   @Override
   protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
 
@@ -76,16 +78,23 @@ public class WikipediaMapper extends Mapper<LongWritable, Text, Text, Text> {
       return;
     }
 
+    String catMatch = findMatchingCategory(document);
     if (!all) {
-      String catMatch = findMatchingCategory(document);
       if ("Unknown".equals(catMatch)) {
         return;
       }
     }
-    String catMatch = findMatchingCategory(document);
+
     document = StringEscapeUtils.unescapeHtml4(document);    
-    // write out in Bayes input style: key: /Category/document_name
+    if (removeLabels) {
+      document = removeCategoriesFromText(document);
+      // Reject documents with malformed tags
+      if (document == null) {
+        return;
+      }
+    }
 
+    // write out in Bayes input style: key: /Category/document_name
     String category = "/" + catMatch.toLowerCase(Locale.ENGLISH) + "/" +
         SPACE_NON_ALPHA_PATTERN.matcher(title).replaceAll("_");
 
@@ -104,9 +113,10 @@ public class WikipediaMapper extends Mapper<LongWritable, Text, Text, Text> {
     String categoriesStr = conf.get("wikipedia.categories");
     inputCategories = setStringifier.fromString(categoriesStr);
     exactMatchOnly = conf.getBoolean("exact.match.only", false);
-    all = conf.getBoolean("all.files", true);
-    log.info("Configure: Input Categories size: {} All: {} Exact Match: {}",
-             inputCategories.size(), all, exactMatchOnly);
+    all = conf.getBoolean("all.files", false);
+    removeLabels = conf.getBoolean("remove.labels",false);
+    log.info("Configure: Input Categories size: {} All: {} Exact Match: {} Remove Labels from Text: {}",
+            inputCategories.size(), all, exactMatchOnly, removeLabels);
   }
 
   private static String getDocument(String xml) {
@@ -144,4 +154,26 @@ public class WikipediaMapper extends Mapper<LongWritable, Text, Text, Text> {
     }
     return "Unknown";
   }
+
+  private String removeCategoriesFromText(String document) {
+    int startIndex = 0;
+    int categoryIndex;
+    try {
+      while ((categoryIndex = document.indexOf("[[Category:", startIndex)) != -1) {
+        int endIndex = document.indexOf("]]", categoryIndex);
+        if (endIndex >= document.length() || endIndex < 0) {
+          break;
+        }
+        document = document.replace(document.substring(categoryIndex, endIndex + 2), "");
+        if (categoryIndex < document.length()) {
+          startIndex = categoryIndex;
+        } else {
+          break;
+        }
+      }
+    } catch(StringIndexOutOfBoundsException e) {
+      return null;
+    }
+    return document;
+  }
 }