You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by ap...@apache.org on 2014/08/29 02:14:47 UTC
git commit: MAHOUT-1608: Add option in WikipediaToSequenceFile to
remove category labels from documents. Remove redundant call to
findMatchingCategory. closes apache/mahout#45
Repository: mahout
Updated Branches:
refs/heads/master 03a5bb61e -> 6dd0c92dd
MAHOUT-1608: Add option in WikipediaToSequenceFile to remove category labels from documents. Remove redundant call to findMatchingCategory. closes apache/mahout#45
Project: http://git-wip-us.apache.org/repos/asf/mahout/repo
Commit: http://git-wip-us.apache.org/repos/asf/mahout/commit/6dd0c92d
Tree: http://git-wip-us.apache.org/repos/asf/mahout/tree/6dd0c92d
Diff: http://git-wip-us.apache.org/repos/asf/mahout/diff/6dd0c92d
Branch: refs/heads/master
Commit: 6dd0c92ddb9dad2627e9ca5e28118865d6fba159
Parents: 03a5bb6
Author: Andrew Palumbo <ap...@outlook.com>
Authored: Thu Aug 28 20:06:13 2014 -0400
Committer: Andrew Palumbo <ap...@outlook.com>
Committed: Thu Aug 28 20:12:22 2014 -0400
----------------------------------------------------------------------
CHANGELOG | 2 +
.../mahout/text/WikipediaToSequenceFile.java | 23 ++++++++--
.../mahout/text/wikipedia/WikipediaMapper.java | 44 +++++++++++++++++---
3 files changed, 59 insertions(+), 10 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/mahout/blob/6dd0c92d/CHANGELOG
----------------------------------------------------------------------
diff --git a/CHANGELOG b/CHANGELOG
index dfccd95..310b5f0 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,6 +2,8 @@ Mahout Change Log
Release 1.0 - unreleased
+ MAHOUT-1608: Add option in WikipediaToSequenceFile to remove category labels from documents (apalumbo)
+
MAHOUT-1604: Spark version of rowsimilarity driver and associated additions to SimilarityAnalysis.scala (pferrel)
MAHOUT-1500: H2O Integration (Anand Avati via apalumbo)
http://git-wip-us.apache.org/repos/asf/mahout/blob/6dd0c92d/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java b/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java
index 7a1e40e..19f353c 100644
--- a/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java
+++ b/integration/src/main/java/org/apache/mahout/text/WikipediaToSequenceFile.java
@@ -88,12 +88,16 @@ public final class WikipediaToSequenceFile {
Option allOpt = obuilder.withLongName("all")
.withDescription("If set, Select all files. Default is false").withShortName("all").create();
-
+
+ Option removeLabelOpt = obuilder.withLongName("removeLabels")
+ .withDescription("If set, remove [[Category:labels]] from document text after extracting label."
+ + "Default is false").withShortName("rl").create();
+
Option helpOpt = DefaultOptionCreator.helpOption();
Group group = gbuilder.withName("Options").withOption(categoriesOpt).withOption(dirInputPathOpt)
.withOption(dirOutputPathOpt).withOption(exactMatchOpt).withOption(allOpt).withOption(helpOpt)
- .create();
+ .withOption(removeLabelOpt).create();
Parser parser = new Parser();
parser.setGroup(group);
@@ -117,7 +121,13 @@ public final class WikipediaToSequenceFile {
if (cmdLine.hasOption(allOpt)) {
all = true;
}
- runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all);
+
+ boolean removeLabels = false;
+ if (cmdLine.hasOption(removeLabelOpt)) {
+ removeLabels = true;
+ }
+
+ runJob(inputPath, outputPath, catFile, cmdLine.hasOption(exactMatchOpt), all, removeLabels);
} catch (OptionException e) {
log.error("Exception", e);
CommandLineUtil.printHelp(group);
@@ -144,17 +154,22 @@ public final class WikipediaToSequenceFile {
* category string
* @param all
* if true select all categories
+ * @param removeLabels
+ * if true remove Category labels from document text after extracting.
+ *
*/
public static void runJob(String input,
String output,
String catFile,
boolean exactMatchOnly,
- boolean all) throws IOException, InterruptedException, ClassNotFoundException {
+ boolean all,
+ boolean removeLabels) throws IOException, InterruptedException, ClassNotFoundException {
Configuration conf = new Configuration();
conf.set("xmlinput.start", "<page>");
conf.set("xmlinput.end", "</page>");
conf.setBoolean("exact.match.only", exactMatchOnly);
conf.setBoolean("all.files", all);
+ conf.setBoolean("remove.labels", removeLabels);
conf.set("io.serializations",
"org.apache.hadoop.io.serializer.JavaSerialization,"
+ "org.apache.hadoop.io.serializer.WritableSerialization");
http://git-wip-us.apache.org/repos/asf/mahout/blob/6dd0c92d/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java
----------------------------------------------------------------------
diff --git a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java
index 94cb12e..d880760 100644
--- a/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java
+++ b/integration/src/main/java/org/apache/mahout/text/wikipedia/WikipediaMapper.java
@@ -59,6 +59,8 @@ public class WikipediaMapper extends Mapper<LongWritable, Text, Text, Text> {
private boolean all;
+ private boolean removeLabels;
+
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
@@ -76,16 +78,23 @@ public class WikipediaMapper extends Mapper<LongWritable, Text, Text, Text> {
return;
}
+ String catMatch = findMatchingCategory(document);
if (!all) {
- String catMatch = findMatchingCategory(document);
if ("Unknown".equals(catMatch)) {
return;
}
}
- String catMatch = findMatchingCategory(document);
+
document = StringEscapeUtils.unescapeHtml4(document);
- // write out in Bayes input style: key: /Category/document_name
+ if (removeLabels) {
+ document = removeCategoriesFromText(document);
+ // Reject documents with malformed tags
+ if (document == null) {
+ return;
+ }
+ }
+ // write out in Bayes input style: key: /Category/document_name
String category = "/" + catMatch.toLowerCase(Locale.ENGLISH) + "/" +
SPACE_NON_ALPHA_PATTERN.matcher(title).replaceAll("_");
@@ -104,9 +113,10 @@ public class WikipediaMapper extends Mapper<LongWritable, Text, Text, Text> {
String categoriesStr = conf.get("wikipedia.categories");
inputCategories = setStringifier.fromString(categoriesStr);
exactMatchOnly = conf.getBoolean("exact.match.only", false);
- all = conf.getBoolean("all.files", true);
- log.info("Configure: Input Categories size: {} All: {} Exact Match: {}",
- inputCategories.size(), all, exactMatchOnly);
+ all = conf.getBoolean("all.files", false);
+ removeLabels = conf.getBoolean("remove.labels",false);
+ log.info("Configure: Input Categories size: {} All: {} Exact Match: {} Remove Labels from Text: {}",
+ inputCategories.size(), all, exactMatchOnly, removeLabels);
}
private static String getDocument(String xml) {
@@ -144,4 +154,26 @@ public class WikipediaMapper extends Mapper<LongWritable, Text, Text, Text> {
}
return "Unknown";
}
+
+ private String removeCategoriesFromText(String document) { // Strips every "[[Category:...]]" tag from the text; returns the cleaned text, or null if a StringIndexOutOfBoundsException is caught.
+ int startIndex = 0; // position from which the next "[[Category:" search starts
+ int categoryIndex; // index of the tag opener found by the current iteration
+ try {
+ while ((categoryIndex = document.indexOf("[[Category:", startIndex)) != -1) { // loop until no further tag opener exists
+ int endIndex = document.indexOf("]]", categoryIndex); // first closing "]]" at or after the opener
+ if (endIndex >= document.length() || endIndex < 0) { // no closing "]]": stop, leaving the unterminated tag in the text. NOTE(review): indexOf never returns >= length, so only the "< 0" half appears reachable.
+ break;
+ }
+ document = document.replace(document.substring(categoryIndex, endIndex + 2), ""); // String.replace removes EVERY occurrence of this exact tag text, not only the one at categoryIndex
+ if (categoryIndex < document.length()) { // text after the tag shifted left, so resume searching at the same index
+ startIndex = categoryIndex;
+ } else { // the removed tag was at the very end of the text; nothing left to scan
+ break;
+ }
+ }
+ } catch(StringIndexOutOfBoundsException e) { // NOTE(review): looks unreachable given the guards above — confirm; intended as the "malformed tags" signal
+ return null; // map() checks for null and skips the document
+ }
+ return document;
+ }
}