You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/07/16 02:59:42 UTC

svn commit: r1691298 - in /nutch/trunk: ./ conf/ src/plugin/ src/plugin/index-replace/ src/plugin/index-replace/sample/ src/plugin/index-replace/src/ src/plugin/index-replace/src/java/ src/plugin/index-replace/src/java/org/ src/plugin/index-replace/src...

Author: mattmann
Date: Thu Jul 16 00:59:41 2015
New Revision: 1691298

URL: http://svn.apache.org/r1691298
Log:
Fix for NUTCH-2058: Indexer plugin that allows RegEx replacements on the NutchDocument field values contributed by PeterCiuffetti <pc...@astreetpress.com> this closes #44

Added:
    nutch/trunk/src/plugin/index-replace/
    nutch/trunk/src/plugin/index-replace/README.txt
    nutch/trunk/src/plugin/index-replace/build.xml
    nutch/trunk/src/plugin/index-replace/ivy.xml
    nutch/trunk/src/plugin/index-replace/plugin.xml
    nutch/trunk/src/plugin/index-replace/sample/
    nutch/trunk/src/plugin/index-replace/sample/testIndexReplace.html
    nutch/trunk/src/plugin/index-replace/src/
    nutch/trunk/src/plugin/index-replace/src/java/
    nutch/trunk/src/plugin/index-replace/src/java/org/
    nutch/trunk/src/plugin/index-replace/src/java/org/apache/
    nutch/trunk/src/plugin/index-replace/src/java/org/apache/nutch/
    nutch/trunk/src/plugin/index-replace/src/java/org/apache/nutch/indexer/
    nutch/trunk/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/
    nutch/trunk/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/FieldReplacer.java
    nutch/trunk/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
    nutch/trunk/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/package-info.java
    nutch/trunk/src/plugin/index-replace/src/test/
    nutch/trunk/src/plugin/index-replace/src/test/org/
    nutch/trunk/src/plugin/index-replace/src/test/org/apache/
    nutch/trunk/src/plugin/index-replace/src/test/org/apache/nutch/
    nutch/trunk/src/plugin/index-replace/src/test/org/apache/nutch/indexer/
    nutch/trunk/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/
    nutch/trunk/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java
    nutch/trunk/src/plugin/parse-replace/
    nutch/trunk/src/plugin/parse-replace/README.txt
    nutch/trunk/src/plugin/parse-replace/build.xml
    nutch/trunk/src/plugin/parse-replace/ivy.xml
    nutch/trunk/src/plugin/parse-replace/plugin.xml
    nutch/trunk/src/plugin/parse-replace/sample/
    nutch/trunk/src/plugin/parse-replace/sample/testParseReplace.html
    nutch/trunk/src/plugin/parse-replace/src/
    nutch/trunk/src/plugin/parse-replace/src/java/
    nutch/trunk/src/plugin/parse-replace/src/java/org/
    nutch/trunk/src/plugin/parse-replace/src/java/org/apache/
    nutch/trunk/src/plugin/parse-replace/src/java/org/apache/nutch/
    nutch/trunk/src/plugin/parse-replace/src/java/org/apache/nutch/parse/
    nutch/trunk/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/
    nutch/trunk/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/ReplaceParser.java
    nutch/trunk/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/package-info.java
    nutch/trunk/src/plugin/parse-replace/src/test/
    nutch/trunk/src/plugin/parse-replace/src/test/org/
    nutch/trunk/src/plugin/parse-replace/src/test/org/apache/
    nutch/trunk/src/plugin/parse-replace/src/test/org/apache/nutch/
    nutch/trunk/src/plugin/parse-replace/src/test/org/apache/nutch/parse/
    nutch/trunk/src/plugin/parse-replace/src/test/org/apache/nutch/parse/replace/
    nutch/trunk/src/plugin/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/build.xml
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/plugin/build.xml

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1691298&r1=1691297&r2=1691298&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Jul 16 00:59:41 2015
@@ -2,6 +2,9 @@ Nutch Change Log
   
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-2058 Indexer plugin that allows RegEx replacements on the NutchDocument 
+  field values (Peter Ciuffetti via mattmann)
+
 * NUTCH-2059 protocol-httpclient, protocol-http unit test errors on Jenkins (Peter Ciuffetti via mattmann)
 
 * NUTCH-1980 Jexl expressions for CrawlDbReader (markus)
@@ -83,8 +86,6 @@ Release Report: http://s.apache.org/nutc
 
 * NUTCH-1987 - Make bin/crawl indexer agnostic (Michael Joyce, snagel via mattmann)
  
-* NUTCH-1854 bin/crawl fails with a parsing fetcher (Asitang Mishra via snagel)
-
 * NUTCH-1989 Handling invalid URLs in CommonCrawlDataDumper (Giuseppe Totaro via mattmann)
 
 * NUTCH-1988 Make nested output directory dump optional (Michael Joyce via mattmann)

Modified: nutch/trunk/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1691298&r1=1691297&r2=1691298&view=diff
==============================================================================
--- nutch/trunk/build.xml (original)
+++ nutch/trunk/build.xml Thu Jul 16 00:59:41 2015
@@ -176,6 +176,7 @@
       <packageset dir="${plugins.dir}/index-metadata/src/java"/>
       <packageset dir="${plugins.dir}/index-more/src/java"/>
       <packageset dir="${plugins.dir}/index-geoip/src/java"/>
+      <packageset dir="${plugins.dir}/index-replace/src/java"/>
       <packageset dir="${plugins.dir}/index-static/src/java"/>
       <packageset dir="${plugins.dir}/mimetype-filter/src/java"/>
       <packageset dir="${plugins.dir}/indexer-dummy/src/java"/>
@@ -588,6 +589,7 @@
       <packageset dir="${plugins.dir}/index-geoip/src/java"/>
       <packageset dir="${plugins.dir}/index-metadata/src/java"/>
       <packageset dir="${plugins.dir}/index-more/src/java"/>
+      <packageset dir="${plugins.dir}/index-replace/src/java"/>
       <packageset dir="${plugins.dir}/index-static/src/java"/>
       <packageset dir="${plugins.dir}/mimetype-filter/src/java"/>
       <packageset dir="${plugins.dir}/indexer-dummy/src/java"/>
@@ -988,6 +990,8 @@
         <source path="${plugins.dir}/index-metadata/src/java/" />
         <source path="${plugins.dir}/index-more/src/java/" />
         <source path="${plugins.dir}/index-more/src/test/" />
+        <source path="${plugins.dir}/index-replace/src/java/" />
+        <source path="${plugins.dir}/index-replace/src/test/" />
         <source path="${plugins.dir}/index-static/src/java/" />
         <source path="${plugins.dir}/index-static/src/test/" />
         <source path="${plugins.dir}/language-identifier/src/java/" />

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1691298&r1=1691297&r2=1691298&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Thu Jul 16 00:59:41 2015
@@ -1501,6 +1501,23 @@ CAUTION: Set the parser.timeout to -1 or
   </description>
 </property>
 
+<property>
+  <name>index.replace.regexp</name>
+  <value/>
+  <description>Allows indexing-time regexp replace manipulation of metadata fields.
+    The format of the property is a list of regexp replacements, one line per field being
+    modified.  Include index-replace in your plugin.includes.
+
+    Example:
+        hostmatch=.*somedomain.com
+        fldname1=/regexp/replacement/flags
+        fldname2=/regexp/replacement/flags
+
+    Field names would be one of those from https://wiki.apache.org/nutch/IndexStructure.
+    See https://wiki.apache.org/nutch/IndexReplace for further details.
+  </description>
+</property>
+
 <!-- parse-metatags plugin properties -->
 <property>
   <name>metatags.names</name>

Modified: nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1691298&r1=1691297&r2=1691298&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Thu Jul 16 00:59:41 2015
@@ -33,6 +33,7 @@
      <ant dir="index-anchor" target="deploy"/>
      <ant dir="index-geoip" target="deploy"/>
      <ant dir="index-more" target="deploy"/>
+     <ant dir="index-replace" target="deploy"/>
      <ant dir="index-static" target="deploy"/>
      <ant dir="index-metadata" target="deploy"/>
      <ant dir="mimetype-filter" target="deploy"/>
@@ -94,6 +95,7 @@
      <ant dir="index-geoip" target="test"/>
      <ant dir="index-more" target="test"/>
      <ant dir="index-static" target="test"/>
+     <ant dir="index-replace" target="test"/>
      <ant dir="mimetype-filter" target="test"/>
      <ant dir="language-identifier" target="test"/>
      <ant dir="lib-http" target="test"/>
@@ -137,6 +139,7 @@
     <ant dir="index-geoip" target="clean"/>
     <ant dir="index-more" target="clean"/>
     <ant dir="index-static" target="clean"/>
+    <ant dir="index-replace" target="clean"/>
     <ant dir="index-metadata" target="clean"/>
     <ant dir="mimetype-filter" target="clean"/>
     <ant dir="indexer-dummy" target="clean"/>

Added: nutch/trunk/src/plugin/index-replace/README.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-replace/README.txt?rev=1691298&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-replace/README.txt (added)
+++ nutch/trunk/src/plugin/index-replace/README.txt Thu Jul 16 00:59:41 2015
@@ -0,0 +1,95 @@
+IndexReplace plugin
+
+Allows indexing-time regexp replace manipulation of metadata fields.
+
+Configuration Example
+    <property>
+      <name>index.replace.regexp</name>
+      <value>
+        id=/file\:/http\:my.site.com/
+        url=/file\:/http\:my.site.com/2
+      </value>
+    </property>
+
+Property format: index.replace.regexp
+    The format of the property is a list of regexp replacements, one line per field being
+    modified.  Field names would be one of those from https://wiki.apache.org/nutch/IndexStructure.
+
+    The fieldname precedes the equal sign.  The first character after the equal sign signifies
+    the delimiter for the regexp, the replacement value and the flags.
+
+Replacement Sequence
+    The replacements will happen in the order listed. If a field needs multiple replacement operations
+    they may be listed more than once.
+
+RegExp Format
+    The regexp and the optional flags should correspond to Pattern.compile(String regexp, int flags) defined
+    here: http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#compile%28java.lang.String,%20int%29
+    Patterns are compiled when the plugin is initialized for efficiency.
+
+Replacement Format
+    The replacement value should correspond to Java Matcher(CharSequence input).replaceAll(String replacement):
+    http://docs.oracle.com/javase/7/docs/api/java/util/regex/Matcher.html#replaceAll%28java.lang.String%29
+
+Flags
+    The flags value is an integer sum of the flag values defined in
+    http://docs.oracle.com/javase/7/docs/api/constant-values.html (Sec: java.util.regex.Pattern)
+
+Creating New Fields
+    If you express the fieldname as fldname1:fldname2=[replacement], then the replacer will create a new field
+    from the source field.  The source field remains unmodified.  This is an alternative to solrindex-mapping
+    which is only able to copy fields verbatim.
+
+Multi-valued Fields
+    If a field has multiple values, the replacement will be applied to each value in turn.
+
+Non-string Datatypes
+    Replacement is possible only on String field datatypes.  If the field you name in the property is
+    not a String datatype, it will be silently ignored.
+
+Host and URL specific replacements.
+    If the replacements should apply only to specific pages, then add a sequence like
+
+    hostmatch=hostmatchpattern
+    fld1=/regexp/replace/flags
+    fld2=/regexp/replace/flags
+
+    or
+    urlmatch=urlmatchpattern
+    fld1=/regexp/replace/flags
+    fld2=/regexp/replace/flags
+
+When using Host and URL replacements, all replacements preceding the first hostmatch or urlmatch
+will apply to all parsed pages.  Replacements following a hostmatch or urlmatch will be applied
+to pages which match the host or url field (up to the next hostmatch or urlmatch line).  hostmatch
+and urlmatch patterns must be unique in this property.
+
+Plugin order
+    In most cases you will want this plugin to run last.
+
+Testing your match patterns
+    Online Regexp testers like http://www.regexplanet.com/advanced/java/index.html
+    can help get the basics of your pattern working.
+    To test in nutch: 
+        Prepare a test HTML file with the field contents you want to test. 
+        Place this in a directory accessible to nutch.
+        Use the file:/// syntax to list the test file(s) in a test/urls seed list.
+        See the nutch faq "index my local file system" for conf settings you will need.
+        (Note the urlmatch and hostmatch patterns may not conform to your test file host and url; This
+        test approach confirms only how your global matches behave, unless your urlmatch and hostmatch
+        patterns also match the file: URL pattern)
+ 
+    Run..
+        bin/nutch inject crawl/crawldb test
+        bin/nutch generate crawl/crawldb crawl/segments
+        bin/nutch fetch crawl/segments/[segment]
+        bin/nutch parse crawl/segments/[segment]
+        bin/nutch invertlinks crawl/linkdb -dir crawl/segments
+        ...index your document, for example with SOLR...
+        bin/nutch solrindex http://localhost:8983/solr crawl/crawldb/ -linkdb crawl/linkdb/ crawl/segments/[segment] -filter -normalize
+
+    Inspect hadoop.log for info about pattern parsing and compilation..
+        grep replace logs/hadoop.log
+
+    To inspect your index with the solr admin panel...
+        http://localhost:8983/solr/#/

Added: nutch/trunk/src/plugin/index-replace/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-replace/build.xml?rev=1691298&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-replace/build.xml (added)
+++ nutch/trunk/src/plugin/index-replace/build.xml Thu Jul 16 00:59:41 2015
@@ -0,0 +1,55 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-replace" default="jar-core">
+
+	<import file="../build-plugin.xml" />
+
+	<!-- Add compilation dependencies to classpath -->
+	<path id="plugin.deps">
+		<fileset dir="${nutch.root}/build">
+			<include name="**/index-basic/*.jar" />
+			<include name="**/index-metadata/*.jar" />
+		</fileset>
+		<pathelement location="${nutch.root}/build/lib-regex-filter/test"/>
+	</path>
+
+	<!-- Compile Unit test dependencies -->
+	<target name="deps-test-compile">
+		<ant target="compile-test" inheritall="false" dir="../index-basic"/>
+		<ant target="compile-test" inheritall="false" dir="../index-metadata"/>
+	</target>
+
+	<!-- Deploy Unit test dependencies -->
+	<target name="deps-test">
+		<ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" />
+		<ant target="deploy" inheritall="false" dir="../protocol-file" />
+		<ant target="deploy" inheritall="false" dir="../parse-html" />
+		<ant target="deploy" inheritall="false" dir="../parse-metatags" />
+		<ant target="deploy" inheritall="false" dir="../index-basic" />
+		<ant target="deploy" inheritall="false" dir="../index-metadata" />
+	</target>
+
+	<!-- Copy test file for junit test -->
+	<mkdir dir="${build.test}/data" />
+	<copy todir="${build.test}/data">
+		<fileset dir="sample">
+			<include name="*.html" />
+		</fileset>
+	</copy>
+
+</project>

Added: nutch/trunk/src/plugin/index-replace/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-replace/ivy.xml?rev=1691298&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-replace/ivy.xml (added)
+++ nutch/trunk/src/plugin/index-replace/ivy.xml Thu Jul 16 00:59:41 2015
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

Added: nutch/trunk/src/plugin/index-replace/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-replace/plugin.xml?rev=1691298&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-replace/plugin.xml (added)
+++ nutch/trunk/src/plugin/index-replace/plugin.xml Thu Jul 16 00:59:41 2015
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="index-replace"
+   name="Replace Indexer"
+   version="1.0"
+   provider-name="PeterCiuffetti">
+
+   <runtime>
+      <library name="index-replace.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <extension id="org.apache.nutch.indexer.replace"
+              name="Replace Indexer"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="ReplaceIndexer"
+                      class="org.apache.nutch.indexer.replace.ReplaceIndexer"/>
+   </extension>
+
+</plugin>
+

Added: nutch/trunk/src/plugin/index-replace/sample/testIndexReplace.html
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-replace/sample/testIndexReplace.html?rev=1691298&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-replace/sample/testIndexReplace.html (added)
+++ nutch/trunk/src/plugin/index-replace/sample/testIndexReplace.html Thu Jul 16 00:59:41 2015
@@ -0,0 +1,12 @@
+<html>
+  <head>
+    <title>Testing the power of the index-replace plugin</title>
+    <meta name="description" content="With this plugin, I control the description! Bwuhuhuhaha!">
+    <meta name="keywords" content="Breathtaking, Riveting, Two Thumbs Up!">
+    <meta name="author" content="Peter Ciuffetti">
+  </head>
+  <body>
+    <p>This html file is used to test the Nutch index-replace regexp replacer plugin.
+    A decidedly boring thing to do.</p>
+  </body>
+</html>
\ No newline at end of file

Added: nutch/trunk/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/FieldReplacer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/FieldReplacer.java?rev=1691298&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/FieldReplacer.java (added)
+++ nutch/trunk/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/FieldReplacer.java Thu Jul 16 00:59:41 2015
@@ -0,0 +1,196 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.replace;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+/**
+ * POJO to store a filename, its match pattern and its replacement string.
+ *
+ * A checkAndReplace method is provided where you can simultaneously check if
+ * the field matches this replacer and if the pattern matches your field value.
+ *
+ * @author Peter Ciuffetti
+ */
+public class FieldReplacer {
+
+  private static final Log LOG = LogFactory.getLog(FieldReplacer.class
+      .getName());
+
+  private final String fieldName;
+  private final String toFieldName;
+  private final Pattern pattern;
+  private final String replacement;
+  private boolean isValid;
+
+  /**
+   * Create a FieldReplacer for a field.
+   *
+   * Any pattern exceptions are caught within this constructor and the object is
+   * marked inValid. The error will be logged. This prevents this caller from
+   * attempting invalid replacements.
+   *
+   * @param fieldName
+   *          the name of the source field to operate on. Required.
+   * @param toFieldName
+   *          the name of the target field. Required.
+   * @param pattern
+   *          the pattern the field must match. Required.
+   * @param replacement
+   *          the replacement string
+   * @param flags
+   *          the Pattern flags value, or null if no flags are needed
+   */
+  public FieldReplacer(String fieldName, String toFieldName, String pattern,
+      String replacement, Integer flags) {
+
+    this.isValid = true;
+    // Must have a non-empty field name and pattern.
+    if (fieldName == null || fieldName.trim().length() == 0) {
+      LOG.error("Empty fieldName provided, FieldReplacer marked invalid.");
+      this.isValid = false;
+    }
+    if (pattern == null || pattern.trim().length() == 0) {
+      LOG.error("Empty pattern for field " + fieldName
+          + "provided, FieldReplacer marked invalid.");
+      this.isValid = false;
+    }
+
+    if (replacement == null) {
+      this.replacement = "";
+    } else {
+      this.replacement = replacement;
+    }
+
+    this.fieldName = fieldName.trim();
+    this.toFieldName = toFieldName.trim();
+
+    if (this.isValid) {
+      LOG.info("Compiling pattern " + pattern + " for field " + fieldName);
+      Pattern myPattern = null;
+      try {
+        if (flags != null) {
+          myPattern = Pattern.compile(pattern, flags);
+        } else {
+          myPattern = Pattern.compile(pattern);
+        }
+      } catch (PatternSyntaxException e) {
+        LOG.error("Pattern " + pattern + " for field " + fieldName
+            + " failed to compile: " + e.toString());
+        this.isValid = false;
+      }
+      this.pattern = myPattern;
+    } else {
+      this.pattern = null;
+    }
+  }
+
+  /**
+   * Field replacer with the input and output field the same.
+   *
+   * @param fieldName
+   * @param pattern
+   * @param replacement
+   * @param flags
+   */
+  public FieldReplacer(String fieldName, String pattern, String replacement,
+      Integer flags) {
+    this(fieldName, fieldName, pattern, replacement, flags);
+  }
+
+  public String getFieldName() {
+    return this.fieldName;
+  }
+
+  public String getToFieldName() {
+    return this.toFieldName;
+  }
+
+  public Pattern getPattern() {
+    return this.pattern;
+  }
+
+  public String getReplacement() {
+    return this.replacement;
+  }
+
+  /**
+   * Does this FieldReplacer have a valid fieldname and pattern?
+   *
+   * @return
+   */
+  public boolean isValid() {
+    return this.isValid;
+  }
+
+  /**
+   * Return the replacement value for a field value.
+   *
+   * This does not check for a matching field; the caller must decide if this
+   * FieldReplacer should operate on this value by checking getFieldName().
+   *
+   * The method returns the value with the replacement. If the value returned is
+   * not different then eiher the pattern didn't match or the replacement was a
+   * no-op.
+   *
+   * @param value
+   * @return
+   */
+  public String replace(String value) {
+    if (this.isValid) {
+      return this.pattern.matcher(value).replaceAll(replacement);
+    } else {
+      return value;
+    }
+  }
+
+  /**
+   * Return a replacement value for a field.
+   *
+   * This is designed to fail fast and trigger a replacement only when
+   * necessary. If this method returns null, either the field does not match or
+   * the value does not match the pattern (or possibly the pattern is invalid).
+   *
+   * So only if the method returns a non-null value will you need to replace the
+   * value for the field.
+   *
+   * @param fieldName
+   *          the name of the field you are checking
+   * @param value
+   *          the value of the field you are checking
+   * @return a replacement value. If null, either the field does not match or
+   *         the value does not match.
+   */
+  public String checkAndReplace(String fieldName, String value) {
+    if (this.fieldName.equals(fieldName)) {
+      if (value != null && value.length() > 0) {
+        if (this.isValid) {
+          Matcher m = this.pattern.matcher(value);
+          if (m.find()) {
+            return m.replaceAll(this.replacement);
+          }
+        }
+      }
+    }
+    return null;
+  }
+}

Added: nutch/trunk/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java?rev=1691298&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java (added)
+++ nutch/trunk/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java Thu Jul 16 00:59:41 2015
@@ -0,0 +1,330 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.replace;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
+import org.apache.nutch.parse.Parse;
+
+/**
+ * Do pattern replacements on selected field contents prior to indexing.
+ * 
+ * To use this plugin, add <code>index-replace</code> to your
+ * <code>plugin.includes</code>. Example:
+ * 
+ * <pre>
+ *   &lt;property>
+ *    &lt;name>plugin.includes&lt;/name>
+ *    &lt;value>protocol-(http)|urlfilter-regex|parse-(html|tika|metatags)|index-(basic|anchor|metadata|replace)|urlnormalizer-(pass|regex|basic)|indexer-solr&lt;/value>
+ *   &lt;/property>
+ * </pre>
+ *
+ * And then add the <code>index.replace.regexp</code> property to
+ * <code>conf/nutch-site.xml</code>. This contains a list of replacement
+ * instructions per field name, one per line. eg.
+ * 
+ * <pre>
+ *   fieldname=/regexp/replacement/[flags]
+ * </pre>
+ * 
+ * <pre>
+ *   &lt;property>
+ *    &lt;name>index.replace.regexp&lt;/name>
+ *    &lt;value>
+ *      hostmatch=.*\\.com
+ *      title=/search/replace/2
+ *    &lt;/value>
+ *   &lt;/property>
+ * </pre>
+ * 
+ * <code>hostmatch=</code> and <code>urlmatch=</code> lines indicate the match
+ * pattern for a host or url. The field replacements that follow this line will
+ * apply only to pages from the matching host or url. Replacements run in the
+ * order specified. Field names may appear multiple times if multiple
+ * replacements are needed.
+ * 
+ * The property format is defined in greater detail in
+ * <code>conf/nutch-default.xml</code>.
+ *
+ * @author Peter Ciuffetti
+ * @see <a
+ *      href="https://issues.apache.org/jira/browse/NUTCH-2058">NUTCH-2058</a>
+ */
+public class ReplaceIndexer implements IndexingFilter {
+
+  private static final Log LOG = LogFactory.getLog(ReplaceIndexer.class
+      .getName());
+
+  /** Special field name signifying the start of a host-specific match set */
+  private static final String HOSTMATCH = "hostmatch";
+  /** Special field name signifying the start of a url-specific match set */
+  private static final String URLMATCH = "urlmatch";
+
+  private static Map<Pattern, List<FieldReplacer>> FIELDREPLACERS_BY_HOST = new LinkedHashMap<Pattern, List<FieldReplacer>>();
+  private static Map<Pattern, List<FieldReplacer>> FIELDREPLACERS_BY_URL = new LinkedHashMap<Pattern, List<FieldReplacer>>();
+
+  private static Pattern LINE_SPLIT = Pattern.compile("(^.+$)+",
+      Pattern.MULTILINE);
+  private static Pattern NAME_VALUE_SPLIT = Pattern.compile("(.*?)=(.*)");
+
+  private Configuration conf;
+
+  /**
+   * {@inheritDoc}
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    FIELDREPLACERS_BY_HOST.clear();
+    FIELDREPLACERS_BY_URL.clear();
+    String value = conf.get("index.replace.regexp", null);
+    if (value != null) {
+      LOG.debug("Parsing index.replace.regexp property");
+      this.parseConf(value);
+    }
+  }
+
+  /**
+   * {@inheritDoc}
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Parse the property value into a set of maps that store a list of
+   * replacements by field for each host and url configured into the property.
+   * 
+   * @param propertyValue
+   */
+  private void parseConf(String propertyValue) {
+    if (propertyValue == null || propertyValue.trim().length() == 0) {
+      return;
+    }
+
+    // At the start, all replacements apply globally to every host.
+    Pattern hostPattern = Pattern.compile(".*");
+    Pattern urlPattern = null;
+
+    // Split the property into lines
+    Matcher lineMatcher = LINE_SPLIT.matcher(propertyValue);
+    while (lineMatcher.find()) {
+      String line = lineMatcher.group();
+      if (line != null && line.length() > 0) {
+
+        // Split the line into field and value
+        Matcher nameValueMatcher = NAME_VALUE_SPLIT.matcher(line.trim());
+        if (nameValueMatcher.find()) {
+          String fieldName = nameValueMatcher.group(1).trim();
+          String value = nameValueMatcher.group(2);
+          if (fieldName != null && value != null) {
+            // Check if the field name is one of our special cases.
+            if (HOSTMATCH.equals(fieldName)) {
+              urlPattern = null;
+              try {
+                hostPattern = Pattern.compile(value);
+              } catch (PatternSyntaxException pse) {
+                LOG.error("hostmatch pattern " + value + " does not compile: "
+                    + pse.getMessage());
+                // Deactivate this invalid match set by making it match no host.
+                hostPattern = Pattern.compile("willnotmatchanyhost");
+              }
+            } else if (URLMATCH.equals(fieldName)) {
+              try {
+                urlPattern = Pattern.compile(value);
+              } catch (PatternSyntaxException pse) {
+                LOG.error("urlmatch pattern " + value + " does not compile: "
+                    + pse.getMessage());
+                // Deactivate this invalid match set by making it match no url.
+                urlPattern = Pattern.compile("willnotmatchanyurl");
+              }
+            } else if (value.length() > 3) {
+              String toFieldName = fieldName;
+              // If the fieldname has a colon, this indicates a different target
+              // field.
+              if (fieldName.indexOf(':') > 0) {
+                toFieldName = fieldName.substring(fieldName.indexOf(':') + 1);
+                fieldName = fieldName.substring(0, fieldName.indexOf(':'));
+              }
+              String sep = value.substring(0, 1);
+
+              // Divide the value into pattern / replacement / flags.
+              value = value.substring(1);
+              if (!value.contains(sep)) {
+                LOG.error("Pattern '" + line
+                    + "', not parseable.  Missing separator " + sep);
+                continue;
+              }
+              String pattern = value.substring(0, value.indexOf(sep));
+              value = value.substring(pattern.length() + 1);
+              String replacement = value;
+              if (value.contains(sep)) {
+                replacement = value.substring(0, value.indexOf(sep));
+              }
+              int flags = 0;
+              if (value.length() > replacement.length() + 1) {
+                value = value.substring(replacement.length() + 1).trim();
+                try {
+                  flags = Integer.parseInt(value);
+                } catch (NumberFormatException e) {
+                  LOG.error("Pattern " + line + ", has invalid flags component");
+                  continue;
+                }
+              }
+              Integer iFlags = (flags > 0) ? new Integer(flags) : null;
+
+              // Make a FieldReplacer out of these params.
+              FieldReplacer fr = new FieldReplacer(fieldName, toFieldName,
+                  pattern, replacement, iFlags);
+
+              // Add this field replacer to the list for this host or URL.
+              if (urlPattern != null) {
+                List<FieldReplacer> lfp = FIELDREPLACERS_BY_URL.get(urlPattern);
+                if (lfp == null) {
+                  lfp = new ArrayList<FieldReplacer>();
+                }
+                lfp.add(fr);
+                FIELDREPLACERS_BY_URL.put(urlPattern, lfp);
+              } else {
+                List<FieldReplacer> lfp = FIELDREPLACERS_BY_HOST
+                    .get(hostPattern);
+                if (lfp == null) {
+                  lfp = new ArrayList<FieldReplacer>();
+                }
+                lfp.add(fr);
+                FIELDREPLACERS_BY_HOST.put(hostPattern, lfp);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * {@inheritDoc}
+   */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    if (doc != null) {
+      if (FIELDREPLACERS_BY_HOST.size() > 0) {
+        this.doReplace(doc, "host", FIELDREPLACERS_BY_HOST);
+      }
+
+      if (FIELDREPLACERS_BY_URL.size() > 0) {
+        this.doReplace(doc, "url", FIELDREPLACERS_BY_URL);
+      }
+    }
+
+    return doc;
+  }
+
+  /**
+   * Iterates through the replacement map provided, to update the fields in the
+   * Nutch Document.
+   * 
+   * @param doc
+   *          the document we are modifying
+   * @param keyName
+   *          either "host" or "url" -- the field that determines the
+   *          replacement set used
+   * @param replaceMap
+   *          the list of FieldReplacers that applies to this keyName.
+   */
+  private void doReplace(NutchDocument doc, String keyName,
+      Map<Pattern, List<FieldReplacer>> replaceMap) {
+
+    if (doc == null || replaceMap.size() == 0) {
+      return;
+    }
+
+    Collection<String> docFieldNames = doc.getFieldNames();
+    NutchField keyField = doc.getField(keyName);
+    if (keyField == null) {
+      // This document doesn't have the key field; no work to do.
+      return;
+    }
+
+    List<Object> keyFieldValues = keyField.getValues();
+    if (keyFieldValues.size() == 0) {
+      // This document doesn't have any values for the key field; no work to do.
+      return;
+    }
+
+    // For every value of the keyField (one expected)
+    for (Object oKeyFieldValue : keyFieldValues) {
+      if (oKeyFieldValue != null && oKeyFieldValue instanceof java.lang.String) {
+        String keyFieldValue = (String) oKeyFieldValue;
+
+        // For each pattern that we have a replacement list for...
+        for (Map.Entry<Pattern, List<FieldReplacer>> entries : replaceMap
+            .entrySet()) {
+          // If this key is a match for a replacement set...
+          if (entries.getKey().matcher(keyFieldValue).find()) {
+
+            // For each field we will replace for this key...
+            for (FieldReplacer fp : entries.getValue()) {
+              String fieldName = fp.getFieldName();
+
+              // Does this document contain the FieldReplacer's field?
+              if (docFieldNames.contains(fieldName)) {
+                NutchField docField = doc.getField(fieldName);
+                List<Object> fieldValues = docField.getValues();
+                ArrayList<String> newFieldValues = new ArrayList<String>();
+
+                // For each value of the field, match against our
+                // replacer...
+                for (Object oFieldValue : fieldValues) {
+                  if (oFieldValue != null
+                      && oFieldValue instanceof java.lang.String) {
+                    String fieldValue = (String) oFieldValue;
+                    String newValue = fp.replace(fieldValue);
+                    newFieldValues.add(newValue);
+                  }
+                }
+
+                // Remove the target field and add our replaced values.
+                String targetFieldName = fp.getToFieldName();
+                doc.removeField(targetFieldName);
+                for (String newFieldValue : newFieldValues) {
+                  doc.add(targetFieldName, newFieldValue);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}

Added: nutch/trunk/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/package-info.java?rev=1691298&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/package-info.java (added)
+++ nutch/trunk/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/package-info.java Thu Jul 16 00:59:41 2015
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Indexing filter to allow pattern replacements on metadata.
+ */
+package org.apache.nutch.indexer.replace;
+

Added: nutch/trunk/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java?rev=1691298&view=auto
==============================================================================
--- nutch/trunk/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java (added)
+++ nutch/trunk/src/plugin/index-replace/src/test/org/apache/nutch/indexer/replace/TestIndexReplace.java Thu Jul 16 00:59:41 2015
@@ -0,0 +1,456 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.replace;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.basic.BasicIndexingFilter;
+import org.apache.nutch.indexer.metadata.MetadataIndexer;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * JUnit tests for the <code>index-replace</code> plugin.
+ *
+ * In these tests, the sample file has some meta tags added to the Nutch
+ * document by the <code>index-metadata</code> plugin. The
+ * <code>index-replace</code> plugin is then used to either change (or not
+ * change) the fields depending on the various values of
+ * <code>index.replace.regexp</code> property being provided to Nutch.
+ *
+ *
+ * @author Peter Ciuffetti
+ *
+ */
+public class TestIndexReplace {
+
+  private static final String INDEX_REPLACE_PROPERTY = "index.replace.regexp";
+
+  private String fileSeparator = System.getProperty("file.separator");
+  private String sampleDir = System.getProperty("test.data", ".");
+  private String sampleFile = "testIndexReplace.html";
+
+  /**
+   * Build the configuration shared by the tests that run the sample file
+   * through the parser and the index filters.
+   *
+   * @param indexReplaceProperty
+   *          the value to use for <code>index.replace.regexp</code>
+   * @return a Nutch configuration with the plugins and metatag settings the
+   *         tests rely on
+   */
+  private Configuration createConf(String indexReplaceProperty) {
+    Configuration conf = NutchConfiguration.create();
+    conf.set(
+        "plugin.includes",
+        "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
+    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+    conf.set("metatags.names", "author,description,keywords");
+    conf.set("index.parse.md",
+        "metatag.author,metatag.description,metatag.keywords");
+    // Not necessary but helpful when debugging the filter.
+    conf.set("http.timeout", "99999999999");
+    return conf;
+  }
+
+  /**
+   * Assert the values of the three metatag fields used throughout the tests.
+   */
+  private void assertMetatags(NutchDocument doc, String expectedDescription,
+      String expectedKeywords, String expectedAuthor) {
+    Assert.assertEquals(expectedDescription,
+        doc.getFieldValue("metatag.description"));
+    Assert
+        .assertEquals(expectedKeywords, doc.getFieldValue("metatag.keywords"));
+    Assert.assertEquals(expectedAuthor, doc.getFieldValue("metatag.author"));
+  }
+
+  /**
+   * Run a test file through the Nutch parser and index filters.
+   *
+   * The basic, metadata and replace index filters are applied in that order.
+   * Any exception raised along the way fails the test.
+   *
+   * @param fileName
+   *          name of the sample file, relative to the test data directory
+   * @param conf
+   *          the configuration handed to every filter
+   * @return the Nutch document with the replace indexer applied
+   */
+  public NutchDocument parseAndFilterFile(String fileName, Configuration conf) {
+    NutchDocument doc = new NutchDocument();
+
+    BasicIndexingFilter basicIndexer = new BasicIndexingFilter();
+    basicIndexer.setConf(conf);
+    Assert.assertNotNull(basicIndexer);
+
+    MetadataIndexer metaIndexer = new MetadataIndexer();
+    metaIndexer.setConf(conf);
+    Assert.assertNotNull(metaIndexer);
+
+    ReplaceIndexer replaceIndexer = new ReplaceIndexer();
+    replaceIndexer.setConf(conf);
+    Assert.assertNotNull(replaceIndexer);
+
+    try {
+      String urlString = "file:" + sampleDir + fileSeparator + fileName;
+      Text text = new Text(urlString);
+      CrawlDatum crawlDatum = new CrawlDatum();
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(text, crawlDatum)
+          .getContent();
+      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+      crawlDatum.setFetchTime(100L);
+
+      Inlinks inlinks = new Inlinks();
+      doc = basicIndexer.filter(doc, parse, text, crawlDatum, inlinks);
+      doc = metaIndexer.filter(doc, parse, text, crawlDatum, inlinks);
+      doc = replaceIndexer.filter(doc, parse, text, crawlDatum, inlinks);
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.toString());
+    }
+
+    return doc;
+  }
+
+  /**
+   * Test property parsing.
+   *
+   * The filter does not expose details of the parse. So all we are checking is
+   * that the parse does not throw a runtime exception and that the value
+   * provided is the value returned.
+   */
+  @Test
+  public void testPropertyParse() {
+    Configuration conf = NutchConfiguration.create();
+    String indexReplaceProperty = "  metatag.description=/this(.*)plugin/this awesome plugin/2\n"
+        + "  metatag.keywords=/\\,/\\!/\n"
+        + "  hostmatch=.*.com\n"
+        + "  metatag.keywords=/\\,/\\?/\n"
+        + "  metatag.author:dc_author=/\\s+/ David /\n"
+        + "  urlmatch=.*.html\n"
+        + "  metatag.keywords=/\\,/\\./\n" + "  metatag.author=/\\s+/ D. /\n";
+
+    conf.set(INDEX_REPLACE_PROPERTY, indexReplaceProperty);
+
+    ReplaceIndexer rp = new ReplaceIndexer();
+    try {
+      rp.setConf(conf);
+    } catch (RuntimeException ohno) {
+      Assert.fail("Unable to parse a valid index.replace.regexp property! "
+          + ohno.getMessage());
+    }
+
+    Configuration parsedConf = rp.getConf();
+
+    // Does the getter equal the setter? Too easy!
+    Assert.assertEquals(indexReplaceProperty,
+        parsedConf.get(INDEX_REPLACE_PROPERTY));
+  }
+
+  /**
+   * Test metatag value replacement using global replacement settings.
+   *
+   * The index.replace.regexp property does not use hostmatch or urlmatch, so
+   * all patterns are global.
+   */
+  @Test
+  public void testGlobalReplacement() {
+    String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
+    String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!";
+    String expectedAuthor = "Peter D. Ciuffetti";
+    String indexReplaceProperty = "  metatag.description=/this(.*)plugin/this awesome plugin/\n"
+        + "  metatag.keywords=/\\,/\\!/\n" + "  metatag.author=/\\s+/ D. /\n";
+
+    // Run the document through the parser and index filters.
+    NutchDocument doc = parseAndFilterFile(sampleFile,
+        createConf(indexReplaceProperty));
+
+    assertMetatags(doc, expectedDescription, expectedKeywords, expectedAuthor);
+  }
+
+  /**
+   * Test that invalid property settings are handled and ignored.
+   *
+   * This test provides an invalid property setting that will fail property
+   * parsing and Pattern.compile. The expected outcome is that the patterns will
+   * not cause failure and the targeted fields will not be modified by the
+   * filter.
+   */
+  @Test
+  public void testInvalidPatterns() {
+    String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!";
+    String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!";
+    String expectedAuthor = "Peter Ciuffetti";
+    // Contains: invalid pattern, invalid flags, incomplete property
+    String indexReplaceProperty = "  metatag.description=/this\\hplugin/this awesome plugin/\n"
+        + "  metatag.keywords=/\\,/\\!/what\n" + " metatag.author=#notcomplete";
+
+    // Run the document through the parser and index filters.
+    NutchDocument doc = parseAndFilterFile(sampleFile,
+        createConf(indexReplaceProperty));
+
+    // Assert that our metatags have not changed.
+    assertMetatags(doc, expectedDescription, expectedKeywords, expectedAuthor);
+  }
+
+  /**
+   * Test URL pattern matching
+   */
+  @Test
+  public void testUrlMatchesPattern() {
+    String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
+    String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!";
+    String expectedAuthor = "Peter D. Ciuffetti";
+    String indexReplaceProperty = " urlmatch=.*.html\n"
+        + "  metatag.description=/this(.*)plugin/this awesome plugin/\n"
+        + "  metatag.keywords=/\\,/\\!/\n" + "  metatag.author=/\\s+/ D. /\n";
+
+    // Run the document through the parser and index filters.
+    NutchDocument doc = parseAndFilterFile(sampleFile,
+        createConf(indexReplaceProperty));
+
+    // Assert that our metatags have changed.
+    assertMetatags(doc, expectedDescription, expectedKeywords, expectedAuthor);
+  }
+
+  /**
+   * Test URL pattern not matching.
+   *
+   * Expected result is that the filter does not change the fields.
+   */
+  @Test
+  public void testUrlNotMatchesPattern() {
+    String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!";
+    String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!";
+    String expectedAuthor = "Peter Ciuffetti";
+    String indexReplaceProperty = " urlmatch=.*.xml\n"
+        + "  metatag.description=/this(.*)plugin/this awesome plugin/\n"
+        + "  metatag.keywords=/\\,/\\!/\n" + "  metatag.author=/\\s+/ D. /\n";
+
+    // Run the document through the parser and index filters.
+    NutchDocument doc = parseAndFilterFile(sampleFile,
+        createConf(indexReplaceProperty));
+
+    // Assert that our metatags have not changed.
+    assertMetatags(doc, expectedDescription, expectedKeywords, expectedAuthor);
+  }
+
+  /**
+   * Test a global pattern match for description and URL pattern match for
+   * keywords and author.
+   *
+   * All three should be triggered. It also tests replacement groups.
+   */
+  @Test
+  public void testGlobalAndUrlMatchesPattern() {
+    String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
+    String expectedKeywords = "Breathtaking! Riveting! Two Thumbs Up!";
+    String expectedAuthor = "Peter D. Ciuffetti";
+    String indexReplaceProperty = "  metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n"
+        + "  urlmatch=.*.html\n"
+        + "  metatag.keywords=/\\,/\\!/\n"
+        + "  metatag.author=/\\s+/ D. /\n";
+
+    // Run the document through the parser and index filters.
+    NutchDocument doc = parseAndFilterFile(sampleFile,
+        createConf(indexReplaceProperty));
+
+    // Assert that our metatags have changed.
+    assertMetatags(doc, expectedDescription, expectedKeywords, expectedAuthor);
+  }
+
+  /**
+   * Test a global pattern match for description and URL pattern match for
+   * keywords and author.
+   *
+   * Only the global match should be triggered.
+   */
+  @Test
+  public void testGlobalAndUrlNotMatchesPattern() {
+    String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
+    String expectedKeywords = "Breathtaking, Riveting, Two Thumbs Up!";
+    String expectedAuthor = "Peter Ciuffetti";
+    String indexReplaceProperty = "  metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n"
+        + "  urlmatch=.*.xml\n"
+        + "  metatag.keywords=/\\,/\\!/\n"
+        + "  metatag.author=/\\s+/ D. /\n";
+
+    // Run the document through the parser and index filters.
+    NutchDocument doc = parseAndFilterFile(sampleFile,
+        createConf(indexReplaceProperty));
+
+    // Assert that description has changed and the others have not changed.
+    assertMetatags(doc, expectedDescription, expectedKeywords, expectedAuthor);
+  }
+
+  /**
+   * Test order-specific replacement settings.
+   *
+   * This makes multiple replacements on the same field and will produce the
+   * expected value only if the replacements are run in the order specified.
+   */
+  @Test
+  public void testReplacementsRunInSpecifedOrder() {
+    String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
+    String indexReplaceProperty = "  metatag.description=/this plugin/this amazing plugin/\n"
+        + "  metatag.description=/this amazing plugin/this valuable plugin/\n"
+        + "  metatag.description=/this valuable plugin/this cool plugin/\n"
+        + "  metatag.description=/this cool plugin/this wicked plugin/\n"
+        + "  metatag.description=/this wicked plugin/this awesome plugin/\n";
+
+    // Run the document through the parser and index filters.
+    NutchDocument doc = parseAndFilterFile(sampleFile,
+        createConf(indexReplaceProperty));
+
+    // Check that the value produced by the last replacement has worked.
+    Assert.assertEquals(expectedDescription,
+        doc.getFieldValue("metatag.description"));
+  }
+
+  /**
+   * Test a replacement pattern that uses the flags feature.
+   *
+   * A 2 is Pattern.CASE_INSENSITIVE. We look for upper case and expect to match
+   * any case.
+   */
+  @Test
+  public void testReplacementsWithFlags() {
+    String expectedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
+    String indexReplaceProperty = "  metatag.description=/THIS PLUGIN/this awesome plugin/2";
+
+    // Run the document through the parser and index filters.
+    NutchDocument doc = parseAndFilterFile(sampleFile,
+        createConf(indexReplaceProperty));
+
+    // Check that the value produced by the case-insensitive replacement has
+    // worked.
+    Assert.assertEquals(expectedDescription,
+        doc.getFieldValue("metatag.description"));
+  }
+
+  /**
+   * Test a replacement pattern that uses the target field feature.
+   * Check that the input is not modified and that the target field is added.
+   */
+  @Test
+  public void testReplacementsDifferentTarget() {
+    String expectedDescription = "With this plugin, I control the description! Bwuhuhuhaha!";
+    String expectedTargetDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
+    String indexReplaceProperty = "  metatag.description:new=/this plugin/this awesome plugin/";
+
+    // Run the document through the parser and index filters.
+    NutchDocument doc = parseAndFilterFile(sampleFile,
+        createConf(indexReplaceProperty));
+
+    // Check that the input field has not been modified
+    Assert.assertEquals(expectedDescription,
+        doc.getFieldValue("metatag.description"));
+    // Check that the output field has been created
+    Assert.assertEquals(expectedTargetDescription,
+        doc.getFieldValue("new"));
+  }
+}

Added: nutch/trunk/src/plugin/parse-replace/README.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-replace/README.txt?rev=1691298&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-replace/README.txt (added)
+++ nutch/trunk/src/plugin/parse-replace/README.txt Thu Jul 16 00:59:41 2015
@@ -0,0 +1,91 @@
+ParseReplace plugin
+
+Allows post-parsing regexp replace manipulation of metadata fields.
+
+Configuration Example
+    <property>
+      <name>parse.replace.regexp</name>
+      <value>
+        id=/file:/http:/
+        url=/file:/http:/128
+      </value>
+    </property>
+
+Property format: parse.replace.regexp
+    The format of the property is a list of regexp replacements, one line per field being
+    modified.  Field names would be one of those from https://wiki.apache.org/nutch/IndexStructure.
+
+    The fieldname precedes the equal sign.  The first character after the equal sign signifies
+    the delimiter for the regexp, the replacement value and the flags.
+
+Replacement Sequence
+    The replacements will happen in the order listed. If a field needs multiple replacement operations
+    they may be listed more than once.
+
+RegExp Format
+    The regexp and the optional flags should correspond to Pattern.compile(String regexp, int flags) defined
+    here: http://docs.oracle.com/javase/7/docs/api/java/util/regex/Pattern.html#compile%28java.lang.String,%20int%29
+    Patterns are compiled when the plugin is initialized for efficiency.
+
+Replacement Format
+    The replacement value should correspond to Java's Pattern.matcher(CharSequence input).replaceAll(String replacement):
+    http://docs.oracle.com/javase/7/docs/api/java/util/regex/Matcher.html#replaceAll%28java.lang.String%29
+
+Flags
+    The flags value is an integer sum of the flag values defined in
+    http://docs.oracle.com/javase/7/docs/api/constant-values.html (Sec: java.util.regex.Pattern)
+
+Escaping
+    Since the regexp is being read from a config file, any escaped values must be double
+    escaped.  Eg:  id=/\\s+//  will cause the escaped \s+ match pattern to be used.
+
+Multi-valued Fields
+    If a field has multiple values, the replacement will be applied to each value in turn.
+
+Non-string Datatypes
+    Replacement is possible only on String field datatypes.  If the field you name in the property is
+    not a String datatype, it will be silently ignored.
+
+Host and URL specific replacements.
+    If the replacements should apply only to specific pages, then add a sequence like
+
+    hostmatch=/host match pattern/
+    fld1=/regexp/replace/flags
+    fld2=/regexp/replace/flags
+
+    or
+    urlmatch=/url match pattern/
+    fld1=/regexp/replace/flags
+    fld2=/regexp/replace/flags
+
+When using Host and URL replacements, all replacements preceding the first hostmatch or urlmatch
+will apply to all parsed pages.  Replacements following a hostmatch or urlmatch will be applied
+to pages which match the host or url field (up to the next hostmatch or urlmatch line).  hostmatch
+and urlmatch patterns must be unique in this property.
+
+Plugin order
+    TBD... But in most cases you will want this plugin to run last.
+
+Testing your match patterns
+    Online Regexp testers like http://www.regexplanet.com/advanced/java/index.html
+    can help get the basics of your pattern working.
+    To test in nutch: 
+        Prepare a test HTML file with the field contents you want to test. 
+        Place this in a directory accessible to nutch.
+        Use the file:/// syntax to list the test file(s) in a test/urls seed list.
+        See the nutch faq "index my local file system" for conf settings you will need.
+        (Note the urlmatch and hostmatch patterns may not conform to your test file host and url; This
+        test approach confirms only how your global matches behave, unless your urlmatch and hostmatch
+        patterns also match the file: URL pattern)
+ 
+    Run..
+        bin/nutch inject crawl/crawldb test
+        bin/nutch generate crawl/crawldb crawl/segments
+        bin/nutch fetch crawl/segments/[segment]
+        bin/nutch parse crawl/segments/[segment]
+
+    To inspect the returned fields...
+        bin/nutch readseg -dump crawl/segments/[segment] testout
+        less testout/dump
+
+    To retry: delete crawl/segments/[segment]/crawl_parse and repeat the parse and dump step.
\ No newline at end of file

Added: nutch/trunk/src/plugin/parse-replace/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-replace/build.xml?rev=1691298&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-replace/build.xml (added)
+++ nutch/trunk/src/plugin/parse-replace/build.xml Thu Jul 16 00:59:41 2015
@@ -0,0 +1,37 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-replace" default="jar-core">
+
+	<import file="../build-plugin.xml" />
+
+	<!-- Deploy Unit test dependencies -->
+	<target name="deps-test">
+		<ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" />
+		<ant target="deploy" inheritall="false" dir="../protocol-file" />
+	</target>
+
+
+	<!-- for junit test -->
+	<mkdir dir="${build.test}/data" />
+	<copy todir="${build.test}/data">
+		<fileset dir="sample">
+			<include name="*.html" />
+		</fileset>
+	</copy>
+
+</project>

Added: nutch/trunk/src/plugin/parse-replace/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-replace/ivy.xml?rev=1691298&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-replace/ivy.xml (added)
+++ nutch/trunk/src/plugin/parse-replace/ivy.xml Thu Jul 16 00:59:41 2015
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

Added: nutch/trunk/src/plugin/parse-replace/plugin.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-replace/plugin.xml?rev=1691298&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-replace/plugin.xml (added)
+++ nutch/trunk/src/plugin/parse-replace/plugin.xml Thu Jul 16 00:59:41 2015
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="parse-replace"
+   name="ReplaceParser"
+   version="1.0"
+   provider-name="PeterCiuffetti">
+
+   <runtime>
+      <library name="parse-replace.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <extension id="org.apache.nutch.parse.replace.parser"
+              name="Replace Parser"
+              point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="ReplaceParser"
+                      class="org.apache.nutch.parse.replace.ReplaceParser"/>
+   </extension>
+
+</plugin>
+

Added: nutch/trunk/src/plugin/parse-replace/sample/testParseReplace.html
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-replace/sample/testParseReplace.html?rev=1691298&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-replace/sample/testParseReplace.html (added)
+++ nutch/trunk/src/plugin/parse-replace/sample/testParseReplace.html Thu Jul 16 00:59:41 2015
@@ -0,0 +1,11 @@
+<html>
+  <head>
+    <title>Testing the power of parser-replace plugin</title>
+    <meta name="description" content="With this plugin, nutch is my bitch! Bwuhuhuhaha!">
+    <meta name="keywords" content="Awesome, Riveting, Two Thumbs Up!">
+    <meta name="author" content="Peter Ciuffetti">
+  </head>
+  <body>
+    <p>This html file is used to test the Nutch parse-replace regexp replacer plugin. A decidely boring thing to do.</p>
+  </body>
+</html>
\ No newline at end of file

Added: nutch/trunk/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/ReplaceParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/ReplaceParser.java?rev=1691298&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/ReplaceParser.java (added)
+++ nutch/trunk/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/ReplaceParser.java Thu Jul 16 00:59:41 2015
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.replace;
+
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.protocol.Content;
+import org.w3c.dom.DocumentFragment;
+
+/**
+ * HTML parse filter intended to apply the regexp replacements configured in
+ * parse.replace.regexp. Still a stub: filter() returns its input unchanged.
+ */
+public class ReplaceParser implements HtmlParseFilter {
+  private static final Log LOG = LogFactory.getLog(ReplaceParser.class.getName());
+
+  // Presumably the host- and URL-scoped rule tables; nothing fills them yet.
+  private static final Map<String, List<Object>> REPLACEPATTERNS_BY_HOST = new HashMap<String, List<Object>>();
+  private static final Map<String, List<Object>> REPLACEPATTERNS_BY_URL = new HashMap<String, List<Object>>();
+
+  private Configuration conf;
+  private Set<String> metatagset = new HashSet<String>();
+
+  /** Stores conf and passes any parse.replace.regexp values to parseConf. */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    String[] values = conf.getStrings("parse.replace.regexp", null);
+    if (values != null) {
+      this.parseConf(values);
+    }
+  }
+
+  @Override
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /** Placeholder: the configured values are not interpreted yet. */
+  private void parseConf(String[] values) {
+  }
+
+  /** Returns parseResult untouched; no replacement logic exists yet. */
+  @Override
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+    return parseResult;
+  }
+}

Added: nutch/trunk/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/package-info.java?rev=1691298&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/package-info.java (added)
+++ nutch/trunk/src/plugin/parse-replace/src/java/org/apache/nutch/parse/replace/package-info.java Thu Jul 16 00:59:41 2015
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parse filter to allow pattern replacements on parsed metadata.
+ */
+package org.apache.nutch.parse.replace;
+

Added: nutch/trunk/src/plugin/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java?rev=1691298&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java (added)
+++ nutch/trunk/src/plugin/parse-replace/src/test/org/apache/nutch/parse/replace/TestParseReplace.java Thu Jul 16 00:59:41 2015
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.replace;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestParseReplace {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  private String sampleDir = System.getProperty("test.data", ".");
+  private String sampleFile = "testParseReplace.html";
+  // Expected values mirror the meta tags declared in testParseReplace.html.
+  private String description = "With this plugin, nutch is my bitch! Bwuhuhuhaha!";
+  private String keywords = "Awesome, Riveting, Two Thumbs Up!";
+
+  /** Fetches fileName from the sample dir via a file: URL and returns its parse metadata. */
+  public Metadata parseMeta(String fileName, Configuration conf) {
+    Metadata metadata = null;
+    try {
+      String urlString = "file:" + sampleDir + fileSeparator + fileName;
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+      metadata = parse.getData().getParseMeta();
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.toString());
+    }
+    return metadata;
+  }
+
+  /** Test defaults: keywords and description must come back as written in the sample. */
+  @Test
+  public void testIt() {
+    Configuration conf = NutchConfiguration.create();
+    // check that we get the same values
+    Metadata parseMeta = parseMeta(sampleFile, conf);
+    Assert.assertEquals(description, parseMeta.get("metatag.description"));
+    Assert.assertEquals(keywords, parseMeta.get("metatag.keywords"));
+  }
+}