You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:49:33 UTC

[49/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCIndexingFilter.java b/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCIndexingFilter.java
new file mode 100644
index 0000000..e7c55c4
--- /dev/null
+++ b/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCIndexingFilter.java
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.creativecommons.nutch;
+
+import org.apache.nutch.metadata.CreativeCommons;
+
+import org.apache.nutch.parse.Parse;
+
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.hadoop.io.Text;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.metadata.Metadata;
+
+import org.apache.hadoop.conf.Configuration;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.*;
+import java.net.URL;
+import java.net.MalformedURLException;
+
+/** Adds Creative Commons license and work-type features to a document. */
+public class CCIndexingFilter implements IndexingFilter {
+  public static final Logger LOG = LoggerFactory
+      .getLogger(CCIndexingFilter.class);
+
+  /** The name of the document field we use. */
+  public static String FIELD = "cc";
+
+  private Configuration conf;
+
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    Metadata metadata = parse.getData().getParseMeta();
+    // index the license
+    String licenseUrl = metadata.get(CreativeCommons.LICENSE_URL);
+    if (licenseUrl != null) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString());
+      }
+
+      // add the entire license as cc:license=xxx
+      addFeature(doc, "license=" + licenseUrl);
+
+      // index license attributes extracted from the license url
+      addUrlFeatures(doc, licenseUrl);
+    }
+
+    // index the license location as cc:meta=xxx
+    String licenseLocation = metadata.get(CreativeCommons.LICENSE_LOCATION);
+    if (licenseLocation != null) {
+      addFeature(doc, "meta=" + licenseLocation);
+    }
+
+    // index the work type as cc:xxx (bare type name, no "type=" prefix)
+    String workType = metadata.get(CreativeCommons.WORK_TYPE);
+    if (workType != null) {
+      addFeature(doc, workType);
+    }
+
+    return doc;
+  }
+
+  /**
+   * Add the features represented by a license URL. Urls are of the form
+   * "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a
+   * license feature.
+   */
+  public void addUrlFeatures(NutchDocument doc, String urlString) {
+    try {
+      URL url = new URL(urlString);
+
+      // tokenize the path of the url, breaking at slashes and dashes
+      StringTokenizer names = new StringTokenizer(url.getPath(), "/-");
+
+      if (names.hasMoreTokens())
+        names.nextToken(); // throw away "licenses"
+
+      // add a feature per component after "licenses"
+      while (names.hasMoreTokens()) {
+        String feature = names.nextToken();
+        addFeature(doc, feature);
+      }
+    } catch (MalformedURLException e) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("CC: failed to parse url: " + urlString + " : " + e);
+      }
+    }
+  }
+
+  private void addFeature(NutchDocument doc, String feature) {
+    doc.add(FIELD, feature);
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCParseFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCParseFilter.java b/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCParseFilter.java
new file mode 100644
index 0000000..1fa951e
--- /dev/null
+++ b/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/CCParseFilter.java
@@ -0,0 +1,300 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.creativecommons.nutch;
+
+import org.apache.nutch.metadata.CreativeCommons;
+import org.apache.nutch.parse.*;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.hadoop.conf.Configuration;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.*;
+import java.io.*;
+import java.net.*;
+import javax.xml.parsers.*;
+import org.xml.sax.InputSource;
+import org.w3c.dom.*;
+
+/** Adds metadata identifying the Creative Commons license used, if any. */
+public class CCParseFilter implements HtmlParseFilter {
+  public static final Logger LOG = LoggerFactory.getLogger(CCParseFilter.class);
+
+  /** Walks DOM tree, looking for RDF in comments and licenses in anchors. */
+  public static class Walker {
+    private URL base; // base url of page
+    private String rdfLicense; // subject url found, if any
+    private URL relLicense; // license url found, if any
+    private URL anchorLicense; // anchor url found, if any
+    private String workType; // work type URI
+
+    private Walker(URL base) {
+      this.base = base;
+    }
+
+    /** Scan the document adding attributes to metadata. */
+    public static void walk(Node doc, URL base, Metadata metadata,
+        Configuration conf) throws ParseException {
+
+      // walk the DOM tree, scanning for license data
+      Walker walker = new Walker(base);
+      walker.walk(doc);
+
+      // interpret results of walk
+      String licenseUrl = null;
+      String licenseLocation = null;
+      if (walker.rdfLicense != null) { // 1st choice: subject in RDF
+        licenseLocation = "rdf";
+        licenseUrl = walker.rdfLicense;
+      } else if (walker.relLicense != null) { // 2nd: anchor w/ rel=license
+        licenseLocation = "rel";
+        licenseUrl = walker.relLicense.toString();
+      } else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC license
+        licenseLocation = "a";
+        licenseUrl = walker.anchorLicense.toString();
+      } else if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) {
+        throw new ParseException("No CC license.  Excluding.");
+      }
+
+      // add license to metadata
+      if (licenseUrl != null) {
+        if (LOG.isInfoEnabled()) {
+          LOG.info("CC: found " + licenseUrl + " in " + licenseLocation
+              + " of " + base);
+        }
+        metadata.add(CreativeCommons.LICENSE_URL, licenseUrl);
+        metadata.add(CreativeCommons.LICENSE_LOCATION, licenseLocation);
+      }
+
+      if (walker.workType != null) {
+        if (LOG.isInfoEnabled()) {
+          LOG.info("CC: found " + walker.workType + " in " + base);
+        }
+        metadata.add(CreativeCommons.WORK_TYPE, walker.workType);
+      }
+
+    }
+
+    /** Scan the document looking for RDF in comments and license elements. */
+    private void walk(Node node) {
+
+      // check element nodes for license URL
+      if (node instanceof Element) {
+        findLicenseUrl((Element) node);
+      }
+
+      // check comment nodes for license RDF
+      if (node instanceof Comment) {
+        findRdf(((Comment) node).getData());
+      }
+
+      // recursively walk child nodes
+      NodeList children = node.getChildNodes();
+      for (int i = 0; children != null && i < children.getLength(); i++) {
+        walk(children.item(i));
+      }
+    }
+
+    /**
+     * Extract license url from element, if any. These are the href attributes
+     * of anchor elements that point to http://creativecommons.org/licenses/.
+     * Anchors with rel="license" are recorded separately from plain anchors.
+     */
+    private void findLicenseUrl(Element element) {
+      // only look in Anchor elements
+      if (!"a".equalsIgnoreCase(element.getTagName()))
+        return;
+
+      // require an href
+      String href = element.getAttribute("href");
+      if (href == null)
+        return;
+
+      try {
+        URL url = new URL(base, href); // resolve the url
+
+        // check that it's a CC license URL
+        if ("http".equalsIgnoreCase(url.getProtocol())
+            && "creativecommons.org".equalsIgnoreCase(url.getHost())
+            && url.getPath() != null && url.getPath().startsWith("/licenses/")
+            && url.getPath().length() > "/licenses/".length()) {
+
+          // check rel="license"
+          String rel = element.getAttribute("rel");
+          if (rel != null && "license".equals(rel) && this.relLicense == null) {
+            this.relLicense = url; // found rel license
+          } else if (this.anchorLicense == null) {
+            this.anchorLicense = url; // found anchor license
+          }
+        }
+      } catch (MalformedURLException e) { // ignore malformed urls
+      }
+    }
+
+    /** Configure a namespace aware XML parser. */
+    private static final DocumentBuilderFactory FACTORY = DocumentBuilderFactory
+        .newInstance();
+    static {
+      FACTORY.setNamespaceAware(true);
+    }
+
+    /** Creative Commons' namespace URI. */
+    private static final String CC_NS = "http://web.resource.org/cc/";
+
+    /** Dublin Core namespace URI. */
+    private static final String DC_NS = "http://purl.org/dc/elements/1.1/";
+
+    /** RDF syntax namespace URI. */
+    private static final String RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
+
+    private void findRdf(String comment) {
+      // first check for likely RDF in comment
+      int rdfPosition = comment.indexOf("RDF");
+      if (rdfPosition < 0)
+        return; // no RDF, abort
+      int nsPosition = comment.indexOf(CC_NS);
+      if (nsPosition < 0)
+        return; // no CC namespace, abort
+
+      // try to parse the XML
+      Document doc;
+      try {
+        DocumentBuilder parser = FACTORY.newDocumentBuilder();
+        doc = parser.parse(new InputSource(new StringReader(comment)));
+      } catch (Exception e) {
+        if (LOG.isWarnEnabled()) {
+          LOG.warn("CC: Failed to parse RDF in " + base + ": " + e);
+        }
+        // e.printStackTrace();
+        return;
+      }
+
+      // check that root is rdf:RDF
+      NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF");
+      if (roots.getLength() != 1) {
+        if (LOG.isWarnEnabled()) {
+          LOG.warn("CC: No RDF root in " + base);
+        }
+        return;
+      }
+      Element rdf = (Element) roots.item(0);
+
+      // get cc:License nodes inside rdf:RDF
+      NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License");
+      for (int i = 0; i < licenses.getLength(); i++) {
+
+        Element l = (Element) licenses.item(i);
+
+        // license is rdf:about= attribute from cc:License
+        this.rdfLicense = l.getAttributeNodeNS(RDF_NS, "about").getValue();
+
+        // walk predicates of cc:License
+        NodeList predicates = l.getChildNodes();
+        for (int j = 0; j < predicates.getLength(); j++) {
+          Node predicateNode = predicates.item(j);
+          if (!(predicateNode instanceof Element))
+            continue;
+          Element predicateElement = (Element) predicateNode;
+
+          // skip predicates outside the cc: namespace
+          if (!CC_NS.equals(predicateElement.getNamespaceURI())) {
+            continue;
+          }
+
+          // add object and predicate to metadata
+          // metadata.put(object, predicate);
+          // if (LOG.isInfoEnabled()) {
+          // LOG.info("CC: found: "+predicate+"="+object);
+          // }
+        }
+      }
+
+      // get cc:Work nodes from rdf:RDF
+      NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work");
+      for (int i = 0; i < works.getLength(); i++) {
+        // get dc:type nodes (looked up under rdf:RDF, not only this cc:Work)
+        NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type");
+
+        for (int j = 0; j < types.getLength(); j++) {
+          Element type = (Element) types.item(j);
+          String workUri = type.getAttributeNodeNS(RDF_NS, "resource")
+              .getValue();
+          this.workType = WORK_TYPE_NAMES.get(workUri);
+        }
+      }
+    }
+  }
+
+  private static final HashMap<String, String> WORK_TYPE_NAMES = new HashMap<String, String>();
+  static {
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video");
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image");
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio");
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive",
+        "interactive");
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software");
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
+  }
+
+  private Configuration conf;
+
+  /**
+   * Adds metadata or otherwise modifies a parse of an HTML document, given the
+   * DOM tree of a page.
+   */
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    // get parse obj
+    Parse parse = parseResult.get(content.getUrl());
+
+    // construct base url
+    URL base;
+    try {
+      base = new URL(content.getBaseUrl());
+    } catch (MalformedURLException e) {
+      Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf());
+      parseResult.put(content.getUrl(), new ParseText(emptyParse.getText()),
+          emptyParse.getData());
+      return parseResult;
+    }
+
+    try {
+      // extract license metadata
+      Walker.walk(doc, base, parse.getData().getParseMeta(), getConf());
+    } catch (ParseException e) {
+      Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf());
+      parseResult.put(content.getUrl(), new ParseText(emptyParse.getText()),
+          emptyParse.getData());
+      return parseResult;
+    }
+
+    return parseResult;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/package.html b/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/package.html
new file mode 100644
index 0000000..0c91293
--- /dev/null
+++ b/nutch-plugins/creativecommons/src/main/java/org/creativecommons/nutch/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Sample plugins that parse and index Creative Commons metadata.</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java b/nutch-plugins/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
new file mode 100755
index 0000000..41be9ed
--- /dev/null
+++ b/nutch-plugins/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
@@ -0,0 +1,73 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.creativecommons.nutch;
+
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.io.*;
+
+public class TestCCParseFilter {
+
+  private static final File testDir = new File(System.getProperty("test.input"));
+
+  @Test
+  public void testPages() throws Exception {
+    pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
+        "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
+    // Tika returns <a> whereas parse-html returns <rel>
+    // check later
+    pageTest(new File(testDir, "rel.html"), "http://foo.com/",
+        "http://creativecommons.org/licenses/by-nc/2.0", "rel", null);
+    // Tika returns <a> whereas parse-html returns <rdf>
+    // check later
+    pageTest(new File(testDir, "rdf.html"), "http://foo.com/",
+        "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text");
+  }
+
+  public void pageTest(File file, String url, String license, String location,
+      String type) throws Exception {
+
+    String contentType = "text/html";
+    InputStream in = new FileInputStream(file);
+    ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
+    byte[] buffer = new byte[1024];
+    int i;
+    while ((i = in.read(buffer)) != -1) {
+      out.write(buffer, 0, i);
+    }
+    in.close();
+    byte[] bytes = out.toByteArray();
+    Configuration conf = NutchConfiguration.create();
+
+    Content content = new Content(url, url, bytes, contentType, new Metadata(),
+        conf);
+    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+
+    Metadata metadata = parse.getData().getParseMeta();
+    Assert.assertEquals(license, metadata.get("License-Url"));
+    Assert.assertEquals(location, metadata.get("License-Location"));
+    Assert.assertEquals(type, metadata.get("Work-Type"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/feed/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/build.xml b/nutch-plugins/feed/build.xml
new file mode 100644
index 0000000..7fe7050
--- /dev/null
+++ b/nutch-plugins/feed/build.xml
@@ -0,0 +1,45 @@
+<?xml version="1.0"?>
+<!--
+	Licensed to the Apache Software Foundation (ASF) under one or more
+	contributor license agreements.  See the NOTICE file distributed with
+	this work for additional information regarding copyright ownership.
+	The ASF licenses this file to You under the Apache License, Version 2.0
+	(the "License"); you may not use this file except in compliance with
+	the License.  You may obtain a copy of the License at
+	
+	http://www.apache.org/licenses/LICENSE-2.0
+	
+	Unless required by applicable law or agreed to in writing, software
+	distributed under the License is distributed on an "AS IS" BASIS,
+	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	See the License for the specific language governing permissions and
+	limitations under the License.
+-->
+
+<project name="feed" default="jar-core">
+
+    <import file="../build-plugin.xml" />
+    
+    <!-- Build compilation dependencies -->
+    <target name="deps-jar">
+      <ant target="jar" inheritall="false" dir="../lib-xml"/>
+    </target>
+
+    <!-- Add compilation dependencies to classpath -->
+    <path id="plugin.deps">
+      <fileset dir="${nutch.root}/build">
+        <include name="**/lib-xml/*.jar" />
+      </fileset>
+    </path>
+
+    <!-- Deploy Unit test dependencies -->
+    <target name="deps-test">
+      <ant target="deploy" inheritall="false"
+           dir="../nutch-extensionpoints" />
+      <ant target="deploy" inheritall="false" dir="../protocol-file" />
+    </target>
+    
+    <!-- for junit test -->
+    <mkdir dir="${build.test}/data" />
+    <copy file="sample/rsstest.rss" todir="${build.test}/data" />
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/feed/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/ivy.xml b/nutch-plugins/feed/ivy.xml
new file mode 100644
index 0000000..c29bd03
--- /dev/null
+++ b/nutch-plugins/feed/ivy.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="rome" name="rome" rev="0.9" conf="*->master"/>
+    <dependency org="org.jdom" name="jdom" rev="1.1" conf="*->master"/>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/feed/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/plugin.xml b/nutch-plugins/feed/plugin.xml
new file mode 100644
index 0000000..3a68d8d
--- /dev/null
+++ b/nutch-plugins/feed/plugin.xml
@@ -0,0 +1,49 @@
+<?xml version="1.0"?>
+<!--
+	Licensed to the Apache Software Foundation (ASF) under one or more
+	contributor license agreements.  See the NOTICE file distributed with
+	this work for additional information regarding copyright ownership.
+	The ASF licenses this file to You under the Apache License, Version 2.0
+	(the "License"); you may not use this file except in compliance with
+	the License.  You may obtain a copy of the License at
+	
+	http://www.apache.org/licenses/LICENSE-2.0
+	
+	Unless required by applicable law or agreed to in writing, software
+	distributed under the License is distributed on an "AS IS" BASIS,
+	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	See the License for the specific language governing permissions and
+	limitations under the License.
+-->
+<plugin id="feed" name="Feed Parse/Index/Query Plug-in" version="1.0.0"
+	 provider-name="nutch.org">
+    <runtime>
+      <library name="feed.jar">
+        <export name="*" />
+      </library>
+      <library name="rome-0.9.jar" />
+      <library name="jdom-1.1.jar" />
+    </runtime>
+    
+    <requires>
+      <import plugin="nutch-extensionpoints" />
+      <import plugin="lib-xml" />
+    </requires>
+    
+    <extension id="org.apache.nutch.parse.feed" name="Feed Parser"
+      point="org.apache.nutch.parse.Parser">
+      
+      <implementation id="org.apache.nutch.parse.feed.FeedParser"
+         class="org.apache.nutch.parse.feed.FeedParser">
+         <parameter name="contentType" value="application/rss+xml" />
+         <parameter name="contentType" value="application/atom+xml" />
+         <parameter name="contentType" value="text/xml" />
+         <parameter name="pathSuffix" value="rss" />
+     </implementation>
+    </extension>
+    <extension id="org.apache.nutch.indexer.feed" name="Feed Indexer"
+       point="org.apache.nutch.indexer.IndexingFilter">
+     <implementation id="FeedIndexingFilter"
+       class="org.apache.nutch.indexer.feed.FeedIndexingFilter" />
+    </extension>
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/feed/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/pom.xml b/nutch-plugins/feed/pom.xml
new file mode 100644
index 0000000..d94c0b6
--- /dev/null
+++ b/nutch-plugins/feed/pom.xml
@@ -0,0 +1,45 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>feed</artifactId>
+    <packaging>jar</packaging>
+
+    <name>feed</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>rome</groupId>
+            <artifactId>rome</artifactId>
+            <version>1.0</version>
+        </dependency>
+    </dependencies>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/feed/sample/rsstest.rss
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/sample/rsstest.rss b/nutch-plugins/feed/sample/rsstest.rss
new file mode 100644
index 0000000..758f6a1
--- /dev/null
+++ b/nutch-plugins/feed/sample/rsstest.rss
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="ISO-8859-1" ?>
+<!--
+	Licensed to the Apache Software Foundation (ASF) under one or more
+	contributor license agreements.  See the NOTICE file distributed with
+	this work for additional information regarding copyright ownership.
+	The ASF licenses this file to You under the Apache License, Version 2.0
+	(the "License"); you may not use this file except in compliance with
+	the License.  You may obtain a copy of the License at
+	
+	http://www.apache.org/licenses/LICENSE-2.0
+	
+	Unless required by applicable law or agreed to in writing, software
+	distributed under the License is distributed on an "AS IS" BASIS,
+	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	See the License for the specific language governing permissions and
+	limitations under the License.
+-->
+<rss version="0.91">
+    <channel>
+      <title>TestChannel</title>
+      <link>http://test.channel.com/</link> 
+      <description>Sample RSS File for Junit test</description> 
+      <language>en-us</language>
+      
+      <item>
+        <title>Home Page of Chris Mattmann</title>
+        <link>http://www-scf.usc.edu/~mattmann/</link>
+        <description>Chris Mattmann's home page</description>
+      </item>
+      <item>
+        <title>Awesome Open Source Search Engine</title> 
+        <link>http://www.nutch.org/</link> 
+        <description>Yup, that's what it is</description> 
+      </item>
+   </channel>
+</rss>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java b/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
new file mode 100644
index 0000000..94b440a
--- /dev/null
+++ b/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
@@ -0,0 +1,129 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.feed;
+
+//JDK imports
+import java.util.Date;
+
+//APACHE imports
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.metadata.Feed;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+
+/**
+ * An {@link IndexingFilter} implementation that pulls the relevant extracted
+ * {@link Metadata} fields of RSS feeds out of the parse metadata and into the
+ * index.
+ *
+ * @author dogacan
+ * @author mattmann
+ * @since NUTCH-444
+ */
+public class FeedIndexingFilter implements IndexingFilter {
+
+  public static final String dateFormatStr = "yyyyMMddHHmm";
+
+  private final static String PUBLISHED_DATE = "publishedDate";
+
+  private final static String UPDATED_DATE = "updatedDate";
+
+  private Configuration conf;
+
+  /**
+   * Copies the feed fields found in the parse metadata onto the document:
+   *
+   * <ul>
+   * <li>FEED_AUTHOR (every value)</li>
+   * <li>FEED_TAGS (every value)</li>
+   * <li>FEED</li>
+   * <li>FEED_PUBLISHED, added as a {@link Date} under "publishedDate"</li>
+   * <li>FEED_UPDATED, added as a {@link Date} under "updatedDate"</li>
+   * </ul>
+   *
+   * Fields that are absent from the parse metadata are skipped.
+   *
+   * @param doc
+   *          the document the fields are added to
+   * @param parse
+   *          the parse whose parse metadata is read
+   * @return the same document instance that was passed in
+   */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    Metadata meta = parse.getData().getParseMeta();
+
+    addValues(doc, Feed.FEED_AUTHOR, meta.getValues(Feed.FEED_AUTHOR));
+    addValues(doc, Feed.FEED_TAGS, meta.getValues(Feed.FEED_TAGS));
+
+    String feedLink = meta.get(Feed.FEED);
+    if (feedLink != null) {
+      doc.add(Feed.FEED, feedLink);
+    }
+
+    addDate(doc, PUBLISHED_DATE, meta.get(Feed.FEED_PUBLISHED));
+    addDate(doc, UPDATED_DATE, meta.get(Feed.FEED_UPDATED));
+
+    return doc;
+  }
+
+  /** Adds every value (if any) to the document under the given field name. */
+  private static void addValues(NutchDocument doc, String field,
+      String[] values) {
+    if (values == null) {
+      return;
+    }
+    for (String value : values) {
+      doc.add(field, value);
+    }
+  }
+
+  /**
+   * Adds a timestamp, given as a decimal string of milliseconds, to the
+   * document as a {@link Date}; does nothing when the string is null.
+   */
+  private static void addDate(NutchDocument doc, String field, String millis) {
+    if (millis == null) {
+      return;
+    }
+    Date date = new Date(Long.parseLong(millis));
+    doc.add(field, date);
+  }
+
+  /**
+   * @return the {@link Configuration} object used to configure this
+   *         {@link IndexingFilter}.
+   */
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Sets the {@link Configuration} object used to configure this
+   * {@link IndexingFilter}.
+   *
+   * @param conf
+   *          The {@link Configuration} object used to configure this
+   *          {@link IndexingFilter}.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/package-info.java b/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/package-info.java
new file mode 100644
index 0000000..8f52628
--- /dev/null
+++ b/nutch-plugins/feed/src/main/java/org/apache/nutch/indexer/feed/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Indexing filter to index meta data from RSS feeds.
+ */
+package org.apache.nutch.indexer.feed;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/FeedParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/FeedParser.java b/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/FeedParser.java
new file mode 100644
index 0000000..936c885
--- /dev/null
+++ b/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/FeedParser.java
@@ -0,0 +1,374 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.feed;
+
+// JDK imports
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map.Entry;
+
+// APACHE imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.util.StringUtils;
+// import org.apache.nutch.indexer.anchor.AnchorIndexingFilter; removed as per NUTCH-1078
+import org.apache.nutch.metadata.Feed;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.URLFilters;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.parse.Parser;
+import org.apache.nutch.parse.ParserFactory;
+import org.apache.nutch.parse.ParserNotFound;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.EncodingDetector;
+import org.apache.nutch.util.NutchConfiguration;
+import org.xml.sax.InputSource;
+
+// ROME imports
+import com.sun.syndication.feed.synd.SyndCategory;
+import com.sun.syndication.feed.synd.SyndContent;
+import com.sun.syndication.feed.synd.SyndEntry;
+import com.sun.syndication.feed.synd.SyndFeed;
+import com.sun.syndication.feed.synd.SyndPerson;
+import com.sun.syndication.io.SyndFeedInput;
+
+/**
+ * 
+ * @author dogacan
+ * @author mattmann
+ * @since NUTCH-444
+ * 
+ *        <p>
+ *        A new RSS/ATOM Feed{@link Parser} that rapidly parses all referenced
+ *        links and content present in the feed.
+ *        </p>
+ * 
+ */
+public class FeedParser implements Parser {
+
+  public static final String CHARSET_UTF8 = "charset=UTF-8";
+
+  public static final String TEXT_PLAIN_CONTENT_TYPE = "text/plain; "
+      + CHARSET_UTF8;
+
+  public static final Logger LOG = LoggerFactory.getLogger(FeedParser.class);
+
+  private Configuration conf;
+
+  private ParserFactory parserFactory;
+
+  private URLNormalizers normalizers;
+
+  private URLFilters filters;
+
+  private String defaultEncoding;
+
+  /**
+   * Parses the given feed with the underlying ROME feed parsing library, then
+   * extracts and parses every item linked from the feed.
+   *
+   * @param content
+   *          A {@link Content} object representing the feed that is being
+   *          parsed by this {@link Parser}.
+   *
+   * @return A {@link ParseResult} holding one {@link Parse} per accepted feed
+   *         entry plus one for the feed itself, or an empty parse result if
+   *         the feed could not be read.
+   *
+   */
+  public ParseResult getParse(Content content) {
+    String url = content.getUrl();
+    ParseResult results = new ParseResult(url);
+
+    EncodingDetector encodingDetector = new EncodingDetector(conf);
+    encodingDetector.autoDetectClues(content, true);
+    String charset = encodingDetector.guessEncoding(content, defaultEncoding);
+
+    SyndFeed feed;
+    try {
+      InputSource source = new InputSource(new ByteArrayInputStream(
+          content.getContent()));
+      source.setEncoding(charset);
+      feed = new SyndFeedInput().build(source);
+    } catch (Exception e) {
+      // the feed is unreadable: log it and hand back an empty parse
+      LOG.warn("Parse failed: url: " + url + ", exception: "
+          + StringUtils.stringifyException(e));
+      return new ParseStatus(e).getEmptyParseResult(url, getConf());
+    }
+
+    // Normalize and filter the feed's own link; any failure simply means the
+    // entries are stored without a feed link.
+    String channelLink = feed.getLink();
+    try {
+      channelLink = normalizers.normalize(channelLink,
+          URLNormalizers.SCOPE_OUTLINK);
+      if (channelLink != null) {
+        channelLink = filters.filter(channelLink);
+      }
+    } catch (Exception e) {
+      channelLink = null;
+    }
+
+    for (Object item : feed.getEntries()) {
+      addToMap(results, feed, channelLink, (SyndEntry) item, content);
+    }
+
+    // finally register the feed document itself under its own URL
+    String description = stripTags(feed.getDescriptionEx());
+    String title = stripTags(feed.getTitleEx());
+    ParseText feedText = new ParseText(description);
+    ParseData feedData = new ParseData(new ParseStatus(ParseStatus.SUCCESS),
+        title, new Outlink[0], content.getMetadata());
+    results.put(url, feedText, feedData);
+
+    return results;
+  }
+
+  /**
+   * Stores the {@link Configuration} for this {@link Parser} and builds the
+   * helpers that depend on it. The configuration is used to set up:
+   *
+   * <ul>
+   * <li>the {@link ParserFactory} used to parse feed entries,</li>
+   * <li>the {@link URLNormalizers} for the outlink scope,</li>
+   * <li>the {@link URLFilters},</li>
+   * <li>the fallback character encoding, read from
+   * <code>parser.character.encoding.default</code> (defaulting to
+   * <code>windows-1252</code>).</li>
+   * </ul>
+   *
+   * @param conf
+   *          The Hadoop {@link Configuration} object to use to configure this
+   *          {@link Parser}.
+   *
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    parserFactory = new ParserFactory(conf);
+    normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
+    filters = new URLFilters(conf);
+    defaultEncoding = conf.get("parser.character.encoding.default",
+        "windows-1252");
+  }
+
+  /**
+   * Returns the configuration previously passed to
+   * {@link #setConf(Configuration)}.
+   *
+   * @return The {@link Configuration} object used to configure this
+   *         {@link Parser}.
+   */
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Runs a command line version of this {@link Parser}: reads the given feed
+   * file, parses it and prints the key, parse data and parse text of every
+   * resulting entry to standard output.
+   *
+   * @param args
+   *          A single argument (expected at arg[0]) representing a path on the
+   *          local filesystem that points to a feed file.
+   *
+   * @throws Exception
+   *           If any error occurs.
+   */
+  public static void main(String[] args) throws Exception {
+    if (args.length != 1) {
+      System.err.println("Usage: FeedParser <feed>");
+      System.exit(1);
+    }
+    String name = args[0];
+    String url = "file:" + name;
+    Configuration conf = NutchConfiguration.create();
+    FeedParser parser = new FeedParser();
+    parser.setConf(conf);
+    File file = new File(name);
+    byte[] bytes = new byte[(int) file.length()];
+    DataInputStream in = new DataInputStream(new FileInputStream(file));
+    try {
+      in.readFully(bytes);
+    } finally {
+      // always release the file handle, even if reading fails
+      in.close();
+    }
+    ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
+        "application/rss+xml", new Metadata(), conf));
+    for (Entry<Text, Parse> entry : parseResult) {
+      System.out.println("key: " + entry.getKey());
+      Parse parse = entry.getValue();
+      System.out.println("data: " + parse.getData());
+      System.out.println("text: " + parse.getText() + "\n");
+    }
+  }
+
+  /**
+   * Parses a single feed entry and stores the outcome in
+   * <code>parseResult</code> under the entry's normalized and filtered link.
+   * Entries whose link cannot be normalized, or is rejected by the
+   * normalizers or filters, are skipped.
+   *
+   * @param parseResult
+   *          the result the entry's parse is added to
+   * @param feed
+   *          the feed the entry belongs to
+   * @param feedLink
+   *          the feed's link, recorded in the entry's parse metadata when not
+   *          null
+   * @param entry
+   *          the feed entry to parse
+   * @param content
+   *          the feed content; its metadata is reused (and modified) as the
+   *          content metadata of the entry
+   */
+  private void addToMap(ParseResult parseResult, SyndFeed feed,
+      String feedLink, SyndEntry entry, Content content) {
+    String link = entry.getLink();
+
+    try {
+      link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK);
+
+      if (link != null)
+        link = filters.filter(link);
+    } catch (Exception e) {
+      // report through the plugin logger instead of printing to stderr
+      LOG.warn("Skipping feed entry, failed to normalize/filter link: "
+          + entry.getLink() + ", exception: "
+          + StringUtils.stringifyException(e));
+      return;
+    }
+
+    if (link == null)
+      return;
+
+    Metadata parseMeta = new Metadata();
+    Metadata contentMeta = content.getMetadata();
+    SyndContent description = entry.getDescription();
+    String title = stripTags(entry.getTitleEx());
+
+    if (feedLink != null)
+      parseMeta.set("feed", feedLink);
+
+    addFields(parseMeta, contentMeta, feed, entry);
+
+    // some item descriptions contain markup text in them,
+    // so we temporarily set their content-type to parse them
+    // with another plugin
+    String contentType = contentMeta.get(Response.CONTENT_TYPE);
+
+    String text = null;
+    if (description != null)
+      text = description.getValue();
+
+    if (text == null) {
+      // no usable description: concatenate all content elements instead
+      List<?> contents = entry.getContents();
+      StringBuilder buf = new StringBuilder();
+      for (Object syndContent : contents) {
+        buf.append(((SyndContent) syndContent).getValue());
+      }
+      text = buf.toString();
+    }
+
+    Parse parse = null;
+    try {
+      Parser parser = parserFactory.getParsers(contentType, link)[0];
+      // addFields always declares charset=UTF-8 in the content type, so the
+      // bytes must be UTF-8 too rather than the platform default charset
+      byte[] textBytes = text.getBytes(java.nio.charset.Charset
+          .forName("UTF-8"));
+      parse = parser.getParse(
+          new Content(link, link, textBytes, contentType, contentMeta, conf))
+          .get(link);
+    } catch (ParserNotFound e) { /* ignore */
+    }
+
+    if (parse != null) {
+      ParseData data = parse.getData();
+      data.getContentMeta().remove(Response.CONTENT_TYPE);
+      mergeMetadata(data.getParseMeta(), parseMeta);
+      parseResult.put(link, new ParseText(parse.getText()),
+          new ParseData(ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(),
+              data.getContentMeta(), data.getParseMeta()));
+    } else {
+      contentMeta.remove(Response.CONTENT_TYPE);
+      parseResult.put(link, new ParseText(text), new ParseData(
+          ParseStatus.STATUS_FAILURE, title, new Outlink[0], contentMeta,
+          parseMeta));
+    }
+
+  }
+
+  /**
+   * Removes everything that looks like a markup tag (<code>&lt;...&gt;</code>)
+   * from the value of the given content and trims the result.
+   *
+   * @param c
+   *          the content to clean, may be null
+   * @return the tag-free, trimmed text; the empty string if the content or
+   *         its value is null
+   */
+  private static String stripTags(SyndContent c) {
+    if (c == null)
+      return "";
+
+    String value = c.getValue();
+    // a content element without a value must not cause a NullPointerException
+    if (value == null)
+      return "";
+
+    return value.replaceAll("<[^>]*>", "").trim();
+  }
+
+  /**
+   * Copies the author, tag and date information of a feed entry into
+   * <code>parseMeta</code> and records the entry's content type (always with
+   * <code>charset=UTF-8</code>) in <code>contentMeta</code>.
+   *
+   * @param parseMeta
+   *          receives the FEED_AUTHOR, FEED_TAGS, FEED_PUBLISHED and
+   *          FEED_UPDATED values
+   * @param contentMeta
+   *          receives the content type of the entry
+   * @param feed
+   *          the feed the entry belongs to (currently not read)
+   * @param entry
+   *          the feed entry the values are taken from
+   */
+  private void addFields(Metadata parseMeta, Metadata contentMeta,
+      SyndFeed feed, SyndEntry entry) {
+    List<?> authors = entry.getAuthors(), categories = entry.getCategories();
+    Date published = entry.getPublishedDate(), updated = entry.getUpdatedDate();
+    String contentType = null;
+
+    if (authors != null && !authors.isEmpty()) {
+      for (Object o : authors) {
+        SyndPerson author = (SyndPerson) o;
+        String authorName = author.getName();
+        if (checkString(authorName)) {
+          parseMeta.add(Feed.FEED_AUTHOR, authorName);
+        }
+      }
+    } else {
+      // getAuthors may return null if feed is non-atom; an empty list is
+      // treated the same way. If so, call getAuthor to get Dublin Core
+      // module creator.
+      String authorName = entry.getAuthor();
+      if (checkString(authorName)) {
+        parseMeta.set(Feed.FEED_AUTHOR, authorName);
+      }
+    }
+
+    for (Object i : categories) {
+      // skip unnamed categories, mirroring the check applied to author names
+      String tag = ((SyndCategory) i).getName();
+      if (checkString(tag)) {
+        parseMeta.add(Feed.FEED_TAGS, tag);
+      }
+    }
+
+    // dates are stored as strings holding the Date's millisecond value
+    if (published != null) {
+      parseMeta.set(Feed.FEED_PUBLISHED, Long.toString(published.getTime()));
+    }
+    if (updated != null) {
+      parseMeta.set(Feed.FEED_UPDATED, Long.toString(updated.getTime()));
+    }
+
+    SyndContent description = entry.getDescription();
+    if (description != null) {
+      contentType = description.getType();
+    } else {
+      // TODO: What to do if contents.size() > 1?
+      List<?> contents = entry.getContents();
+      if (contents.size() > 0) {
+        contentType = ((SyndContent) contents.get(0)).getType();
+      }
+    }
+
+    if (checkString(contentType)) {
+      // ROME may return content-type as html
+      if (contentType.equals("html"))
+        contentType = "text/html";
+      else if (contentType.equals("xhtml"))
+        contentType = "text/xhtml";
+      contentMeta.set(Response.CONTENT_TYPE, contentType + "; " + CHARSET_UTF8);
+    } else {
+      contentMeta.set(Response.CONTENT_TYPE, TEXT_PLAIN_CONTENT_TYPE);
+    }
+
+  }
+
+  /**
+   * Appends every value of every name found in <code>second</code> to
+   * <code>first</code>.
+   */
+  private void mergeMetadata(Metadata first, Metadata second) {
+    for (String key : second.names()) {
+      for (String item : second.getValues(key)) {
+        first.add(key, item);
+      }
+    }
+  }
+
+  /** Tells whether the given string is neither null nor empty. */
+  private boolean checkString(String s) {
+    return !(s == null || "".equals(s));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/package-info.java b/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/package-info.java
new file mode 100644
index 0000000..3b15968
--- /dev/null
+++ b/nutch-plugins/feed/src/main/java/org/apache/nutch/parse/feed/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parse RSS feeds.
+ */
+package org.apache.nutch.parse.feed;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java b/nutch-plugins/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
new file mode 100644
index 0000000..36c8739
--- /dev/null
+++ b/nutch-plugins/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.feed;
+
+// JDK imports
+import java.util.Iterator;
+import java.util.Map;
+
+import org.junit.Assert;
+import org.junit.Test;
+// APACHE imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.ProtocolNotFound;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * Test suite for the {@link FeedParser}.
+ *
+ * @author mattmann
+ */
+public class TestFeedParser {
+
+  private String fileSeparator = System.getProperty("file.separator");
+
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/feed/build.xml during plugin compilation.
+
+  private String[] sampleFiles = { "rsstest.rss" };
+
+  public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class
+      .getName());
+
+  /**
+   * Runs the {@link FeedParser} on each sample RSS file and checks that the
+   * {@link ParseResult} has exactly 3 entries, each with a non-null parse and
+   * parse data: the URL of the feed file itself plus these 2 item links:
+   * <ul>
+   * <li>http://www-scf.usc.edu/~mattmann/</li>
+   * <li>http://www.nutch.org/</li>
+   * </ul>
+   *
+   * @throws ProtocolNotFound
+   *           If the {@link Protocol}Layer cannot be loaded (required to fetch
+   *           the {@link Content} for the RSS file).
+   * @throws ParseException
+   *           If the parser layer cannot be loaded.
+   */
+  @Test
+  public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
+    Configuration conf = NutchConfiguration.create();
+
+    for (String sampleFile : sampleFiles) {
+      String feedUrl = ("file:" + sampleDir + fileSeparator + sampleFile)
+          .replace('\\', '/');
+
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(feedUrl);
+      Content content = protocol.getProtocolOutput(new Text(feedUrl),
+          new CrawlDatum()).getContent();
+
+      ParseResult parseResult = new ParseUtil(conf).parseByExtensionId("feed",
+          content);
+
+      Assert.assertEquals(3, parseResult.size());
+
+      boolean sawFirstItem = false;
+      boolean sawSecondItem = false;
+      boolean sawFeedItself = false;
+
+      for (Map.Entry<Text, Parse> entry : parseResult) {
+        String key = entry.getKey().toString();
+        if (key.equals("http://www-scf.usc.edu/~mattmann/")) {
+          sawFirstItem = true;
+        } else if (key.equals("http://www.nutch.org/")) {
+          sawSecondItem = true;
+        } else if (key.equals(feedUrl)) {
+          sawFeedItself = true;
+        }
+
+        Assert.assertNotNull(entry.getValue());
+        Assert.assertNotNull(entry.getValue().getData());
+      }
+
+      Assert.assertTrue("Outlinks read from sample rss file are not correct!",
+          sawFirstItem && sawSecondItem && sawFeedItself);
+    }
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/headings/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/headings/build.xml b/nutch-plugins/headings/build.xml
new file mode 100644
index 0000000..d334ad1
--- /dev/null
+++ b/nutch-plugins/headings/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="headings" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/headings/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/headings/ivy.xml b/nutch-plugins/headings/ivy.xml
new file mode 100644
index 0000000..5b8393b
--- /dev/null
+++ b/nutch-plugins/headings/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+      <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/headings/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/headings/plugin.xml b/nutch-plugins/headings/plugin.xml
new file mode 100644
index 0000000..0d7921a
--- /dev/null
+++ b/nutch-plugins/headings/plugin.xml
@@ -0,0 +1,45 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="headings"
+   name="Headings Parse Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="headings.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.parse.headings"
+              name="Nutch Headings Parse Filter"
+              point="org.apache.nutch.parse.HtmlParseFilter">
+
+      <implementation id="HeadingsParseFilter"
+                      class="org.apache.nutch.parse.headings.HeadingsParseFilter">
+      </implementation>
+
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/headings/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/headings/pom.xml b/nutch-plugins/headings/pom.xml
new file mode 100644
index 0000000..219eb71
--- /dev/null
+++ b/nutch-plugins/headings/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>headings</artifactId>
+    <packaging>jar</packaging>
+
+    <name>headings</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java b/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
new file mode 100644
index 0000000..657f260
--- /dev/null
+++ b/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.headings;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.*;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.parse.HTMLMetaTags;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.HtmlParseFilter;
+import org.apache.nutch.parse.ParseResult;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NodeWalker;
+import org.w3c.dom.*;
+
+/**
+ * {@link HtmlParseFilter} that copies the text of configured heading elements
+ * (for example h1 and h2) from the DOM into the parse metadata.
+ * <p>
+ * The element names to look for are read from the {@code headings}
+ * configuration property. When {@code headings.multivalued} is {@code false}
+ * (the default) only the first matching element per name is collected.
+ */
+public class HeadingsParseFilter implements HtmlParseFilter {
+
+  /**
+   * Pattern used to collapse surplus whitespace
+   */
+  protected static Pattern whitespacePattern = Pattern.compile("\\s+");
+
+  private Configuration conf;
+  private String[] headings;
+  private boolean multiValued = false;
+
+  /**
+   * Looks up each configured heading element in the DOM and adds every
+   * non-empty value to the parse metadata, keyed by the configured element
+   * name.
+   *
+   * @param content
+   *          the fetched content; its URL selects the {@link Parse} to update
+   * @param parseResult
+   *          the parse result holding the {@link Parse} for the content URL
+   * @param metaTags
+   *          HTML meta tags of the page (not used by this filter)
+   * @param doc
+   *          the DOM fragment to search for heading elements
+   * @return the {@code parseResult} instance that was passed in
+   */
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+    Parse parse = parseResult.get(content.getUrl());
+
+    // Nothing configured: leave the parse result untouched.
+    if (headings == null) {
+      return parseResult;
+    }
+
+    for (String name : headings) {
+      for (String heading : getElement(doc, name)) {
+        if (heading == null) {
+          continue;
+        }
+
+        // String.trim() returns a new string, so the result has to be kept;
+        // calling it for its side effect does nothing.
+        String trimmed = heading.trim();
+
+        if (!trimmed.isEmpty()) {
+          parse.getData().getParseMeta().add(name, trimmed);
+        }
+      }
+    }
+
+    return parseResult;
+  }
+
+  /**
+   * Stores the configuration and reads the {@code headings} and
+   * {@code headings.multivalued} properties from it.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    headings = conf.getStrings("headings");
+    multiValued = conf.getBoolean("headings.multivalued", false);
+  }
+
+  /**
+   * Returns the configuration passed to {@link #setConf(Configuration)}.
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Collects the text values of the elements whose name matches
+   * {@code element}, ignoring case, in the order the walker visits them.
+   *
+   * @param doc
+   *          the DOM fragment to walk
+   * @param element
+   *          the element name to look for
+   * @return the values found; at most one entry unless multi-valued mode is
+   *         enabled, and an empty list if no element matched
+   */
+  protected List<String> getElement(DocumentFragment doc, String element) {
+    List<String> values = new ArrayList<String>();
+    NodeWalker walker = new NodeWalker(doc);
+
+    while (walker.hasNext()) {
+      Node currentNode = walker.nextNode();
+
+      if (currentNode.getNodeType() != Node.ELEMENT_NODE
+          || !element.equalsIgnoreCase(currentNode.getNodeName())) {
+        continue;
+      }
+
+      values.add(getNodeValue(currentNode));
+
+      // Check for multiValued here, if disabled we don't need
+      // to discover more headings.
+      if (!multiValued) {
+        break;
+      }
+    }
+
+    return values;
+  }
+
+  /**
+   * Returns the concatenated text of the direct child text nodes of the
+   * specified node. Text nested inside child elements is not included. Runs
+   * of whitespace are collapsed to a single space and the result is trimmed.
+   */
+  protected static String getNodeValue(Node node) {
+    StringBuilder buffer = new StringBuilder();
+
+    NodeList children = node.getChildNodes();
+
+    for (int i = 0; i < children.getLength(); i++) {
+      Node child = children.item(i);
+
+      if (child.getNodeType() == Node.TEXT_NODE) {
+        buffer.append(child.getNodeValue());
+      }
+    }
+
+    // Return with stripped surplus whitespace
+    Matcher matcher = whitespacePattern.matcher(buffer.toString().trim());
+    return matcher.replaceAll(" ").trim();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/package-info.java b/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/package-info.java
new file mode 100644
index 0000000..363e0b2
--- /dev/null
+++ b/nutch-plugins/headings/src/main/java/org/apache/nutch/parse/headings/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Parse filter to extract headings (h1, h2, etc.) from DOM parse tree.
+ */
+package org.apache.nutch.parse.headings;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-anchor/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-anchor/build.xml b/nutch-plugins/index-anchor/build.xml
new file mode 100644
index 0000000..597b532
--- /dev/null
+++ b/nutch-plugins/index-anchor/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-anchor" default="jar-core">
+
+  <import file="../build-plugin.xml" />
+
+</project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-anchor/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-anchor/ivy.xml b/nutch-plugins/index-anchor/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/index-anchor/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-anchor/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-anchor/plugin.xml b/nutch-plugins/index-anchor/plugin.xml
new file mode 100644
index 0000000..208594b
--- /dev/null
+++ b/nutch-plugins/index-anchor/plugin.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+  
+  http://www.apache.org/licenses/LICENSE-2.0
+  
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<plugin id="index-anchor" name="Anchor Indexing Filter" version="1.0.0"
+  provider-name="nutch.org">
+
+  <runtime>
+    <library name="index-anchor.jar">
+      <export name="*" />
+    </library>
+  </runtime>
+
+  <requires>
+    <import plugin="nutch-extensionpoints" />
+  </requires>
+
+  <extension id="org.apache.nutch.indexer.anchor"
+    name="Nutch Anchor Indexing Filter"
+    point="org.apache.nutch.indexer.IndexingFilter">
+    <implementation id="AnchorIndexingFilter"
+      class="org.apache.nutch.indexer.anchor.AnchorIndexingFilter" />
+  </extension>
+
+</plugin>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-anchor/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-anchor/pom.xml b/nutch-plugins/index-anchor/pom.xml
new file mode 100644
index 0000000..df01a61
--- /dev/null
+++ b/nutch-plugins/index-anchor/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>index-anchor</artifactId>
+    <packaging>jar</packaging>
+
+    <name>index-anchor</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java b/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
new file mode 100644
index 0000000..6c9b834
--- /dev/null
+++ b/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.anchor;
+
+import java.util.HashSet;
+import java.util.Locale;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Indexing filter that adds the anchor text of inbound links to a document,
+ * either every anchor or, optionally, only the first occurrence of each
+ * distinct anchor. Deduplication compares anchors case-insensitively, so
+ * anchors that differ only in letter case are indexed once, using the
+ * spelling that was seen first.
+ * <p>
+ * See {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
+ */
+public class AnchorIndexingFilter implements IndexingFilter {
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(AnchorIndexingFilter.class);
+  private Configuration conf;
+  private boolean deduplicate = false;
+
+  /**
+   * Set the {@link Configuration} object and read the
+   * {@code anchorIndexingFilter.deduplicate} setting from it.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    deduplicate = conf.getBoolean("anchorIndexingFilter.deduplicate", false);
+    LOG.info("Anchor deduplication is: {}", deduplicate ? "on" : "off");
+  }
+
+  /**
+   * Get the {@link Configuration} object
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Adds the anchor texts of the inbound links to the {@code anchor} field of
+   * the document. When deduplication is enabled (see
+   * {@code anchorIndexingFilter.deduplicate} in nutch-default.xml) an anchor
+   * is skipped if an anchor with the same lower-cased text was already added.
+   * 
+   * @param doc
+   *          The {@link NutchDocument} object
+   * @param parse
+   *          The relevant {@link Parse} object passing through the filter
+   * @param url
+   *          URL to be filtered for anchor text
+   * @param datum
+   *          The {@link CrawlDatum} entry
+   * @param inlinks
+   *          The {@link Inlinks} containing anchor text; may be {@code null},
+   *          in which case the document is returned unchanged
+   * @return filtered NutchDocument
+   */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    if (inlinks == null) {
+      return doc;
+    }
+
+    // Lower-cased anchors already indexed; only needed when deduplicating.
+    HashSet<String> seen = deduplicate ? new HashSet<String>() : null;
+
+    for (String anchor : inlinks.getAnchors()) {
+      // Locale.ROOT keeps the dedup key independent of the JVM's default
+      // locale. Set.add returns false when the key was already present.
+      if (seen == null || seen.add(anchor.toLowerCase(Locale.ROOT))) {
+        doc.add("anchor", anchor);
+      }
+    }
+
+    return doc;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/package.html b/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/package.html
new file mode 100644
index 0000000..c255029
--- /dev/null
+++ b/nutch-plugins/index-anchor/src/main/java/org/apache/nutch/indexer/anchor/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>An indexing plugin for inbound anchor text.</p><p></p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java b/nutch-plugins/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
new file mode 100644
index 0000000..08a42f3
--- /dev/null
+++ b/nutch-plugins/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.anchor;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlink;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * JUnit test case which tests 1. that anchor text is obtained 2. that anchor
+ * deduplication functionality is working
+ * 
+ * @author lewismc
+ * 
+ */
+public class TestAnchorIndexingFilter {
+
+  /**
+   * With deduplication enabled, three inlinks carrying the anchors "text1",
+   * "text2" and "text2" must yield exactly two values in the "anchor" field.
+   */
+  @Test
+  public void testDeduplicateAnchor() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    conf.setBoolean("anchorIndexingFilter.deduplicate", true);
+    AnchorIndexingFilter filter = new AnchorIndexingFilter();
+    filter.setConf(conf);
+
+    NutchDocument doc = new NutchDocument();
+    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
+    Inlinks inlinks = new Inlinks();
+    inlinks.add(new Inlink("http://test1.com/", "text1"));
+    inlinks.add(new Inlink("http://test2.com/", "text2"));
+    inlinks.add(new Inlink("http://test3.com/", "text2"));
+
+    // Any exception propagates to JUnit, which reports it together with its
+    // full stack trace; no catch-and-fail wrapper is needed.
+    filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
+        new CrawlDatum(), inlinks);
+
+    Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames()
+        .contains("anchor"));
+    Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor")
+        .getValues().size());
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-basic/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-basic/build.xml b/nutch-plugins/index-basic/build.xml
new file mode 100755
index 0000000..a834290
--- /dev/null
+++ b/nutch-plugins/index-basic/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-basic" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-basic/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-basic/ivy.xml b/nutch-plugins/index-basic/ivy.xml
new file mode 100644
index 0000000..848216e
--- /dev/null
+++ b/nutch-plugins/index-basic/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+      <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-basic/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-basic/plugin.xml b/nutch-plugins/index-basic/plugin.xml
new file mode 100755
index 0000000..c5d784d
--- /dev/null
+++ b/nutch-plugins/index-basic/plugin.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="index-basic"
+   name="Basic Indexing Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="index-basic.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.indexer.basic"
+              name="Nutch Basic Indexing Filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="BasicIndexingFilter"
+                      class="org.apache.nutch.indexer.basic.BasicIndexingFilter"/>
+   </extension>
+
+</plugin>