You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:49:32 UTC

[48/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-basic/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-basic/pom.xml b/nutch-plugins/index-basic/pom.xml
new file mode 100644
index 0000000..3dc3d91
--- /dev/null
+++ b/nutch-plugins/index-basic/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>index-basic</artifactId>
+    <packaging>jar</packaging>
+
+    <name>index-basic</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java b/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
new file mode 100644
index 0000000..8584fa8
--- /dev/null
+++ b/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
@@ -0,0 +1,158 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.basic;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.parse.Parse;
+
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.util.StringUtil;
+import org.apache.nutch.util.URLUtil;
+import org.apache.hadoop.io.Text;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Date;
+
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * Adds basic searchable fields to a document. The fields added are: domain,
+ * host, url, content, title, cache and tstamp. The domain is included depending
+ * on {@code indexer.add.domain} in nutch-default.xml. The title is truncated as
+ * per {@code indexer.max.title.length} in nutch-default.xml (as per NUTCH-1004,
+ * a zero-length title is not added). The content is truncated as per
+ * {@code indexer.max.content.length} in nutch-default.xml.
+ */
+public class BasicIndexingFilter implements IndexingFilter {
+  public static final Logger LOG = LoggerFactory
+      .getLogger(BasicIndexingFilter.class);
+
+  private int MAX_TITLE_LENGTH;
+  private int MAX_CONTENT_LENGTH;
+  private boolean addDomain = false;
+  private Configuration conf;
+
+  /**
+   * Adds the basic searchable fields (domain, host, url, content, title,
+   * cache and tstamp) to the given document. See
+   * {@code indexer.add.domain}, {@code indexer.max.title.length},
+   * {@code indexer.max.content.length} in nutch-default.xml.
+   * 
+   * @param doc
+   *          The {@link NutchDocument} object
+   * @param parse
+   *          The relevant {@link Parse} object passing through the filter
+   * @param url
+   *          URL of the page; used unless the datum has a representative URL
+   * @param datum
+   *          The {@link CrawlDatum} entry
+   * @param inlinks
+   *          The {@link Inlinks} of the page (not used by this filter)
+   * @return filtered NutchDocument
+   */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    Text reprUrl = (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
+    String reprUrlString = reprUrl != null ? reprUrl.toString() : null;
+    String urlString = url.toString();
+
+    String host = null;
+    try {
+      URL u;
+      if (reprUrlString != null) {
+        u = new URL(reprUrlString);
+      } else {
+        u = new URL(urlString);
+      }
+
+      if (addDomain) {
+        doc.add("domain", URLUtil.getDomainName(u));
+      }
+
+      host = u.getHost();
+    } catch (MalformedURLException e) {
+      throw new IndexingException(e);
+    }
+
+    if (host != null) {
+      doc.add("host", host);
+    }
+
+    doc.add("url", reprUrlString == null ? urlString : reprUrlString);
+
+    // content
+    String content = parse.getText();
+    if (MAX_CONTENT_LENGTH > -1 && content.length() > MAX_CONTENT_LENGTH) {
+      content = content.substring(0, MAX_CONTENT_LENGTH);
+    }
+    doc.add("content", StringUtil.cleanField(content));
+
+    // title
+    String title = parse.getData().getTitle();
+    if (MAX_TITLE_LENGTH > -1 && title.length() > MAX_TITLE_LENGTH) { // truncate
+                                                                      // title
+                                                                      // if
+                                                                      // needed
+      title = title.substring(0, MAX_TITLE_LENGTH);
+    }
+
+    if (title.length() > 0) {
+      // NUTCH-1004 Do not index empty values for title field
+      doc.add("title", StringUtil.cleanField(title));
+    }
+
+    // add cached content/summary display policy, if available
+    String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY);
+    if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
+      doc.add("cache", caching);
+    }
+
+    // add timestamp when fetched, for deduplication
+    doc.add("tstamp", new Date(datum.getFetchTime()));
+
+    return doc;
+  }
+
+  /**
+   * Sets the {@link Configuration} and reads the filter settings from it.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
+    this.addDomain = conf.getBoolean("indexer.add.domain", false);
+    this.MAX_CONTENT_LENGTH = conf.getInt("indexer.max.content.length", -1);
+  }
+
+  /**
+   * Get the {@link Configuration} object
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/package.html b/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/package.html
new file mode 100644
index 0000000..3fae405
--- /dev/null
+++ b/nutch-plugins/index-basic/src/main/java/org/apache/nutch/indexer/basic/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>A basic indexing plugin, adds basic fields: url, host, title, content, etc.</p><p></p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java b/nutch-plugins/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
new file mode 100644
index 0000000..4bc317e
--- /dev/null
+++ b/nutch-plugins/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.basic;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.basic.BasicIndexingFilter;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+import java.util.Date;
+
+/**
+ * JUnit test case which tests (1) that basic searchable fields are added to a
+ * document, (2) that domain is added as per {@code indexer.add.domain} in
+ * nutch-default.xml, (3) that title is truncated as per
+ * {@code indexer.max.title.length} in nutch-default.xml, and (4) that content
+ * is truncated as per {@code indexer.max.content.length} in nutch-default.xml.
+ * 
+ * @author tejasp
+ * 
+ */
+
+public class TestBasicIndexingFilter {
+
+  @Test
+  public void testBasicIndexingFilter() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    conf.setInt("indexer.max.title.length", 10);
+    conf.setBoolean("indexer.add.domain", true);
+    conf.setInt("indexer.max.content.length", 20);
+
+    BasicIndexingFilter filter = new BasicIndexingFilter();
+    filter.setConf(conf);
+    Assert.assertNotNull(filter);
+
+    NutchDocument doc = new NutchDocument();
+
+    String title = "The Foo Page";
+    Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
+    Metadata metaData = new Metadata();
+    metaData.add("Language", "en/us");
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
+        outlinks, metaData);
+    ParseImpl parse = new ParseImpl(
+        "this is a sample foo bar page. hope you enjoy it.", parseData);
+
+    CrawlDatum crawlDatum = new CrawlDatum();
+    crawlDatum.setFetchTime(100L);
+
+    Inlinks inlinks = new Inlinks();
+
+    try {
+      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
+          crawlDatum, inlinks);
+    } catch (Exception e) {
+      e.printStackTrace();
+      Assert.fail(e.getMessage());
+    }
+    Assert.assertNotNull(doc);
+    Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc
+        .getField("title").getValues().get(0));
+    Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc
+        .getField("domain").getValues().get(0));
+    Assert.assertEquals("test host, expect \"nutch.apache.org\"",
+        "nutch.apache.org", doc.getField("host").getValues().get(0));
+    Assert.assertEquals(
+        "test url, expect \"http://nutch.apache.org/index.html\"",
+        "http://nutch.apache.org/index.html", doc.getField("url").getValues()
+            .get(0));
+    Assert.assertEquals("test content", "this is a sample foo",
+        doc.getField("content").getValues().get(0));
+    Assert.assertEquals("test fetch time", new Date(100L),
+        (Date) doc.getField("tstamp").getValues().get(0));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-geoip/build-ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-geoip/build-ivy.xml b/nutch-plugins/index-geoip/build-ivy.xml
new file mode 100644
index 0000000..2cda7e9
--- /dev/null
+++ b/nutch-plugins/index-geoip/build-ivy.xml
@@ -0,0 +1,54 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-geoip" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
+
+    <property name="ivy.install.version" value="2.1.0" />
+    <condition property="ivy.home" value="${env.IVY_HOME}">
+      <isset property="env.IVY_HOME" />
+    </condition>
+    <property name="ivy.home" value="${user.home}/.ant" />
+    <property name="ivy.checksums" value="" />
+    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
+    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
+
+    <target name="download-ivy" unless="offline">
+
+        <mkdir dir="${ivy.jar.dir}"/>
+        <!-- download Ivy from web site so that it can be used even without any special installation -->
+        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
+             dest="${ivy.jar.file}" usetimestamp="true"/>
+    </target>
+
+    <target name="init-ivy" depends="download-ivy">
+      <!-- try to load ivy here from ivy home, in case the user has not already dropped
+              it into ant's lib dir (note that the latter copy will always take precedence).
+              We will not fail as long as local lib dir exists (it may be empty) and
+              ivy is in at least one of ant's lib dir or the local lib dir. -->
+        <path id="ivy.lib.path">
+            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
+
+        </path>
+        <taskdef resource="org/apache/ivy/ant/antlib.xml"
+                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
+    </target>
+
+  <target name="deps-jar" depends="init-ivy">
+    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
+  </target>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-geoip/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-geoip/build.xml b/nutch-plugins/index-geoip/build.xml
new file mode 100644
index 0000000..92fda82
--- /dev/null
+++ b/nutch-plugins/index-geoip/build.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-geoip" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+  <target name="init-plugin">
+    <echo>Copying MaxMind GeoIP .mmdb files to build</echo>
+    <copy todir="${build.classes}">
+      <fileset dir="${src.dir}" includes="**/*.mmdb" />
+    </copy>
+  </target>
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-geoip/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-geoip/ivy.xml b/nutch-plugins/index-geoip/ivy.xml
new file mode 100644
index 0000000..1b626f0
--- /dev/null
+++ b/nutch-plugins/index-geoip/ivy.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.3.1" >
+      <!-- Exclude due to classpath issues -->
+      <exclude org="org.apache.httpcomponents" name="httpclient" />
+      <exclude org="org.apache.httpcomponents" name="httpcore" />
+    </dependency>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-geoip/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-geoip/plugin.xml b/nutch-plugins/index-geoip/plugin.xml
new file mode 100644
index 0000000..214fbd0
--- /dev/null
+++ b/nutch-plugins/index-geoip/plugin.xml
@@ -0,0 +1,51 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="index-geoip"
+   name="GeoIP2 Indexing Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="index-geoip.jar">
+         <export name="*"/>
+      </library>
+      <library name="commons-codec-1.6.jar"/>
+      <library name="commons-logging-1.1.1.jar"/>
+      <library name="geoip2-2.3.1.jar"/>
+      <library name="google-http-client-1.20.0.jar"/>
+      <library name="jackson-annotations-2.5.0.jar"/>
+      <library name="jackson-core-2.5.3.jar"/>
+      <library name="jackson-databind-2.5.3.jar"/>
+      <library name="jsr305-1.3.9.jar"/>
+      <library name="maxmind-db-1.0.0.jar"/>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.indexer.geoip"
+              name="Nutch GeoIP2 Indexing Filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="GeoIPIndexingFilter"
+                      class="org.apache.nutch.indexer.geoip.GeoIPIndexingFilter"/>
+   </extension>
+
+</plugin>
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-geoip/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-geoip/pom.xml b/nutch-plugins/index-geoip/pom.xml
new file mode 100644
index 0000000..1238982
--- /dev/null
+++ b/nutch-plugins/index-geoip/pom.xml
@@ -0,0 +1,55 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>index-geoip</artifactId>
+    <packaging>jar</packaging>
+
+    <name>index-geoip</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>com.maxmind.geoip2</groupId>
+            <artifactId>geoip2</artifactId>
+            <version>2.3.1</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.apache.httpcomponents</groupId>
+                    <artifactId>httpclient</artifactId>
+                </exclusion>
+                <exclusion>
+                    <groupId>org.apache.httpcomponents</groupId>
+                    <artifactId>httpcore</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java b/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
new file mode 100644
index 0000000..88d78ef
--- /dev/null
+++ b/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
@@ -0,0 +1,210 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.geoip;
+
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.UnknownHostException;
+
+import org.apache.nutch.indexer.NutchDocument;
+
+import com.maxmind.geoip2.DatabaseReader;
+import com.maxmind.geoip2.WebServiceClient;
+import com.maxmind.geoip2.exception.GeoIp2Exception;
+import com.maxmind.geoip2.model.InsightsResponse;
+import com.maxmind.geoip2.model.CityResponse;
+import com.maxmind.geoip2.model.ConnectionTypeResponse;
+import com.maxmind.geoip2.model.CountryResponse;
+import com.maxmind.geoip2.model.DomainResponse;
+import com.maxmind.geoip2.model.IspResponse;
+import com.maxmind.geoip2.record.City;
+import com.maxmind.geoip2.record.Continent;
+import com.maxmind.geoip2.record.Country;
+import com.maxmind.geoip2.record.Location;
+import com.maxmind.geoip2.record.Postal;
+import com.maxmind.geoip2.record.RepresentedCountry;
+import com.maxmind.geoip2.record.Subdivision;
+import com.maxmind.geoip2.record.Traits;
+
+/**
+ * <p>
+ * Simple utility class which enables efficient, structured
+ * {@link org.apache.nutch.indexer.NutchDocument} building based on input from
+ * {@link GeoIPIndexingFilter}, where configuration is also read.
+ * </p>
+ * <p>
+ * Based on the nature of the input, this class wraps factory type
+ * implementations for populating {@link org.apache.nutch.indexer.NutchDocument}s
+ * with the correct {@link org.apache.nutch.indexer.NutchField} information.
+ * </p>
+ */
+public class GeoIPDocumentCreator {
+
+  /**
+   * Default constructor.
+   */
+  public GeoIPDocumentCreator() {
+  }
+
+  public static NutchDocument createDocFromInsightsService(String serverIp,
+      NutchDocument doc, WebServiceClient client) throws UnknownHostException,
+      IOException, GeoIp2Exception {
+    doc.add("ip", serverIp);
+    InsightsResponse response = client
+        .insights(InetAddress.getByName(serverIp));
+    // CityResponse response = client.city(InetAddress.getByName(serverIp));
+
+    City city = response.getCity();
+    doc.add("cityName", city.getName()); // 'Minneapolis'
+    doc.add("cityConfidence", city.getConfidence()); // 50
+    doc.add("cityGeoNameId", city.getGeoNameId());
+
+    Continent continent = response.getContinent();
+    doc.add("continentCode", continent.getCode());
+    doc.add("continentGeoNameId", continent.getGeoNameId());
+    doc.add("continentName", continent.getName());
+
+    Country country = response.getCountry();
+    doc.add("countryIsoCode", country.getIsoCode()); // 'US'
+    doc.add("countryName", country.getName()); // 'United States'
+    doc.add("countryConfidence", country.getConfidence()); // 99
+    doc.add("countryGeoName", country.getGeoNameId());
+
+    Location location = response.getLocation();
+    doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
+                                                                               // -93.2323
+    doc.add("accRadius", location.getAccuracyRadius()); // 3
+    doc.add("timeZone", location.getTimeZone()); // 'America/Chicago'
+    doc.add("metroCode", location.getMetroCode());
+
+    Postal postal = response.getPostal();
+    doc.add("postalCode", postal.getCode()); // '55455'
+    doc.add("postalConfidence", postal.getConfidence()); // 40
+
+    RepresentedCountry rCountry = response.getRepresentedCountry();
+    doc.add("countryType", rCountry.getType());
+
+    Subdivision subdivision = response.getMostSpecificSubdivision();
+    doc.add("subDivName", subdivision.getName()); // 'Minnesota'
+    doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN'
+    doc.add("subDivConfidence", subdivision.getConfidence()); // 90
+    doc.add("subDivGeoNameId", subdivision.getGeoNameId());
+
+    Traits traits = response.getTraits();
+    doc.add("autonSystemNum", traits.getAutonomousSystemNumber());
+    doc.add("autonSystemOrg", traits.getAutonomousSystemOrganization());
+    doc.add("domain", traits.getDomain());
+    doc.add("isp", traits.getIsp());
+    doc.add("org", traits.getOrganization());
+    doc.add("userType", traits.getUserType());
+    doc.add("isAnonProxy", traits.isAnonymousProxy());
+    doc.add("isSatelliteProv", traits.isSatelliteProvider());
+    return doc;
+  }
+
+  @SuppressWarnings("unused")
+  public static NutchDocument createDocFromCityService(String serverIp,
+      NutchDocument doc, WebServiceClient client) throws UnknownHostException,
+      IOException, GeoIp2Exception {
+    CityResponse response = client.city(InetAddress.getByName(serverIp));
+    return doc;
+  }
+
+  @SuppressWarnings("unused")
+  public static NutchDocument createDocFromCountryService(String serverIp,
+      NutchDocument doc, WebServiceClient client) throws UnknownHostException,
+      IOException, GeoIp2Exception {
+    CountryResponse response = client.country(InetAddress.getByName(serverIp));
+    return doc;
+  }
+
+  public static NutchDocument createDocFromIspDb(String serverIp,
+      NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
+      IOException, GeoIp2Exception {
+    IspResponse response = reader.isp(InetAddress.getByName(serverIp));
+    doc.add("ip", serverIp);
+    doc.add("autonSystemNum", response.getAutonomousSystemNumber());
+    doc.add("autonSystemOrg", response.getAutonomousSystemOrganization());
+    doc.add("isp", response.getIsp());
+    doc.add("org", response.getOrganization());
+    return doc;
+  }
+
+  public static NutchDocument createDocFromDomainDb(String serverIp,
+      NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
+      IOException, GeoIp2Exception {
+    DomainResponse response = reader.domain(InetAddress.getByName(serverIp));
+    doc.add("ip", serverIp);
+    doc.add("domain", response.getDomain());
+    return doc;
+  }
+
+  public static NutchDocument createDocFromConnectionDb(String serverIp,
+      NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
+      IOException, GeoIp2Exception {
+    ConnectionTypeResponse response = reader.connectionType(InetAddress
+        .getByName(serverIp));
+    doc.add("ip", serverIp);
+    doc.add("connType", response.getConnectionType().toString());
+    return doc;
+  }
+
+  public static NutchDocument createDocFromCityDb(String serverIp,
+      NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
+      IOException, GeoIp2Exception {
+    doc.add("ip", serverIp);
+    CityResponse response = reader.city(InetAddress.getByName(serverIp));
+
+    City city = response.getCity();
+    doc.add("cityName", city.getName()); // 'Minneapolis'
+    doc.add("cityConfidence", city.getConfidence()); // 50
+    doc.add("cityGeoNameId", city.getGeoNameId());
+
+    Continent continent = response.getContinent();
+    doc.add("continentCode", continent.getCode());
+    doc.add("continentGeoNameId", continent.getGeoNameId());
+    doc.add("continentName", continent.getName());
+
+    Country country = response.getCountry();
+    doc.add("countryIsoCode", country.getIsoCode()); // 'US'
+    doc.add("countryName", country.getName()); // 'United States'
+    doc.add("countryConfidence", country.getConfidence()); // 99
+    doc.add("countryGeoName", country.getGeoNameId());
+
+    Location location = response.getLocation();
+    doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
+                                                                               // -93.2323
+    doc.add("accRadius", location.getAccuracyRadius()); // 3
+    doc.add("timeZone", location.getTimeZone()); // 'America/Chicago'
+    doc.add("metroCode", location.getMetroCode());
+
+    Postal postal = response.getPostal();
+    doc.add("postalCode", postal.getCode()); // '55455'
+    doc.add("postalConfidence", postal.getConfidence()); // 40
+
+    RepresentedCountry rCountry = response.getRepresentedCountry();
+    doc.add("countryType", rCountry.getType());
+
+    Subdivision subdivision = response.getMostSpecificSubdivision();
+    doc.add("subDivName", subdivision.getName()); // 'Minnesota'
+    doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN'
+    doc.add("subDivConfidence", subdivision.getConfidence()); // 90
+    doc.add("subDivGeoNameId", subdivision.getGeoNameId());
+    return doc;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java b/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
new file mode 100644
index 0000000..f515f1f
--- /dev/null
+++ b/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
@@ -0,0 +1,241 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.geoip;
+
+import java.io.File;
+import java.io.IOException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.maxmind.geoip2.DatabaseReader;
+import com.maxmind.geoip2.WebServiceClient;
+
+/**
+ * <p>
+ * This plugin implements an indexing filter which takes advantage of the <a
+ * href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>.
+ * </p>
+ * <p>
+ * The third party library distribution provides an API for the GeoIP2 <a
+ * href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web
+ * services</a> and <a
+ * href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>. The
+ * API also works with the free <a
+ * href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2 databases</a>.
+ * </p>
+ * <p>
+ * Depending on the service level agreement you have with the GeoIP service
+ * provider, the plugin can add a number of the following fields to the index
+ * data model:
+ * <ol>
+ * <li>Continent</li>
+ * <li>Country</li>
+ * <li>Regional Subdivision</li>
+ * <li>City</li>
+ * <li>Postal Code</li>
+ * <li>Latitude/Longitude</li>
+ * <li>ISP/Organization</li>
+ * <li>AS Number</li>
+ * <li>Confidence Factors</li>
+ * <li>Radius</li>
+ * <li>User Type</li>
+ * </ol>
+ * </p>
+ * 
+ * <p>
+ * Some of the services are documented at the <a
+ * href="https://www.maxmind.com/en/geoip2-precision-services">GeoIP2 Precision
+ * Services</a> webpage where more information can be obtained.
+ * </p>
+ * 
+ * <p>
+ * You should also consult the following three properties in
+ * <code>nutch-site.xml</code>
+ * </p>
+ * 
+ * <pre>
+ *  {@code
+ * <!-- index-geoip plugin properties -->
+ * <property>
+ *   <name>index.geoip.usage</name>
+ *   <value>insightsService</value>
+ *   <description>
+ *   A string representing the information source to be used for GeoIP information
+ *   association. Either enter 'cityDatabase', 'connectionTypeDatabase', 
+ *   'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the 
+ *   Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb, 
+ *   GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the Hadoop classpath 
+ *   and available at runtime. This can be achieved by adding it to $NUTCH_HOME/conf
+ *   </description>
+ * </property>
+ * 
+ * <property>
+ *   <name>index.geoip.userid</name>
+ *   <value></value>
+ *   <description>
+ *   The userId associated with the GeoIP2 Precision Services account.
+ *   </description>
+ * </property>
+ * 
+ * <property>
+ *   <name>index.geoip.licensekey</name>
+ *   <value></value>
+ *   <description>
+ *   The license key associated with the GeoIP2 Precision Services account.
+ *   </description>
+ * </property>
+ * }
+ * </pre>
+ * 
+ */
+public class GeoIPIndexingFilter implements IndexingFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(GeoIPIndexingFilter.class);
+
+  private Configuration conf;
+
+  private String usage = null;
+
+  private File geoDb = null;
+
+  WebServiceClient client = null;
+
+  DatabaseReader reader = null;
+
+  // private AbstractResponse response = null;
+
+  /**
+   * Default constructor for this plugin
+   */
+  public GeoIPIndexingFilter() {
+  }
+
+  /**
+   * @see org.apache.hadoop.conf.Configurable#getConf()
+   */
+  @Override
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * @see org.apache.hadoop.conf.Configurable#setConf(org.apache.hadoop.conf.Configuration)
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    String use = conf.get("index.geoip.usage", "insightsService");
+    LOG.debug("GeoIP usage medium set to: {}", use);
+    if (use.equalsIgnoreCase("cityDatabase")) {
+      try {
+        geoDb = new File(conf.getResource("GeoIP2-City.mmdb").getFile());
+        buildDb();
+      } catch (Exception e) {
+        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      }
+    } else if (use.equalsIgnoreCase("connectionTypeDatabase")) {
+      try {
+        geoDb = new File(conf.getResource("GeoIP2-Connection-Type.mmdb")
+            .getFile());
+        buildDb();
+      } catch (Exception e) {
+        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      }
+    } else if (use.equalsIgnoreCase("domainDatabase")) {
+      try {
+        geoDb = new File(conf.getResource("GeoIP2-Domain.mmdb").getFile());
+        buildDb();
+      } catch (Exception e) {
+        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      }
+    } else if (use.equalsIgnoreCase("ispDatabase")) {
+      try {
+        geoDb = new File(conf.getResource("GeoIP2-ISP.mmdb").getFile());
+        buildDb();
+      } catch (Exception e) {
+        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+      }
+    } else if (use.equalsIgnoreCase("insightsService")) {
+      client = new WebServiceClient.Builder(conf.getInt("index.geoip.userid",
+          12345), conf.get("index.geoip.licensekey")).build();
+    }
+    usage = use;
+  }
+
+  private void buildDb() {
+    try {
+      reader = new DatabaseReader.Builder(geoDb).build();
+    } catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    }
+  }
+
+  /**
+   * 
+   * @see org.apache.nutch.indexer.IndexingFilter#filter(org.apache.nutch.indexer.NutchDocument,
+   *      org.apache.nutch.parse.Parse, org.apache.hadoop.io.Text,
+   *      org.apache.nutch.crawl.CrawlDatum, org.apache.nutch.crawl.Inlinks)
+   */
+  @Override
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    return addServerGeo(doc, parse.getData(), url.toString());
+  }
+
+  private NutchDocument addServerGeo(NutchDocument doc, ParseData data,
+      String url) {
+
+    if (conf.getBoolean("store.ip.address", false) == true) {
+      try {
+        String serverIp = data.getContentMeta().get("_ip_");
+        if (serverIp != null) {
+          if (usage.equalsIgnoreCase("cityDatabase")) {
+            doc = GeoIPDocumentCreator.createDocFromCityDb(serverIp, doc,
+                reader);
+          } else if (usage.equalsIgnoreCase("connectionTypeDatabase")) {
+            doc = GeoIPDocumentCreator.createDocFromConnectionDb(serverIp, doc,
+                reader);
+          } else if (usage.equalsIgnoreCase("domainDatabase")) {
+            doc = GeoIPDocumentCreator.createDocFromDomainDb(serverIp, doc,
+                reader);
+          } else if (usage.equalsIgnoreCase("ispDatabase")) {
+            doc = GeoIPDocumentCreator
+                .createDocFromIspDb(serverIp, doc, reader);
+          } else if (usage.equalsIgnoreCase("insightsService")) {
+            doc = GeoIPDocumentCreator.createDocFromInsightsService(serverIp,
+                doc, client);
+          }
+        }
+      } catch (Exception e) {
+        LOG.error(e.getMessage());
+        e.printStackTrace();
+      }
+    }
+    return doc;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/package-info.java b/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/package-info.java
new file mode 100644
index 0000000..ba62519
--- /dev/null
+++ b/nutch-plugins/index-geoip/src/main/java/org/apache/nutch/indexer/geoip/package-info.java
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * <p>This plugin implements an indexing filter which takes 
+ * advantage of the 
+ * <a href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>.</p>
+ * <p>The third party library distribution provides an API for the GeoIP2 
+ * <a href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web services</a> 
+ * and <a href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>. 
+ * The API also works with the free 
+ * <a href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2 databases</a>.</p>
+ *
+ */
+package org.apache.nutch.indexer.geoip;
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-links/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/build.xml b/nutch-plugins/index-links/build.xml
new file mode 100644
index 0000000..b853ccf
--- /dev/null
+++ b/nutch-plugins/index-links/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-links" default="jar-core">
+
+    <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-links/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/ivy.xml b/nutch-plugins/index-links/ivy.xml
new file mode 100644
index 0000000..0a363f7
--- /dev/null
+++ b/nutch-plugins/index-links/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-links/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/plugin.xml b/nutch-plugins/index-links/plugin.xml
new file mode 100644
index 0000000..dfdc5d2
--- /dev/null
+++ b/nutch-plugins/index-links/plugin.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+    id="index-links"
+    name="Index inlinks and outlinks"
+    version="1.0.0"
+    provider-name="nutch.org">
+
+    <runtime>
+        <library name="index-links.jar">
+            <export name="*"/>
+        </library>
+    </runtime>
+
+    <requires>
+        <import plugin="nutch-extensionpoints"/>
+    </requires>
+
+    <extension id="org.apache.nutch.indexer.links.LinksIndexingFilter"
+               name="Links indexing filter"
+               point="org.apache.nutch.indexer.IndexingFilter">
+        <implementation id="org.apache.nutch.indexer.links.LinksIndexingFilter"
+                        class="org.apache.nutch.indexer.links.LinksIndexingFilter"/>
+    </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-links/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/pom.xml b/nutch-plugins/index-links/pom.xml
new file mode 100644
index 0000000..e5e3a7f
--- /dev/null
+++ b/nutch-plugins/index-links/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>index-links</artifactId>
+    <packaging>jar</packaging>
+
+    <name>index-links</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-links/src/main/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/src/main/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java b/nutch-plugins/index-links/src/main/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
new file mode 100644
index 0000000..975df66
--- /dev/null
+++ b/nutch-plugins/index-links/src/main/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
@@ -0,0 +1,167 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.links;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlink;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.slf4j.LoggerFactory;
+
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Locale;
+import java.util.Set;
+
+/**
+ * An {@link org.apache.nutch.indexer.IndexingFilter} that adds
+ * <code>outlinks</code> and <code>inlinks</code> field(s) to the document.
+ *
+ * In case that you want to ignore the outlinks that point to the same host
+ * as the URL being indexed use the following settings in your configuration
+ * file:
+ *
+ * <property>
+ *   <name>index.links.outlinks.host.ignore</name>
+ *   <value>true</value>
+ * </property>
+ *
+ * The same configuration is available for inlinks:
+ *
+ * <property>
+ *   <name>index.links.inlinks.host.ignore</name>
+ *   <value>true</value>
+ * </property>
+ *
+ * To store only the host portion of each inlink URL or outlink URL add the
+ * following to your configuration file.
+ *
+ * <property>
+ *   <name>index.links.hosts.only</name>
+ *   <value>true</value>
+ * </property>
+ *
+ */
+public class LinksIndexingFilter implements IndexingFilter {
+
+  public final static String LINKS_OUTLINKS_HOST = "index.links.outlinks.host.ignore";
+  public final static String LINKS_INLINKS_HOST = "index.links.inlinks.host.ignore";
+  public final static String LINKS_ONLY_HOSTS = "index.links.hosts.only";
+
+  public final static org.slf4j.Logger LOG = LoggerFactory
+      .getLogger(LinksIndexingFilter.class);
+
+  private Configuration conf;
+  private boolean filterOutlinks;
+  private boolean filterInlinks;
+  private boolean indexHost;
+
+  @Override
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    // Add the outlinks
+    Outlink[] outlinks = parse.getData().getOutlinks();
+
+    if (outlinks != null) {
+      Set<String> hosts = new HashSet<String>();
+
+      for (Outlink outlink : outlinks) {
+        try {
+          String linkUrl = outlink.getToUrl();
+          String outHost = new URL(linkUrl).getHost().toLowerCase();
+
+          if (indexHost) {
+            linkUrl = outHost;
+
+            if (hosts.contains(linkUrl))
+              continue;
+
+            hosts.add(linkUrl);
+          }
+
+          addFilteredLink("outlinks", url.toString(), linkUrl, outHost,
+              filterOutlinks, doc);
+        } catch (MalformedURLException e) {
+          LOG.error("Malformed URL in {}: {}", url, e.getMessage());
+        }
+      }
+    }
+
+    // Add the inlinks
+    if (null != inlinks) {
+      Iterator<Inlink> iterator = inlinks.iterator();
+      Set<String> inlinkHosts = new HashSet<String>();
+
+      while (iterator.hasNext()) {
+        try {
+          Inlink link = iterator.next();
+          String linkUrl = link.getFromUrl();
+          String inHost = new URL(linkUrl).getHost().toLowerCase();
+
+          if (indexHost) {
+            linkUrl = inHost;
+
+            if (inlinkHosts.contains(linkUrl))
+              continue;
+
+            inlinkHosts.add(linkUrl);
+          }
+
+          addFilteredLink("inlinks", url.toString(), linkUrl, inHost,
+              filterInlinks, doc);
+        } catch (MalformedURLException e) {
+          LOG.error("Malformed URL in {}: {}", url, e.getMessage());
+        }
+      }
+    }
+
+    return doc;
+  }
+
+  private void addFilteredLink(String fieldName, String url, String linkUrl,
+      String urlHost, boolean filter, NutchDocument doc) throws MalformedURLException {
+      if (filter) {
+        String host = new URL(url.toString()).getHost().toLowerCase();
+
+        if (!host.equalsIgnoreCase(urlHost)) {
+          doc.add(fieldName, linkUrl);
+        }
+      } else {
+        doc.add(fieldName, linkUrl);
+      }
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    filterOutlinks = conf.getBoolean(LINKS_OUTLINKS_HOST, false);
+    filterInlinks = conf.getBoolean(LINKS_INLINKS_HOST, false);
+
+    indexHost = conf.getBoolean(LINKS_ONLY_HOSTS, false);
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java b/nutch-plugins/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
new file mode 100644
index 0000000..c490d1f
--- /dev/null
+++ b/nutch-plugins/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
@@ -0,0 +1,218 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.links;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlink;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.parse.ParseStatus;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.net.URL;
+import java.util.Iterator;
+
+public class TestLinksIndexingFilter {
+
+  Configuration conf = NutchConfiguration.create();
+  LinksIndexingFilter filter = new LinksIndexingFilter();
+  Metadata metadata = new Metadata();
+
+  @Before
+  public void setUp() throws Exception {
+    metadata.add(Response.CONTENT_TYPE, "text/html");
+  }
+
+  /** Two outlinks pointing at the bare hosts. */
+  private Outlink[] generateOutlinks() throws Exception {
+    return generateOutlinks(false);
+  }
+
+  /**
+   * Two outlinks, one per host; with <code>parts</code> set each URL also
+   * carries a path and a query string.
+   */
+  private Outlink[] generateOutlinks(boolean parts) throws Exception {
+    Outlink toTest = new Outlink("http://www.test.com", "test");
+    Outlink toExample = new Outlink("http://www.example.com", "example");
+
+    if (parts) {
+      toTest = new Outlink(toTest.getToUrl() + "/index.php?param=1", "test");
+      toExample = new Outlink(toExample.getToUrl() + "/index.php?param=2",
+          "test");
+    }
+
+    return new Outlink[] { toTest, toExample };
+  }
+
+  /** One inlink from each of the two hosts. */
+  private Inlinks generateInlinks() {
+    Inlinks result = new Inlinks();
+    result.add(new Inlink("http://www.test.com", "test"));
+    result.add(new Inlink("http://www.example.com", "example"));
+    return result;
+  }
+
+  /**
+   * Runs the filter for the page <code>http://www.example.com/</code> with
+   * the given links on a fresh document.
+   */
+  private NutchDocument runFilter(Outlink[] outlinks, Inlinks inlinks)
+      throws Exception {
+    ParseData parseData = new ParseData(new ParseStatus(), "title", outlinks,
+        metadata);
+    return filter.filter(new NutchDocument(), new ParseImpl("text", parseData),
+        new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
+  }
+
+  @Test
+  public void testFilterOutlinks() throws Exception {
+    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
+    filter.setConf(conf);
+
+    Outlink[] outlinks = generateOutlinks();
+    NutchDocument doc = runFilter(outlinks, new Inlinks());
+
+    Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
+
+    Assert.assertEquals("Filter outlinks, allow only those from a different host",
+        outlinks[0].getToUrl(), doc.getFieldValue("outlinks"));
+  }
+
+  @Test
+  public void testFilterInlinks() throws Exception {
+    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
+    filter.setConf(conf);
+
+    NutchDocument doc = runFilter(new Outlink[0], generateInlinks());
+
+    Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
+
+    Assert.assertEquals("Filter inlinks, allow only those from a different host",
+        "http://www.test.com", doc.getFieldValue("inlinks"));
+  }
+
+  @Test
+  public void testNoFilterOutlinks() throws Exception {
+    filter.setConf(conf);
+
+    Outlink[] outlinks = generateOutlinks();
+    NutchDocument doc = runFilter(outlinks, new Inlinks());
+
+    Assert.assertEquals("All outlinks must be indexed even those from the same host",
+        outlinks.length, doc.getField("outlinks").getValues().size());
+  }
+
+  @Test
+  public void testNoFilterInlinks() throws Exception {
+    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false");
+    filter.setConf(conf);
+
+    Inlinks inlinks = generateInlinks();
+    NutchDocument doc = runFilter(new Outlink[0], inlinks);
+
+    Assert.assertEquals("All inlinks must be indexed even those from the same host",
+        inlinks.size(), doc.getField("inlinks").getValues().size());
+  }
+
+  @Test
+  public void testIndexOnlyHostPart() throws Exception {
+    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
+    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
+    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
+    filter.setConf(conf);
+
+    Outlink[] outlinks = generateOutlinks(true);
+
+    // Two inlinks share a host, the third one comes from the page's own host.
+    Inlinks inlinks = new Inlinks();
+    inlinks.add(new Inlink("http://www.test.com/one-awesome-page", "test"));
+    inlinks.add(new Inlink("http://www.test.com/other-awesome-page", "test"));
+    inlinks.add(new Inlink("http://www.example.com/my-first-awesome-example",
+        "example"));
+
+    NutchDocument doc = runFilter(outlinks, inlinks);
+
+    NutchField docOutlinks = doc.getField("outlinks");
+
+    Assert.assertEquals("Only the host portion of the outlink URL must be indexed",
+        new URL("http://www.test.com").getHost(),
+        docOutlinks.getValues().get(0));
+
+    Assert.assertEquals(
+        "The inlinks coming from the same host must count only once", 1,
+        doc.getField("inlinks").getValues().size());
+
+    Assert.assertEquals("Only the host portion of the inlinks URL must be indexed",
+        new URL("http://www.test.com").getHost(), doc.getFieldValue("inlinks"));
+  }
+
+  @Test
+  public void testIndexHostsOnlyAndFilterOutlinks() throws Exception {
+    conf = NutchConfiguration.create();
+    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
+    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
+
+    Outlink[] outlinks = generateOutlinks(true);
+
+    filter.setConf(conf);
+
+    NutchDocument doc = runFilter(outlinks, new Inlinks());
+
+    Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
+
+    Assert.assertEquals(
+        "Index only the host portion of the outlinks after filtering",
+        new URL("http://www.test.com").getHost(),
+        doc.getFieldValue("outlinks"));
+  }
+
+  @Test
+  public void testIndexHostsOnlyAndFilterInlinks() throws Exception {
+    conf = NutchConfiguration.create();
+    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
+    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
+
+    filter.setConf(conf);
+
+    NutchDocument doc = runFilter(new Outlink[0], generateInlinks());
+
+    Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
+
+    Assert.assertEquals(
+        "Index only the host portion of the inlinks after filtering",
+        new URL("http://www.test.com").getHost(),
+        doc.getFieldValue("inlinks"));
+
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java b/nutch-plugins/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java
new file mode 100644
index 0000000..aaaedbf
--- /dev/null
+++ b/nutch-plugins/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import org.junit.Test;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.*;
+
+public class TestOutlinks {
+  /** Adding one instance twice must leave a single element in the set. */
+  @Test
+  public void testAddSameObject() throws Exception {
+    Outlink link = new Outlink("http://www.example.com", "Example");
+    Set<Outlink> unique = new HashSet<>();
+    unique.add(link);
+    unique.add(link);
+
+    assertEquals("Adding the same Outlink twice", 1, unique.size());
+  }
+
+  /** Two instances built from the same URL and anchor must deduplicate. */
+  @Test
+  public void testAddOtherObjectWithSameData() throws Exception {
+    Outlink first = new Outlink("http://www.example.com", "Example");
+    Outlink second = new Outlink("http://www.example.com", "Example");
+
+    // equals() is checked first; the set check below also relies on hashCode().
+    assertTrue("The two Outlink objects are the same", first.equals(second));
+
+    Set<Outlink> unique = new HashSet<>();
+    unique.add(first);
+    unique.add(second);
+
+    assertEquals("The set should contain only 1 Outlink", 1, unique.size());
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-metadata/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-metadata/build.xml b/nutch-plugins/index-metadata/build.xml
new file mode 100644
index 0000000..ad96d11
--- /dev/null
+++ b/nutch-plugins/index-metadata/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-metadata" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-metadata/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-metadata/ivy.xml b/nutch-plugins/index-metadata/ivy.xml
new file mode 100644
index 0000000..24d7606
--- /dev/null
+++ b/nutch-plugins/index-metadata/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-metadata/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-metadata/plugin.xml b/nutch-plugins/index-metadata/plugin.xml
new file mode 100644
index 0000000..4d4c9a7
--- /dev/null
+++ b/nutch-plugins/index-metadata/plugin.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="index-metadata"
+   name="Index Metadata"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+    <runtime>
+      <library name="index-metadata.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+
+   <extension id="org.apache.nutch.indexer.metadata"
+              name="Nutch metadata indexer"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="MetadataIndexer"
+                      class="org.apache.nutch.indexer.metadata.MetadataIndexer"/>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-metadata/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-metadata/pom.xml b/nutch-plugins/index-metadata/pom.xml
new file mode 100644
index 0000000..bef1b9a
--- /dev/null
+++ b/nutch-plugins/index-metadata/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>index-metadata</artifactId>
+    <packaging>jar</packaging>
+
+    <name>index-metadata</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java b/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
new file mode 100644
index 0000000..78718aa
--- /dev/null
+++ b/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.metadata;
+
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+
+/**
+ * Indexer which can be configured to extract metadata from the crawldb, parse
+ * metadata or content metadata. You can specify the properties "index.db.md",
+ * "index.parse.md" or "index.content.md" who's values are comma-delimited
+ * <value>key1,key2,key3</value>.
+ */
+public class MetadataIndexer implements IndexingFilter {
+  private Configuration conf;
+  private String[] dbFieldnames;
+  private Map<String, String> parseFieldnames;
+  private String[] contentFieldnames;
+  private static final String db_CONF_PROPERTY = "index.db.md";
+  private static final String parse_CONF_PROPERTY = "index.parse.md";
+  private static final String content_CONF_PROPERTY = "index.content.md";
+
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    // just in case
+    if (doc == null)
+      return doc;
+
+    // add the fields from crawldb
+    if (dbFieldnames != null) {
+      for (String metatag : dbFieldnames) {
+        Text metadata = (Text) datum.getMetaData().get(new Text(metatag));
+        if (metadata != null)
+          doc.add(metatag, metadata.toString());
+      }
+    }
+
+    // add the fields from parsemd
+    if (parseFieldnames != null) {
+      for (String metatag : parseFieldnames.keySet()) {
+        for (String value : parse.getData().getParseMeta().getValues(metatag)) {
+          if (value != null)
+            doc.add(parseFieldnames.get(metatag), value);
+        }
+      }
+    }
+
+    // add the fields from contentmd
+    if (contentFieldnames != null) {
+      for (String metatag : contentFieldnames) {
+        for (String value : parse.getData().getContentMeta().getValues(metatag)) {
+          if (value != null)
+            doc.add(metatag, value);
+        }
+      }
+    }
+
+    return doc;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    dbFieldnames = conf.getStrings(db_CONF_PROPERTY);
+    parseFieldnames = new HashMap<String, String>();
+    for (String metatag : conf.getStrings(parse_CONF_PROPERTY)) {
+      parseFieldnames.put(metatag.toLowerCase(Locale.ROOT), metatag);
+    }
+    contentFieldnames = conf.getStrings(content_CONF_PROPERTY);
+
+    // TODO check conflict between field names e.g. could have same label
+    // from different sources
+
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/package-info.java b/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/package-info.java
new file mode 100644
index 0000000..8f2bee5
--- /dev/null
+++ b/nutch-plugins/index-metadata/src/main/java/org/apache/nutch/indexer/metadata/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Indexing filter to add document metadata to the index.
+ * Metadata may come from CrawlDb, parse or content metadata.
+ */
+package org.apache.nutch.indexer.metadata;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-more/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/build.xml b/nutch-plugins/index-more/build.xml
new file mode 100644
index 0000000..dec1e12
--- /dev/null
+++ b/nutch-plugins/index-more/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="index-more" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-more/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/ivy.xml b/nutch-plugins/index-more/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/index-more/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-more/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/plugin.xml b/nutch-plugins/index-more/plugin.xml
new file mode 100644
index 0000000..d920f72
--- /dev/null
+++ b/nutch-plugins/index-more/plugin.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="index-more"
+   name="More Indexing Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="index-more.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.indexer.more"
+              name="Nutch More Indexing Filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="MoreIndexingFilter"
+                      class="org.apache.nutch.indexer.more.MoreIndexingFilter"/>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/index-more/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/index-more/pom.xml b/nutch-plugins/index-more/pom.xml
new file mode 100644
index 0000000..80e5de0
--- /dev/null
+++ b/nutch-plugins/index-more/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>index-more</artifactId>
+    <packaging>jar</packaging>
+
+    <name>index-more</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>