You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:49:05 UTC

[21/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-anchor/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-anchor/ivy.xml b/src/plugin/index-anchor/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/index-anchor/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-anchor/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-anchor/plugin.xml b/src/plugin/index-anchor/plugin.xml
deleted file mode 100644
index 208594b..0000000
--- a/src/plugin/index-anchor/plugin.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-  
-  http://www.apache.org/licenses/LICENSE-2.0
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
--->
-<plugin id="index-anchor" name="Anchor Indexing Filter" version="1.0.0"
-  provider-name="nutch.org">
-
-  <runtime>
-    <library name="index-anchor.jar">
-      <export name="*" />
-    </library>
-  </runtime>
-
-  <requires>
-    <import plugin="nutch-extensionpoints" />
-  </requires>
-
-  <extension id="org.apache.nutch.indexer.anchor"
-    name="Nutch Anchor Indexing Filter"
-    point="org.apache.nutch.indexer.IndexingFilter">
-    <implementation id="AnchorIndexingFilter"
-      class="org.apache.nutch.indexer.anchor.AnchorIndexingFilter" />
-  </extension>
-
-</plugin>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
deleted file mode 100644
index 6c9b834..0000000
--- a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.anchor;
-
-import java.util.HashSet;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.parse.Parse;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-/**
- * Indexing filter that offers an option to either index all inbound anchor text
- * for a document or deduplicate anchors. Deduplication does have its cons.
- * 
- * @see {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
- */
-public class AnchorIndexingFilter implements IndexingFilter {
-
-  public static final Logger LOG = LoggerFactory
-      .getLogger(AnchorIndexingFilter.class);
-  private Configuration conf;
-  private boolean deduplicate = false;
-
-  /**
-   * Set the {@link Configuration} object
-   */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-
-    deduplicate = conf.getBoolean("anchorIndexingFilter.deduplicate", false);
-    LOG.info("Anchor deduplication is: " + (deduplicate ? "on" : "off"));
-  }
-
-  /**
-   * Get the {@link Configuration} object
-   */
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  /**
-   * Adds the inbound anchor text of a document to its {@code anchor} field,
-   * optionally skipping anchors that only differ in letter case. See
-   * {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
-   * 
-   * @param doc
-   *          The {@link NutchDocument} object
-   * @param parse
-   *          The relevant {@link Parse} object passing through the filter
-   * @param url
-   *          URL to be filtered for anchor text
-   * @param datum
-   *          The {@link CrawlDatum} entry
-   * @param inlinks
-   *          The {@link Inlinks} containing anchor text
-   * @return filtered NutchDocument
-   */
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
-      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-
-    String[] anchors = (inlinks != null ? inlinks.getAnchors() : new String[0]);
-
-    HashSet<String> set = null;
-
-    for (int i = 0; i < anchors.length; i++) {
-      if (deduplicate) {
-        if (set == null)
-          set = new HashSet<String>();
-        String lcAnchor = anchors[i].toLowerCase();
-
-        // Check if already processed the current anchor
-        if (!set.contains(lcAnchor)) {
-          doc.add("anchor", anchors[i]);
-
-          // Add to set
-          set.add(lcAnchor);
-        }
-      } else {
-        doc.add("anchor", anchors[i]);
-      }
-    }
-
-    return doc;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/package.html b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/package.html
deleted file mode 100644
index c255029..0000000
--- a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/package.html
+++ /dev/null
@@ -1,5 +0,0 @@
-<html>
-<body>
-<p>An indexing plugin for inbound anchor text.</p><p></p>
-</body>
-</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java b/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
deleted file mode 100644
index 08a42f3..0000000
--- a/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.anchor;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlink;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-/**
- * JUnit test case which tests 1. that anchor text is obtained 2. that anchor
- * deduplication functionality is working
- * 
- * @author lewismc
- * 
- */
-public class TestAnchorIndexingFilter {
-
-  @Test
-  public void testDeduplicateAnchor() throws Exception {
-    Configuration conf = NutchConfiguration.create();
-    conf.setBoolean("anchorIndexingFilter.deduplicate", true);
-    AnchorIndexingFilter filter = new AnchorIndexingFilter();
-    filter.setConf(conf);
-    Assert.assertNotNull(filter);
-    NutchDocument doc = new NutchDocument();
-    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
-    Inlinks inlinks = new Inlinks();
-    inlinks.add(new Inlink("http://test1.com/", "text1"));
-    inlinks.add(new Inlink("http://test2.com/", "text2"));
-    inlinks.add(new Inlink("http://test3.com/", "text2"));
-    try {
-      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
-          new CrawlDatum(), inlinks);
-    } catch (Exception e) {
-      e.printStackTrace();
-      Assert.fail(e.getMessage());
-    }
-    Assert.assertNotNull(doc);
-    Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames()
-        .contains("anchor"));
-    Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor")
-        .getValues().size());
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-basic/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-basic/build.xml b/src/plugin/index-basic/build.xml
deleted file mode 100755
index a834290..0000000
--- a/src/plugin/index-basic/build.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="index-basic" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-basic/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-basic/ivy.xml b/src/plugin/index-basic/ivy.xml
deleted file mode 100644
index 848216e..0000000
--- a/src/plugin/index-basic/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-      <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-basic/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-basic/plugin.xml b/src/plugin/index-basic/plugin.xml
deleted file mode 100755
index c5d784d..0000000
--- a/src/plugin/index-basic/plugin.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="index-basic"
-   name="Basic Indexing Filter"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-
-   <runtime>
-      <library name="index-basic.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.apache.nutch.indexer.basic"
-              name="Nutch Basic Indexing Filter"
-              point="org.apache.nutch.indexer.IndexingFilter">
-      <implementation id="BasicIndexingFilter"
-                      class="org.apache.nutch.indexer.basic.BasicIndexingFilter"/>
-   </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java b/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
deleted file mode 100644
index 8584fa8..0000000
--- a/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
+++ /dev/null
@@ -1,158 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.indexer.basic;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.nutch.metadata.Nutch;
-import org.apache.nutch.parse.Parse;
-
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.util.StringUtil;
-import org.apache.nutch.util.URLUtil;
-import org.apache.hadoop.io.Text;
-
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.Date;
-
-import org.apache.hadoop.conf.Configuration;
-
-/**
- * Adds basic searchable fields to a document. The fields added are: domain,
- * host, url, content, title, cache, tstamp. domain is included depending on
- * {@code indexer.add.domain} in nutch-default.xml. title is truncated as per
- * {@code indexer.max.title.length} in nutch-default.xml (as per NUTCH-1004, a
- * zero-length title is not added). content is truncated as per
- * {@code indexer.max.content.length} in nutch-default.xml.
- */
-public class BasicIndexingFilter implements IndexingFilter {
-  public static final Logger LOG = LoggerFactory
-      .getLogger(BasicIndexingFilter.class);
-
-  private int MAX_TITLE_LENGTH;
-  private int MAX_CONTENT_LENGTH;
-  private boolean addDomain = false;
-  private Configuration conf;
-
-  /**
-   * Adds the basic searchable fields to the document, honoring a few
-   * configuration settings. See
-   * {@code indexer.add.domain}, {@code indexer.max.title.length},
-   * {@code indexer.max.content.length} in nutch-default.xml.
-   * 
-   * @param doc
-   *          The {@link NutchDocument} object
-   * @param parse
-   *          The relevant {@link Parse} object passing through the filter
-   * @param url
-   *          URL of the document being indexed (fallback if no repr URL is set)
-   * @param datum
-   *          The {@link CrawlDatum} entry
-   * @param inlinks
-   *          The {@link Inlinks} of the document (not read by this filter)
-   * @return filtered NutchDocument
-   */
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
-      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-
-    Text reprUrl = (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
-    String reprUrlString = reprUrl != null ? reprUrl.toString() : null;
-    String urlString = url.toString();
-
-    String host = null;
-    try {
-      URL u;
-      if (reprUrlString != null) {
-        u = new URL(reprUrlString);
-      } else {
-        u = new URL(urlString);
-      }
-
-      if (addDomain) {
-        doc.add("domain", URLUtil.getDomainName(u));
-      }
-
-      host = u.getHost();
-    } catch (MalformedURLException e) {
-      throw new IndexingException(e);
-    }
-
-    if (host != null) {
-      doc.add("host", host);
-    }
-
-    doc.add("url", reprUrlString == null ? urlString : reprUrlString);
-
-    // content
-    String content = parse.getText();
-    if (MAX_CONTENT_LENGTH > -1 && content.length() > MAX_CONTENT_LENGTH) {
-      content = content.substring(0, MAX_CONTENT_LENGTH);
-    }
-    doc.add("content", StringUtil.cleanField(content));
-
-    // title
-    String title = parse.getData().getTitle();
-    if (MAX_TITLE_LENGTH > -1 && title.length() > MAX_TITLE_LENGTH) { // truncate
-                                                                      // title
-                                                                      // if
-                                                                      // needed
-      title = title.substring(0, MAX_TITLE_LENGTH);
-    }
-
-    if (title.length() > 0) {
-      // NUTCH-1004 Do not index empty values for title field
-      doc.add("title", StringUtil.cleanField(title));
-    }
-
-    // add cached content/summary display policy, if available
-    String caching = parse.getData().getMeta(Nutch.CACHING_FORBIDDEN_KEY);
-    if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
-      doc.add("cache", caching);
-    }
-
-    // add timestamp when fetched, for deduplication
-    doc.add("tstamp", new Date(datum.getFetchTime()));
-
-    return doc;
-  }
-
-  /**
-   * Set the {@link Configuration} object
-   */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
-    this.addDomain = conf.getBoolean("indexer.add.domain", false);
-    this.MAX_CONTENT_LENGTH = conf.getInt("indexer.max.content.length", -1);
-  }
-
-  /**
-   * Get the {@link Configuration} object
-   */
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package.html b/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package.html
deleted file mode 100644
index 3fae405..0000000
--- a/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/package.html
+++ /dev/null
@@ -1,5 +0,0 @@
-<html>
-<body>
-<p>A basic indexing plugin, adds basic fields: url, host, title, content, etc.</p><p></p>
-</body>
-</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java b/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
deleted file mode 100644
index 4bc317e..0000000
--- a/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.basic;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.basic.BasicIndexingFilter;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-import java.util.Date;
-
-/**
- * JUnit test case which tests 1. that basic searchable fields are added to a
- * document 2. that domain is added as per {@code indexer.add.domain} in
- * nutch-default.xml. 3. that title is truncated as per
- * {@code indexer.max.title.length} in nutch-default.xml. 4. that content is
- * truncated as per {@code indexer.max.content.length} in nutch-default.xml.
- * 
- * @author tejasp
- * 
- */
-
-public class TestBasicIndexingFilter {
-
-  @Test
-  public void testBasicIndexingFilter() throws Exception {
-    Configuration conf = NutchConfiguration.create();
-    conf.setInt("indexer.max.title.length", 10);
-    conf.setBoolean("indexer.add.domain", true);
-    conf.setInt("indexer.max.content.length", 20);
-
-    BasicIndexingFilter filter = new BasicIndexingFilter();
-    filter.setConf(conf);
-    Assert.assertNotNull(filter);
-
-    NutchDocument doc = new NutchDocument();
-
-    String title = "The Foo Page";
-    Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
-    Metadata metaData = new Metadata();
-    metaData.add("Language", "en/us");
-    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
-        outlinks, metaData);
-    ParseImpl parse = new ParseImpl(
-        "this is a sample foo bar page. hope you enjoy it.", parseData);
-
-    CrawlDatum crawlDatum = new CrawlDatum();
-    crawlDatum.setFetchTime(100L);
-
-    Inlinks inlinks = new Inlinks();
-
-    try {
-      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
-          crawlDatum, inlinks);
-    } catch (Exception e) {
-      e.printStackTrace();
-      Assert.fail(e.getMessage());
-    }
-    Assert.assertNotNull(doc);
-    Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc
-        .getField("title").getValues().get(0));
-    Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc
-        .getField("domain").getValues().get(0));
-    Assert.assertEquals("test host, expect \"nutch.apache.org\"",
-        "nutch.apache.org", doc.getField("host").getValues().get(0));
-    Assert.assertEquals(
-        "test url, expect \"http://nutch.apache.org/index.html\"",
-        "http://nutch.apache.org/index.html", doc.getField("url").getValues()
-            .get(0));
-    Assert.assertEquals("test content", "this is a sample foo",
-        doc.getField("content").getValues().get(0));
-    Assert.assertEquals("test fetch time", new Date(100L),
-        (Date) doc.getField("tstamp").getValues().get(0));
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-geoip/build-ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-geoip/build-ivy.xml b/src/plugin/index-geoip/build-ivy.xml
deleted file mode 100644
index 2cda7e9..0000000
--- a/src/plugin/index-geoip/build-ivy.xml
+++ /dev/null
@@ -1,54 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="index-geoip" default="deps-jar" xmlns:ivy="antlib:org.apache.ivy.ant">
-
-    <property name="ivy.install.version" value="2.1.0" />
-    <condition property="ivy.home" value="${env.IVY_HOME}">
-      <isset property="env.IVY_HOME" />
-    </condition>
-    <property name="ivy.home" value="${user.home}/.ant" />
-    <property name="ivy.checksums" value="" />
-    <property name="ivy.jar.dir" value="${ivy.home}/lib" />
-    <property name="ivy.jar.file" value="${ivy.jar.dir}/ivy.jar" />
-
-    <target name="download-ivy" unless="offline">
-
-        <mkdir dir="${ivy.jar.dir}"/>
-        <!-- download Ivy from web site so that it can be used even without any special installation -->
-        <get src="http://repo2.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/ivy-${ivy.install.version}.jar" 
-             dest="${ivy.jar.file}" usetimestamp="true"/>
-    </target>
-
-    <target name="init-ivy" depends="download-ivy">
-      <!-- try to load ivy here from ivy home, in case the user has not already dropped
-              it into ant's lib dir (note that the latter copy will always take precedence).
-              We will not fail as long as local lib dir exists (it may be empty) and
-              ivy is in at least one of ant's lib dir or the local lib dir. -->
-        <path id="ivy.lib.path">
-            <fileset dir="${ivy.jar.dir}" includes="*.jar"/>
-
-        </path>
-        <taskdef resource="org/apache/ivy/ant/antlib.xml"
-                 uri="antlib:org.apache.ivy.ant" classpathref="ivy.lib.path"/>
-    </target>
-
-  <target name="deps-jar" depends="init-ivy">
-    <ivy:retrieve pattern="lib/[artifact]-[revision].[ext]"/>
-  </target>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-geoip/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-geoip/build.xml b/src/plugin/index-geoip/build.xml
deleted file mode 100644
index 92fda82..0000000
--- a/src/plugin/index-geoip/build.xml
+++ /dev/null
@@ -1,27 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="index-geoip" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-  <target name="init-plugin">
-    <echo>Copying MaxMind GeoIP .mmdb files to build</echo>
-    <copy todir="${build.classes}">
-      <fileset dir="${src.dir}" includes="**/*.mmdb" />
-    </copy>
-  </target>
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-geoip/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-geoip/ivy.xml b/src/plugin/index-geoip/ivy.xml
deleted file mode 100644
index 1b626f0..0000000
--- a/src/plugin/index-geoip/ivy.xml
+++ /dev/null
@@ -1,46 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-    <dependency org="com.maxmind.geoip2" name="geoip2" rev="2.3.1" >
-      <!-- Exlude due to classpath issues -->
-      <exclude org="org.apache.httpcomponents" name="httpclient" />
-      <exclude org="org.apache.httpcomponents" name="httpcore" />
-    </dependency>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-geoip/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-geoip/plugin.xml b/src/plugin/index-geoip/plugin.xml
deleted file mode 100644
index 214fbd0..0000000
--- a/src/plugin/index-geoip/plugin.xml
+++ /dev/null
@@ -1,51 +0,0 @@
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="index-geoip"
-   name="GeoIP2 Indexing Filter"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-
-   <runtime>
-      <library name="index-geoip.jar">
-         <export name="*"/>
-      </library>
-      <library name="commons-codec-1.6.jar"/>
-      <library name="commons-logging-1.1.1.jar"/>
-      <library name="geoip2-2.3.1.jar"/>
-      <library name="google-http-client-1.20.0.jar"/>
-      <library name="jackson-annotations-2.5.0.jar"/>
-      <library name="jackson-core-2.5.3.jar"/>
-      <library name="jackson-databind-2.5.3.jar"/>
-      <library name="jsr305-1.3.9.jar"/>
-      <library name="maxmind-db-1.0.0.jar"/>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.apache.nutch.indexer.geoip"
-              name="Nutch GeoIP2 Indexing Filter"
-              point="org.apache.nutch.indexer.IndexingFilter">
-      <implementation id="GeoIPIndexingFilter"
-                      class="org.apache.nutch.indexer.geoip.GeoIPIndexingFilter"/>
-   </extension>
-
-</plugin>
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
deleted file mode 100644
index 88d78ef..0000000
--- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
+++ /dev/null
@@ -1,210 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.geoip;
-
-import java.io.IOException;
-import java.net.InetAddress;
-import java.net.UnknownHostException;
-
-import org.apache.nutch.indexer.NutchDocument;
-
-import com.maxmind.geoip2.DatabaseReader;
-import com.maxmind.geoip2.WebServiceClient;
-import com.maxmind.geoip2.exception.GeoIp2Exception;
-import com.maxmind.geoip2.model.InsightsResponse;
-import com.maxmind.geoip2.model.CityResponse;
-import com.maxmind.geoip2.model.ConnectionTypeResponse;
-import com.maxmind.geoip2.model.CountryResponse;
-import com.maxmind.geoip2.model.DomainResponse;
-import com.maxmind.geoip2.model.IspResponse;
-import com.maxmind.geoip2.record.City;
-import com.maxmind.geoip2.record.Continent;
-import com.maxmind.geoip2.record.Country;
-import com.maxmind.geoip2.record.Location;
-import com.maxmind.geoip2.record.Postal;
-import com.maxmind.geoip2.record.RepresentedCountry;
-import com.maxmind.geoip2.record.Subdivision;
-import com.maxmind.geoip2.record.Traits;
-
-/**
- * <p>
- * Simple utility class which enables efficient, structured
- * {@link org.apache.nutch.indexer.NutchDocument} building based on input from
- * {@link GeoIPIndexingFilter}, where configuration is also read.
- * </p>
- * <p>
- * Based on the nature of the input, this class wraps factory type
- * implementations for populating {@link org.apache.nutch.indexer.NutchDocument}
- * 's with the correct {@link org.apache.nutch.indexer.NutchField} information.
- * 
- */
-public class GeoIPDocumentCreator {
-
-  /**
-   * Default constructor.
-   */
-  public GeoIPDocumentCreator() {
-  }
-
-  public static NutchDocument createDocFromInsightsService(String serverIp,
-      NutchDocument doc, WebServiceClient client) throws UnknownHostException,
-      IOException, GeoIp2Exception {
-    doc.add("ip", serverIp);
-    InsightsResponse response = client
-        .insights(InetAddress.getByName(serverIp));
-    // CityResponse response = client.city(InetAddress.getByName(serverIp));
-
-    City city = response.getCity();
-    doc.add("cityName", city.getName()); // 'Minneapolis'
-    doc.add("cityConfidence", city.getConfidence()); // 50
-    doc.add("cityGeoNameId", city.getGeoNameId());
-
-    Continent continent = response.getContinent();
-    doc.add("continentCode", continent.getCode());
-    doc.add("continentGeoNameId", continent.getGeoNameId());
-    doc.add("continentName", continent.getName());
-
-    Country country = response.getCountry();
-    doc.add("countryIsoCode", country.getIsoCode()); // 'US'
-    doc.add("countryName", country.getName()); // 'United States'
-    doc.add("countryConfidence", country.getConfidence()); // 99
-    doc.add("countryGeoName", country.getGeoNameId());
-
-    Location location = response.getLocation();
-    doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
-                                                                               // -93.2323
-    doc.add("accRadius", location.getAccuracyRadius()); // 3
-    doc.add("timeZone", location.getTimeZone()); // 'America/Chicago'
-    doc.add("metroCode", location.getMetroCode());
-
-    Postal postal = response.getPostal();
-    doc.add("postalCode", postal.getCode()); // '55455'
-    doc.add("postalConfidence", postal.getConfidence()); // 40
-
-    RepresentedCountry rCountry = response.getRepresentedCountry();
-    doc.add("countryType", rCountry.getType());
-
-    Subdivision subdivision = response.getMostSpecificSubdivision();
-    doc.add("subDivName", subdivision.getName()); // 'Minnesota'
-    doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN'
-    doc.add("subDivConfidence", subdivision.getConfidence()); // 90
-    doc.add("subDivGeoNameId", subdivision.getGeoNameId());
-
-    Traits traits = response.getTraits();
-    doc.add("autonSystemNum", traits.getAutonomousSystemNumber());
-    doc.add("autonSystemOrg", traits.getAutonomousSystemOrganization());
-    doc.add("domain", traits.getDomain());
-    doc.add("isp", traits.getIsp());
-    doc.add("org", traits.getOrganization());
-    doc.add("userType", traits.getUserType());
-    doc.add("isAnonProxy", traits.isAnonymousProxy());
-    doc.add("isSatelliteProv", traits.isSatelliteProvider());
-    return doc;
-  }
-
-  @SuppressWarnings("unused")
-  public static NutchDocument createDocFromCityService(String serverIp,
-      NutchDocument doc, WebServiceClient client) throws UnknownHostException,
-      IOException, GeoIp2Exception {
-    CityResponse response = client.city(InetAddress.getByName(serverIp));
-    return doc;
-  }
-
-  @SuppressWarnings("unused")
-  public static NutchDocument createDocFromCountryService(String serverIp,
-      NutchDocument doc, WebServiceClient client) throws UnknownHostException,
-      IOException, GeoIp2Exception {
-    CountryResponse response = client.country(InetAddress.getByName(serverIp));
-    return doc;
-  }
-
-  public static NutchDocument createDocFromIspDb(String serverIp,
-      NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
-      IOException, GeoIp2Exception {
-    IspResponse response = reader.isp(InetAddress.getByName(serverIp));
-    doc.add("ip", serverIp);
-    doc.add("autonSystemNum", response.getAutonomousSystemNumber());
-    doc.add("autonSystemOrg", response.getAutonomousSystemOrganization());
-    doc.add("isp", response.getIsp());
-    doc.add("org", response.getOrganization());
-    return doc;
-  }
-
-  public static NutchDocument createDocFromDomainDb(String serverIp,
-      NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
-      IOException, GeoIp2Exception {
-    DomainResponse response = reader.domain(InetAddress.getByName(serverIp));
-    doc.add("ip", serverIp);
-    doc.add("domain", response.getDomain());
-    return doc;
-  }
-
-  public static NutchDocument createDocFromConnectionDb(String serverIp,
-      NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
-      IOException, GeoIp2Exception {
-    ConnectionTypeResponse response = reader.connectionType(InetAddress
-        .getByName(serverIp));
-    doc.add("ip", serverIp);
-    doc.add("connType", response.getConnectionType().toString());
-    return doc;
-  }
-
-  public static NutchDocument createDocFromCityDb(String serverIp,
-      NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
-      IOException, GeoIp2Exception {
-    doc.add("ip", serverIp);
-    CityResponse response = reader.city(InetAddress.getByName(serverIp));
-
-    City city = response.getCity();
-    doc.add("cityName", city.getName()); // 'Minneapolis'
-    doc.add("cityConfidence", city.getConfidence()); // 50
-    doc.add("cityGeoNameId", city.getGeoNameId());
-
-    Continent continent = response.getContinent();
-    doc.add("continentCode", continent.getCode());
-    doc.add("continentGeoNameId", continent.getGeoNameId());
-    doc.add("continentName", continent.getName());
-
-    Country country = response.getCountry();
-    doc.add("countryIsoCode", country.getIsoCode()); // 'US'
-    doc.add("countryName", country.getName()); // 'United States'
-    doc.add("countryConfidence", country.getConfidence()); // 99
-    doc.add("countryGeoName", country.getGeoNameId());
-
-    Location location = response.getLocation();
-    doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
-                                                                               // -93.2323
-    doc.add("accRadius", location.getAccuracyRadius()); // 3
-    doc.add("timeZone", location.getTimeZone()); // 'America/Chicago'
-    doc.add("metroCode", location.getMetroCode());
-
-    Postal postal = response.getPostal();
-    doc.add("postalCode", postal.getCode()); // '55455'
-    doc.add("postalConfidence", postal.getConfidence()); // 40
-
-    RepresentedCountry rCountry = response.getRepresentedCountry();
-    doc.add("countryType", rCountry.getType());
-
-    Subdivision subdivision = response.getMostSpecificSubdivision();
-    doc.add("subDivName", subdivision.getName()); // 'Minnesota'
-    doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN'
-    doc.add("subDivConfidence", subdivision.getConfidence()); // 90
-    doc.add("subDivGeoNameId", subdivision.getGeoNameId());
-    return doc;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
deleted file mode 100644
index f515f1f..0000000
--- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
+++ /dev/null
@@ -1,241 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.geoip;
-
-import java.io.File;
-import java.io.IOException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.maxmind.geoip2.DatabaseReader;
-import com.maxmind.geoip2.WebServiceClient;
-
-/**
- * <p>
- * This plugin implements an indexing filter which takes advantage of the <a
- * href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>.
- * </p>
- * <p>
- * The third party library distribution provides an API for the GeoIP2 <a
- * href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web
- * services</a> and <a
- * href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>. The
- * API also works with the free <a
- * href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2 databases</a>.
- * </p>
- * <p>
- * Depending on the service level agreement, you have with the GeoIP service
- * provider, the plugin can add a number of the following fields to the index
- * data model:
- * <ol>
- * <li>Continent</li>
- * <li>Country</li>
- * <li>Regional Subdivision</li>
- * <li>City</li>
- * <li>Postal Code</li>
- * <li>Latitude/Longitude</li>
- * <li>ISP/Organization</li>
- * <li>AS Number</li>
- * <li>Confidence Factors</li>
- * <li>Radius</li>
- * <li>User Type</li>
- * </ol>
- * </p>
- * 
- * <p>
- * Some of the services are documented at the <a
- * href="https://www.maxmind.com/en/geoip2-precision-services">GeoIP2 Precision
- * Services</a> webpage where more information can be obtained.
- * </p>
- * 
- * <p>
- * You should also consult the following three properties in
- * <code>nutch-site.xml</code>
- * </p>
- * 
- * <pre>
- *  {@code
- * <!-- index-geoip plugin properties -->
- * <property>
- *   <name>index.geoip.usage</name>
- *   <value>insightsService</value>
- *   <description>
- *   A string representing the information source to be used for GeoIP information
- *   association. Either enter 'cityDatabase', 'connectionTypeDatabase', 
- *   'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the 
- *   Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb, 
- *   GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the Hadoop classpath 
- *   and available at runtime. This can be achieved by adding it to $NUTCH_HOME/conf
- *   </description>
- * </property>
- * 
- * <property>
- *   <name>index.geoip.userid</name>
- *   <value></value>
- *   <description>
- *   The userId associated with the GeoIP2 Precision Services account.
- *   </description>
- * </property>
- * 
- * <property>
- *   <name>index.geoip.licensekey</name>
- *   <value></value>
- *   <description>
- *   The license key associated with the GeoIP2 Precision Services account.
- *   </description>
- * </property>
- * }
- * </pre>
- * 
- */
-public class GeoIPIndexingFilter implements IndexingFilter {
-
-  private static final Logger LOG = LoggerFactory
-      .getLogger(GeoIPIndexingFilter.class);
-
-  private Configuration conf;
-
-  private String usage = null;
-
-  private File geoDb = null;
-
-  WebServiceClient client = null;
-
-  DatabaseReader reader = null;
-
-  // private AbstractResponse response = null;
-
-  /**
-   * Default constructor for this plugin
-   */
-  public GeoIPIndexingFilter() {
-  }
-
-  /**
-   * @see org.apache.hadoop.conf.Configurable#getConf()
-   */
-  @Override
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  /**
-   * @see org.apache.hadoop.conf.Configurable#setConf(org.apache.hadoop.conf.Configuration)
-   */
-  @Override
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    String use = conf.get("index.geoip.usage", "insightsService");
-    LOG.debug("GeoIP usage medium set to: {}", use);
-    if (use.equalsIgnoreCase("cityDatabase")) {
-      try {
-        geoDb = new File(conf.getResource("GeoIP2-City.mmdb").getFile());
-        buildDb();
-      } catch (Exception e) {
-        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
-      }
-    } else if (use.equalsIgnoreCase("connectionTypeDatabase")) {
-      try {
-        geoDb = new File(conf.getResource("GeoIP2-Connection-Type.mmdb")
-            .getFile());
-        buildDb();
-      } catch (Exception e) {
-        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
-      }
-    } else if (use.equalsIgnoreCase("domainDatabase")) {
-      try {
-        geoDb = new File(conf.getResource("GeoIP2-Domain.mmdb").getFile());
-        buildDb();
-      } catch (Exception e) {
-        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
-      }
-    } else if (use.equalsIgnoreCase("ispDatabase")) {
-      try {
-        geoDb = new File(conf.getResource("GeoIP2-ISP.mmdb").getFile());
-        buildDb();
-      } catch (Exception e) {
-        LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
-      }
-    } else if (use.equalsIgnoreCase("insightsService")) {
-      client = new WebServiceClient.Builder(conf.getInt("index.geoip.userid",
-          12345), conf.get("index.geoip.licensekey")).build();
-    }
-    usage = use;
-  }
-
-  private void buildDb() {
-    try {
-      reader = new DatabaseReader.Builder(geoDb).build();
-    } catch (IOException e) {
-      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
-    }
-  }
-
-  /**
-   * 
-   * @see org.apache.nutch.indexer.IndexingFilter#filter(org.apache.nutch.indexer.NutchDocument,
-   *      org.apache.nutch.parse.Parse, org.apache.hadoop.io.Text,
-   *      org.apache.nutch.crawl.CrawlDatum, org.apache.nutch.crawl.Inlinks)
-   */
-  @Override
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
-      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-    return addServerGeo(doc, parse.getData(), url.toString());
-  }
-
-  private NutchDocument addServerGeo(NutchDocument doc, ParseData data,
-      String url) {
-
-    if (conf.getBoolean("store.ip.address", false) == true) {
-      try {
-        String serverIp = data.getContentMeta().get("_ip_");
-        if (serverIp != null) {
-          if (usage.equalsIgnoreCase("cityDatabase")) {
-            doc = GeoIPDocumentCreator.createDocFromCityDb(serverIp, doc,
-                reader);
-          } else if (usage.equalsIgnoreCase("connectionTypeDatabase")) {
-            doc = GeoIPDocumentCreator.createDocFromConnectionDb(serverIp, doc,
-                reader);
-          } else if (usage.equalsIgnoreCase("domainDatabase")) {
-            doc = GeoIPDocumentCreator.createDocFromDomainDb(serverIp, doc,
-                reader);
-          } else if (usage.equalsIgnoreCase("ispDatabase")) {
-            doc = GeoIPDocumentCreator
-                .createDocFromIspDb(serverIp, doc, reader);
-          } else if (usage.equalsIgnoreCase("insightsService")) {
-            doc = GeoIPDocumentCreator.createDocFromInsightsService(serverIp,
-                doc, client);
-          }
-        }
-      } catch (Exception e) {
-        LOG.error(e.getMessage());
-        e.printStackTrace();
-      }
-    }
-    return doc;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java
deleted file mode 100644
index ba62519..0000000
--- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/package-info.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * <p>This plugin implements an indexing filter which takes 
- * advantage of the 
- * <a href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>.</p>
- * <p>The third party library distribution provides an API for the GeoIP2 
- * <a href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web services</a> 
- * and <a href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>. 
- * The API also works with the free 
- * <a href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2 databases</a>.
- *
- */
-package org.apache.nutch.indexer.geoip;
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-links/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-links/build.xml b/src/plugin/index-links/build.xml
deleted file mode 100644
index b853ccf..0000000
--- a/src/plugin/index-links/build.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="index-links" default="jar-core">
-
-    <import file="../build-plugin.xml"/>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-links/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-links/ivy.xml b/src/plugin/index-links/ivy.xml
deleted file mode 100644
index 0a363f7..0000000
--- a/src/plugin/index-links/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-links/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-links/plugin.xml b/src/plugin/index-links/plugin.xml
deleted file mode 100644
index dfdc5d2..0000000
--- a/src/plugin/index-links/plugin.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-    id="index-links"
-    name="Index inlinks and outlinks"
-    version="1.0.0"
-    provider-name="nutch.org">
-
-    <runtime>
-        <library name="index-links.jar">
-            <export name="*"/>
-        </library>
-    </runtime>
-
-    <requires>
-        <import plugin="nutch-extensionpoints"/>
-    </requires>
-
-    <extension id="org.apache.nutch.indexer.links.LinksIndexingFilter"
-               name="Links indexing filter"
-               point="org.apache.nutch.indexer.IndexingFilter">
-        <implementation id="org.apache.nutch.indexer.links.LinksIndexingFilter"
-                        class="org.apache.nutch.indexer.links.LinksIndexingFilter"/>
-    </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java b/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
deleted file mode 100644
index 975df66..0000000
--- a/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
+++ /dev/null
@@ -1,167 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.links;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlink;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.Parse;
-import org.slf4j.LoggerFactory;
-
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Set;
-
-/**
- * An {@link org.apache.nutch.indexer.IndexingFilter} that adds
- * <code>outlinks</code> and <code>inlinks</code> field(s) to the document.
- *
- * In case that you want to ignore the outlinks that point to the same host
- * as the URL being indexed use the following settings in your configuration
- * file:
- *
- * <property>
- *   <name>index.links.outlinks.host.ignore</name>
- *   <value>true</value>
- * </property>
- *
- * The same configuration is available for inlinks:
- *
- * <property>
- *   <name>index.links.inlinks.host.ignore</name>
- *   <value>true</value>
- * </property>
- *
- * To store only the host portion of each inlink URL or outlink URL add the
- * following to your configuration file.
- *
- * <property>
- *   <name>index.links.hosts.only</name>
- *   <value>false</value>
- * </property>
- *
- */
-public class LinksIndexingFilter implements IndexingFilter {
-
-  public final static String LINKS_OUTLINKS_HOST = "index.links.outlinks.host.ignore";
-  public final static String LINKS_INLINKS_HOST = "index.links.inlinks.host.ignore";
-  public final static String LINKS_ONLY_HOSTS = "index.links.hosts.only";
-
-  public final static org.slf4j.Logger LOG = LoggerFactory
-      .getLogger(LinksIndexingFilter.class);
-
-  private Configuration conf;
-  private boolean filterOutlinks;
-  private boolean filterInlinks;
-  private boolean indexHost;
-
-  @Override
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
-      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-
-    // Add the outlinks
-    Outlink[] outlinks = parse.getData().getOutlinks();
-
-    if (outlinks != null) {
-      Set<String> hosts = new HashSet<String>();
-
-      for (Outlink outlink : outlinks) {
-        try {
-          String linkUrl = outlink.getToUrl();
-          String outHost = new URL(linkUrl).getHost().toLowerCase();
-
-          if (indexHost) {
-            linkUrl = outHost;
-
-            if (hosts.contains(linkUrl))
-              continue;
-
-            hosts.add(linkUrl);
-          }
-
-          addFilteredLink("outlinks", url.toString(), linkUrl, outHost,
-              filterOutlinks, doc);
-        } catch (MalformedURLException e) {
-          LOG.error("Malformed URL in {}: {}", url, e.getMessage());
-        }
-      }
-    }
-
-    // Add the inlinks
-    if (null != inlinks) {
-      Iterator<Inlink> iterator = inlinks.iterator();
-      Set<String> inlinkHosts = new HashSet<String>();
-
-      while (iterator.hasNext()) {
-        try {
-          Inlink link = iterator.next();
-          String linkUrl = link.getFromUrl();
-          String inHost = new URL(linkUrl).getHost().toLowerCase();
-
-          if (indexHost) {
-            linkUrl = inHost;
-
-            if (inlinkHosts.contains(linkUrl))
-              continue;
-
-            inlinkHosts.add(linkUrl);
-          }
-
-          addFilteredLink("inlinks", url.toString(), linkUrl, inHost,
-              filterInlinks, doc);
-        } catch (MalformedURLException e) {
-          LOG.error("Malformed URL in {}: {}", url, e.getMessage());
-        }
-      }
-    }
-
-    return doc;
-  }
-
-  private void addFilteredLink(String fieldName, String url, String linkUrl,
-      String urlHost, boolean filter, NutchDocument doc) throws MalformedURLException {
-      if (filter) {
-        String host = new URL(url.toString()).getHost().toLowerCase();
-
-        if (!host.equalsIgnoreCase(urlHost)) {
-          doc.add(fieldName, linkUrl);
-        }
-      } else {
-        doc.add(fieldName, linkUrl);
-      }
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    filterOutlinks = conf.getBoolean(LINKS_OUTLINKS_HOST, false);
-    filterInlinks = conf.getBoolean(LINKS_INLINKS_HOST, false);
-
-    indexHost = conf.getBoolean(LINKS_ONLY_HOSTS, false);
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java b/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
deleted file mode 100644
index c490d1f..0000000
--- a/src/plugin/index-links/src/test/org/apache/nutch/indexer/links/TestLinksIndexingFilter.java
+++ /dev/null
@@ -1,218 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.indexer.links;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlink;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.indexer.NutchField;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseImpl;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.util.NutchConfiguration;
-
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
-import java.net.URL;
-import java.util.Iterator;
-
-public class TestLinksIndexingFilter {
-
-  Configuration conf = NutchConfiguration.create();
-  LinksIndexingFilter filter = new LinksIndexingFilter();
-  Metadata metadata = new Metadata();
-
-  @Before
-  public void setUp() throws Exception {
-    metadata.add(Response.CONTENT_TYPE, "text/html");
-  }
-
-  private Outlink[] generateOutlinks() throws Exception {
-    return generateOutlinks(false);
-  }
-
-  private Outlink[] generateOutlinks(boolean parts) throws Exception {
-    Outlink[] outlinks = new Outlink[2];
-
-    outlinks[0] = new Outlink("http://www.test.com", "test");
-    outlinks[1] = new Outlink("http://www.example.com", "example");
-
-    if (parts) {
-      outlinks[0] = new Outlink(outlinks[0].getToUrl() + "/index.php?param=1",
-          "test");
-      outlinks[1] = new Outlink(outlinks[1].getToUrl() + "/index.php?param=2",
-          "test");
-    }
-
-    return outlinks;
-  }
-
-  @Test
-  public void testFilterOutlinks() throws Exception {
-    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
-    filter.setConf(conf);
-
-    Outlink[] outlinks = generateOutlinks();
-
-    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
-            new ParseData(new ParseStatus(), "title", outlinks, metadata)),
-        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
-
-    Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
-
-    Assert.assertEquals("Filter outlinks, allow only those from a different host",
-        outlinks[0].getToUrl(), doc.getFieldValue("outlinks"));
-  }
-
-  @Test
-  public void testFilterInlinks() throws Exception {
-    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
-    filter.setConf(conf);
-
-    Inlinks inlinks = new Inlinks();
-    inlinks.add(new Inlink("http://www.test.com", "test"));
-    inlinks.add(new Inlink("http://www.example.com", "example"));
-
-    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
-            new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)),
-        new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
-
-    Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
-
-    Assert.assertEquals("Filter inlinks, allow only those from a different host",
-        "http://www.test.com", doc.getFieldValue("inlinks"));
-  }
-
-  @Test
-  public void testNoFilterOutlinks() throws Exception {
-    filter.setConf(conf);
-
-    Outlink[] outlinks = generateOutlinks();
-
-    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
-            new ParseData(new ParseStatus(), "title", outlinks, metadata)),
-        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
-
-    Assert.assertEquals("All outlinks must be indexed even those from the same host",
-        outlinks.length, doc.getField("outlinks").getValues().size());
-  }
-
-  @Test
-  public void testNoFilterInlinks() throws Exception {
-    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false");
-    filter.setConf(conf);
-
-    Inlinks inlinks = new Inlinks();
-    inlinks.add(new Inlink("http://www.test.com", "test"));
-    inlinks.add(new Inlink("http://www.example.com", "example"));
-
-    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
-            new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)),
-        new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
-
-    Assert.assertEquals("All inlinks must be indexed even those from the same host",
-        inlinks.size(), doc.getField("inlinks").getValues().size());
-  }
-
-  @Test
-  public void testIndexOnlyHostPart() throws Exception {
-    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
-    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
-    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
-    filter.setConf(conf);
-
-    Outlink[] outlinks = generateOutlinks(true);
-
-    Inlinks inlinks = new Inlinks();
-    inlinks.add(new Inlink("http://www.test.com/one-awesome-page", "test"));
-    inlinks.add(new Inlink("http://www.test.com/other-awesome-page", "test"));
-    inlinks.add(new Inlink("http://www.example.com/my-first-awesome-example",
-        "example"));
-
-    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
-            new ParseData(new ParseStatus(), "title", outlinks, metadata)),
-        new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
-
-    NutchField docOutlinks = doc.getField("outlinks");
-
-    Assert.assertEquals("Only the host portion of the outlink URL must be indexed",
-        new URL("http://www.test.com").getHost(),
-        docOutlinks.getValues().get(0));
-
-    Assert.assertEquals(
-        "The inlinks coming from the same host must count only once", 1,
-        doc.getField("inlinks").getValues().size());
-
-    Assert.assertEquals("Only the host portion of the inlinks URL must be indexed",
-        new URL("http://www.test.com").getHost(), doc.getFieldValue("inlinks"));
-  }
-
-  @Test
-  public void testIndexHostsOnlyAndFilterOutlinks() throws Exception {
-    conf = NutchConfiguration.create();
-    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
-    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
-
-    Outlink[] outlinks = generateOutlinks(true);
-
-    filter.setConf(conf);
-
-    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
-            new ParseData(new ParseStatus(), "title", outlinks, metadata)),
-        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
-
-    Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
-
-    Assert.assertEquals(
-        "Index only the host portion of the outlinks after filtering",
-        new URL("http://www.test.com").getHost(),
-        doc.getFieldValue("outlinks"));
-  }
-
-  @Test
-  public void testIndexHostsOnlyAndFilterInlinks() throws Exception {
-    conf = NutchConfiguration.create();
-    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
-    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
-
-    filter.setConf(conf);
-
-    Inlinks inlinks = new Inlinks();
-    inlinks.add(new Inlink("http://www.test.com", "test"));
-    inlinks.add(new Inlink("http://www.example.com", "example"));
-
-    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text",
-            new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)),
-        new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
-
-    Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
-
-    Assert.assertEquals(
-        "Index only the host portion of the inlinks after filtering",
-        new URL("http://www.test.com").getHost(),
-        doc.getFieldValue("inlinks"));
-
-  }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java b/src/plugin/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java
deleted file mode 100644
index aaaedbf..0000000
--- a/src/plugin/index-links/src/test/org/apache/nutch/parse/TestOutlinks.java
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse;
-
-import org.junit.Test;
-
-import java.util.HashSet;
-import java.util.Set;
-
-import static org.junit.Assert.*;
-
-public class TestOutlinks {
-
-  @Test
-  public void testAddSameObject() throws Exception {
-    Set<Outlink> set = new HashSet<>();
-
-    Outlink o = new Outlink("http://www.example.com", "Example");
-    set.add(o);
-    set.add(o);
-
-    assertEquals("Adding the same Outlink twice", 1, set.size());
-  }
-
-  @Test
-  public void testAddOtherObjectWithSameData() throws Exception {
-    Set<Outlink> set = new HashSet<>();
-
-    Outlink o = new Outlink("http://www.example.com", "Example");
-    Outlink o1 = new Outlink("http://www.example.com", "Example");
-
-    assertTrue("The two Outlink objects are the same", o.equals(o1));
-
-    set.add(o);
-    set.add(o1);
-
-    assertEquals("The set should contain only 1 Outlink", 1, set.size());
-  }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-metadata/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-metadata/build.xml b/src/plugin/index-metadata/build.xml
deleted file mode 100644
index ad96d11..0000000
--- a/src/plugin/index-metadata/build.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="index-metadata" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-metadata/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-metadata/ivy.xml b/src/plugin/index-metadata/ivy.xml
deleted file mode 100644
index 24d7606..0000000
--- a/src/plugin/index-metadata/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../../ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-metadata/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-metadata/plugin.xml b/src/plugin/index-metadata/plugin.xml
deleted file mode 100644
index 4d4c9a7..0000000
--- a/src/plugin/index-metadata/plugin.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="index-metadata"
-   name="Index Metadata"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-    <runtime>
-      <library name="index-metadata.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-
-   <extension id="org.apache.nutch.indexer.metadata"
-              name="Nutch metadata indexer"
-              point="org.apache.nutch.indexer.IndexingFilter">
-      <implementation id="MetadataIndexer"
-                      class="org.apache.nutch.indexer.metadata.MetadataIndexer"/>
-   </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
----------------------------------------------------------------------
diff --git a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
deleted file mode 100644
index 78718aa..0000000
--- a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.indexer.metadata;
-
-import java.util.HashMap;
-import java.util.Locale;
-import java.util.Map;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.parse.Parse;
-
-/**
- * Indexer which can be configured to extract metadata from the crawldb, parse
- * metadata or content metadata. You can specify the properties "index.db.md",
- * "index.parse.md" or "index.content.md" who's values are comma-delimited
- * <value>key1,key2,key3</value>.
- */
-public class MetadataIndexer implements IndexingFilter {
-  private Configuration conf;
-  private String[] dbFieldnames;
-  private Map<String, String> parseFieldnames;
-  private String[] contentFieldnames;
-  private static final String db_CONF_PROPERTY = "index.db.md";
-  private static final String parse_CONF_PROPERTY = "index.parse.md";
-  private static final String content_CONF_PROPERTY = "index.content.md";
-
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
-      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-
-    // just in case
-    if (doc == null)
-      return doc;
-
-    // add the fields from crawldb
-    if (dbFieldnames != null) {
-      for (String metatag : dbFieldnames) {
-        Text metadata = (Text) datum.getMetaData().get(new Text(metatag));
-        if (metadata != null)
-          doc.add(metatag, metadata.toString());
-      }
-    }
-
-    // add the fields from parsemd
-    if (parseFieldnames != null) {
-      for (String metatag : parseFieldnames.keySet()) {
-        for (String value : parse.getData().getParseMeta().getValues(metatag)) {
-          if (value != null)
-            doc.add(parseFieldnames.get(metatag), value);
-        }
-      }
-    }
-
-    // add the fields from contentmd
-    if (contentFieldnames != null) {
-      for (String metatag : contentFieldnames) {
-        for (String value : parse.getData().getContentMeta().getValues(metatag)) {
-          if (value != null)
-            doc.add(metatag, value);
-        }
-      }
-    }
-
-    return doc;
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    dbFieldnames = conf.getStrings(db_CONF_PROPERTY);
-    parseFieldnames = new HashMap<String, String>();
-    for (String metatag : conf.getStrings(parse_CONF_PROPERTY)) {
-      parseFieldnames.put(metatag.toLowerCase(Locale.ROOT), metatag);
-    }
-    contentFieldnames = conf.getStrings(content_CONF_PROPERTY);
-
-    // TODO check conflict between field names e.g. could have same label
-    // from different sources
-
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-}