You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:49:06 UTC

[22/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
deleted file mode 100755
index 75ae2e7..0000000
--- a/src/plugin/build.xml
+++ /dev/null
@@ -1,213 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="Nutch" default="deploy-core" basedir=".">
-
-  <target name="deploy-core">
-    <ant target="compile-core" inheritall="false" dir="../.."/>
-    <ant target="deploy"/>
-  </target>
-
-  <!-- ====================================================== -->
-  <!-- Build & deploy all the plugin jars.                    -->
-  <!-- ====================================================== -->
-  <target name="deploy">
-     <ant dir="creativecommons" target="deploy"/>
-     <ant dir="feed" target="deploy"/>
-     <ant dir="headings" target="deploy"/>
-     <ant dir="index-basic" target="deploy"/>
-     <ant dir="index-anchor" target="deploy"/>
-     <ant dir="index-geoip" target="deploy"/>
-     <ant dir="index-more" target="deploy"/>
-     <ant dir="index-replace" target="deploy"/>
-     <ant dir="index-static" target="deploy"/>
-     <ant dir="index-metadata" target="deploy"/>
-     <ant dir="index-links" target="deploy"/>
-     <ant dir="mimetype-filter" target="deploy"/>
-     <ant dir="indexer-cloudsearch" target="deploy"/>
-     <ant dir="indexer-dummy" target="deploy"/>
-     <ant dir="indexer-elastic" target="deploy"/>
-     <ant dir="indexer-solr" target="deploy"/>
-     <ant dir="language-identifier" target="deploy"/>
-     <ant dir="lib-http" target="deploy"/>
-     <ant dir="lib-nekohtml" target="deploy"/>
-     <ant dir="lib-regex-filter" target="deploy"/>
-     <ant dir="lib-xml" target="deploy"/>
-     <ant dir="microformats-reltag" target="deploy"/>
-     <ant dir="nutch-extensionpoints" target="deploy"/>
-     <ant dir="protocol-file" target="deploy"/>
-     <ant dir="protocol-ftp" target="deploy"/>
-     <ant dir="protocol-http" target="deploy"/>
-     <ant dir="protocol-httpclient" target="deploy"/>
-     <ant dir="lib-htmlunit" target="deploy"/>
-     <ant dir="protocol-htmlunit" target="deploy" />
-     <ant dir="lib-selenium" target="deploy"/>
-     <ant dir="protocol-selenium" target="deploy" />
-     <ant dir="protocol-interactiveselenium" target="deploy" />
-     <ant dir="parse-ext" target="deploy"/>
-     <ant dir="parse-js" target="deploy"/>
-     <ant dir="parse-html" target="deploy"/>
-     <ant dir="parse-metatags" target="deploy"/>
-     <ant dir="parse-swf" target="deploy"/>
-     <ant dir="parse-tika" target="deploy"/>
-     <ant dir="parse-zip" target="deploy"/>
-     <ant dir="scoring-depth" target="deploy"/>
-     <ant dir="scoring-opic" target="deploy"/>
-     <ant dir="scoring-link" target="deploy"/>
-     <ant dir="scoring-similarity" target="deploy"/>
-     <ant dir="subcollection" target="deploy"/>
-     <ant dir="tld" target="deploy"/>
-     <ant dir="urlfilter-automaton" target="deploy"/>
-     <ant dir="urlfilter-domain" target="deploy" />
-     <ant dir="urlfilter-domainblacklist" target="deploy" />
-     <ant dir="urlfilter-prefix" target="deploy"/>
-     <ant dir="urlfilter-regex" target="deploy"/>
-     <ant dir="urlfilter-suffix" target="deploy"/>
-     <ant dir="urlfilter-validator" target="deploy"/>
-     <ant dir="urlfilter-ignoreexempt" target="deploy"/>
-     <ant dir="parsefilter-naivebayes" target="deploy"/>
-     <ant dir="parsefilter-regex" target="deploy"/>
-     <ant dir="urlmeta" target="deploy"/>
-     <ant dir="urlnormalizer-ajax" target="deploy"/>
-     <ant dir="urlnormalizer-basic" target="deploy"/>
-     <ant dir="urlnormalizer-host" target="deploy"/>
-     <ant dir="urlnormalizer-pass" target="deploy"/>
-     <ant dir="urlnormalizer-protocol" target="deploy"/>
-     <ant dir="urlnormalizer-querystring" target="deploy"/>
-     <ant dir="urlnormalizer-regex" target="deploy"/>
-     <ant dir="urlnormalizer-slash" target="deploy"/>
-  </target>
-
-  <!-- ====================================================== -->
-  <!-- Test all of the plugins.                               -->
-  <!-- ====================================================== -->
-  <target name="test">
-    <parallel threadCount="2">
-     <ant dir="creativecommons" target="test"/>
-     <ant dir="index-basic" target="test"/>
-     <ant dir="index-anchor" target="test"/>
-     <ant dir="index-geoip" target="test"/>
-     <ant dir="index-more" target="test"/>
-     <ant dir="index-static" target="test"/>
-     <ant dir="index-replace" target="test"/>
-     <ant dir="index-links" target="test"/>
-     <ant dir="mimetype-filter" target="test"/>
-     <ant dir="language-identifier" target="test"/>
-     <ant dir="lib-http" target="test"/>
-     <ant dir="protocol-file" target="test"/>
-     <ant dir="protocol-http" target="test"/>
-     <ant dir="protocol-httpclient" target="test"/>
-     <!--ant dir="parse-ext" target="test"/-->
-     <ant dir="feed" target="test"/>
-     <ant dir="parse-html" target="test"/>
-     <ant dir="parse-metatags" target="test"/>
-     <ant dir="parse-swf" target="test"/>
-     <ant dir="parse-tika" target="test"/>
-     <ant dir="parse-zip" target="test"/>
-     <ant dir="parsefilter-regex" target="test"/>
-     <ant dir="subcollection" target="test"/>
-     <ant dir="urlfilter-automaton" target="test"/>
-     <ant dir="urlfilter-domain" target="test"/>
-     <ant dir="urlfilter-domainblacklist" target="test"/>
-     <ant dir="urlfilter-prefix" target="test"/>
-     <ant dir="urlfilter-regex" target="test"/>
-     <ant dir="urlfilter-suffix" target="test"/>
-     <ant dir="urlfilter-validator" target="test"/>
-     <ant dir="urlfilter-ignoreexempt" target="test"/>
-     <ant dir="urlnormalizer-ajax" target="test"/>
-     <ant dir="urlnormalizer-basic" target="test"/>
-     <ant dir="urlnormalizer-host" target="test"/>
-     <ant dir="urlnormalizer-pass" target="test"/>
-     <ant dir="urlnormalizer-protocol" target="test"/>
-     <ant dir="urlnormalizer-querystring" target="test"/>
-     <ant dir="urlnormalizer-regex" target="test"/>
-     <ant dir="urlnormalizer-slash" target="test"/>
-    </parallel>
-  </target>
-
-  <!-- ====================================================== -->
-  <!-- Clean all of the plugins.                              -->
-  <!-- ====================================================== -->
-  <target name="clean">
-    <ant dir="creativecommons" target="clean"/>
-    <ant dir="feed" target="clean"/>
-    <ant dir="headings" target="clean"/>
-    <ant dir="index-basic" target="clean"/>
-    <ant dir="index-anchor" target="clean"/>
-    <ant dir="index-geoip" target="clean"/>
-    <ant dir="index-more" target="clean"/>
-    <ant dir="index-static" target="clean"/>
-    <ant dir="index-replace" target="clean"/>
-    <ant dir="index-metadata" target="clean"/>
-    <ant dir="index-links" target="clean"/>
-    <ant dir="mimetype-filter" target="clean"/>
-    <ant dir="indexer-cloudsearch" target="clean"/>
-    <ant dir="indexer-dummy" target="clean"/>
-    <ant dir="indexer-elastic" target="clean"/>
-    <ant dir="indexer-solr" target="clean"/>
-    <ant dir="language-identifier" target="clean"/>
-    <!-- <ant dir="lib-commons-httpclient" target="clean"/> -->
-    <ant dir="lib-http" target="clean"/>
-    <!-- <ant dir="lib-lucene-analyzers" target="clean"/>-->
-    <ant dir="lib-nekohtml" target="clean"/>
-    <ant dir="lib-regex-filter" target="clean"/>
-    <ant dir="lib-xml" target="clean"/>
-    <ant dir="microformats-reltag" target="clean"/>
-    <ant dir="nutch-extensionpoints" target="clean"/>
-    <ant dir="protocol-file" target="clean"/>
-    <ant dir="protocol-ftp" target="clean"/>
-    <ant dir="protocol-http" target="clean"/>
-    <ant dir="protocol-httpclient" target="clean"/>
-    <ant dir="lib-htmlunit" target="clean"/>
-    <ant dir="protocol-htmlunit" target="clean" />
-    <ant dir="lib-selenium" target="clean"/>
-    <ant dir="protocol-selenium" target="clean" />
-    <ant dir="protocol-interactiveselenium" target="clean" />
-    <ant dir="parse-ext" target="clean"/>
-    <ant dir="parse-js" target="clean"/>
-    <ant dir="parse-html" target="clean"/>
-    <ant dir="parse-metatags" target="clean"/>
-    <ant dir="parse-swf" target="clean"/>
-    <ant dir="parse-tika" target="clean"/>
-    <ant dir="parse-zip" target="clean"/>
-    <ant dir="parsefilter-regex" target="clean"/>
-    <ant dir="scoring-depth" target="clean"/>
-    <ant dir="scoring-opic" target="clean"/>
-    <ant dir="scoring-link" target="clean"/>
-    <ant dir="scoring-similarity" target="clean"/>
-    <ant dir="subcollection" target="clean"/>
-    <ant dir="tld" target="clean"/>
-    <ant dir="urlfilter-automaton" target="clean"/>
-    <ant dir="urlfilter-domain" target="clean" />
-    <ant dir="urlfilter-domainblacklist" target="clean" />
-    <ant dir="urlfilter-prefix" target="clean"/>
-    <ant dir="urlfilter-regex" target="clean"/>
-    <ant dir="urlfilter-suffix" target="clean"/>
-    <ant dir="urlfilter-validator" target="clean"/>
-    <ant dir="urlfilter-ignoreexempt" target="clean"/>
-    <ant dir="parsefilter-naivebayes" target="clean" />
-    <ant dir="urlmeta" target="clean"/>
-    <ant dir="urlnormalizer-ajax" target="clean"/>
-    <ant dir="urlnormalizer-basic" target="clean"/>
-    <ant dir="urlnormalizer-host" target="clean"/>
-    <ant dir="urlnormalizer-pass" target="clean"/>
-    <ant dir="urlnormalizer-protocol" target="clean"/>
-    <ant dir="urlnormalizer-querystring" target="clean"/>
-    <ant dir="urlnormalizer-regex" target="clean"/>
-    <ant dir="urlnormalizer-slash" target="clean"/>
-  </target>
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/README.txt
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/README.txt b/src/plugin/creativecommons/README.txt
deleted file mode 100644
index d4d7b65..0000000
--- a/src/plugin/creativecommons/README.txt
+++ /dev/null
@@ -1 +0,0 @@
-Support for crawling and searching Creative-Commons licensed content. 

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/build.xml b/src/plugin/creativecommons/build.xml
deleted file mode 100755
index 6443d7f..0000000
--- a/src/plugin/creativecommons/build.xml
+++ /dev/null
@@ -1,28 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="creativecommons" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-  <!-- Deploy Unit test dependencies -->
-  <target name="deps-test">
-    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
-   <!--  <ant target="deploy" inheritall="false" dir="../parse-html"/> -->
-  </target>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/conf/crawl-urlfilter.txt
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/conf/crawl-urlfilter.txt b/src/plugin/creativecommons/conf/crawl-urlfilter.txt
deleted file mode 100644
index 324617f..0000000
--- a/src/plugin/creativecommons/conf/crawl-urlfilter.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-# Creative Commons crawl filter
-
-# Each non-comment, non-blank line contains a regular expression
-# prefixed by '+' or '-'.  The first matching pattern in the file
-# determines whether a URL is included or ignored.  If no pattern
-# matches, the URL is ignored.
-
-# skip file:, ftp:, mailto:, & https: urls
--^(file|ftp|mailto|https):
-
-# skip image and other suffixes we can't yet parse
--\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|rtf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|mp3|rss|xml|doc|pdf|txt|DOC|PDF|TXT)$
-
-# skip URLs containing certain characters as probable queries, etc.
--[?*!@=]
-
-# accept anything else
-+.

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/conf/nutch-site.xml
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/conf/nutch-site.xml b/src/plugin/creativecommons/conf/nutch-site.xml
deleted file mode 100644
index 71e344b..0000000
--- a/src/plugin/creativecommons/conf/nutch-site.xml
+++ /dev/null
@@ -1,50 +0,0 @@
-<?xml version="1.0"?>
-<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>
-
-<!-- Creative Commons' Nutch configuration -->
-
-<nutch-conf>
-
-<property>
-  <name>http.agent.name</name>
-  <value>CreativeCommons</value>
-  <description>Our HTTP 'User-Agent' request header.</description>
-</property>
-
-<property>
-  <name>http.robots.agents</name>
-  <value>CreativeCommons,Nutch,*</value>
-  <description>The agent strings we'll look for in robots.txt files,
-  comma-separated, in decreasing order of precedence.</description>
-</property>
-
-<property>
-  <name>fetcher.server.delay</name>
-  <value>2.0</value>
-  <description>We need to be more polite than when crawling an
-  intranet that we control.</description>
-</property>
-
-<property>
-  <name>http.max.delays</name>
-  <value>3</value>
-  <description>The CC crawl visits a large number of different
-  hosts, so we should not need to delay much.</description>
-</property>
-
-<property>
-  <name>creativecommons.exclude.unlicensed</name>
-  <value>true</value>
-  <description>Exclude HTML content which does not contain a CC license.
-  </description>
-</property>
-
-<property>
-  <name>plugin.excludes</name>
-  <value>parse-(?!html).*</value>
-  <description>Exclude non-HTML content, since we don't know how to
-  find a CC license in anything but HTML. 
-  </description>
-</property>
-
-</nutch-conf>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/data/anchor.html
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/data/anchor.html b/src/plugin/creativecommons/data/anchor.html
deleted file mode 100755
index 90b5227..0000000
--- a/src/plugin/creativecommons/data/anchor.html
+++ /dev/null
@@ -1,9 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/1999/REC-html401-19991224/loose.dtd">
-<html>
-<head>
-</head>
-<body>
-<p><a href="http://creativecommons.org/licenses/by-nc-sa/1.0"><img alt="Creative Commons License" src="http://creativecommons.org/images/public/somerights.gif" align="right"></a>This file is licensed under a
-<a href="http://creativecommons.org/licenses/by-nc-sa/1.0">Creative Commons License</a>.</p>
-</body>
-</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/data/rdf.html
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/data/rdf.html b/src/plugin/creativecommons/data/rdf.html
deleted file mode 100755
index fb2c34d..0000000
--- a/src/plugin/creativecommons/data/rdf.html
+++ /dev/null
@@ -1,35 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<html>
- <head>
- </head>
- <body>
-
-<!-- Creative Commons License -->
-<p><a href="http://creativecommons.org/licenses/by-nc/1.0"><img alt="Creative Commons License" border="0" src="http://creativecommons.org/images/public/somerights.gif" /></a><br />
-This work is licensed under a
-<a href="http://creativecommons.org/licenses/by-nc/1.0">Creative Commons License</a>.
-<!--  end Creative Commons License -->
-
-  <!--
-<rdf:RDF xmlns="http://web.resource.org/cc/"
-    xmlns:dc="http://purl.org/dc/elements/1.1/"
-    xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
-<Work rdf:about="http://boingboing.net">
-   <dc:type rdf:resource="http://purl.org/dc/dcmitype/Text" />
-   <license rdf:resource="http://creativecommons.org/licenses/by-nc/1.0" />
-</Work>
-
-<License rdf:about="http://creativecommons.org/licenses/by-nc/1.0">
-   <requires rdf:resource="http://web.resource.org/cc/Attribution" />
-   <permits rdf:resource="http://web.resource.org/cc/DerivativeWorks" />
-   <permits rdf:resource="http://web.resource.org/cc/Reproduction" />
-   <permits rdf:resource="http://web.resource.org/cc/Distribution" />
-   <prohibits rdf:resource="http://web.resource.org/cc/CommercialUse" />
-   <requires rdf:resource="http://web.resource.org/cc/Notice" />
-</License>
-
-</rdf:RDF>
-
--->
- </body>
-</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/data/rel.html
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/data/rel.html b/src/plugin/creativecommons/data/rel.html
deleted file mode 100755
index 413d52f..0000000
--- a/src/plugin/creativecommons/data/rel.html
+++ /dev/null
@@ -1,6 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml" lang="en"><head>
-</head><body>
-<a rel="license" href="http://creativecommons.org/licenses/by-nc/2.0">CC by-nc</a> 
-</body>
-</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/ivy.xml b/src/plugin/creativecommons/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/creativecommons/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/plugin.xml b/src/plugin/creativecommons/plugin.xml
deleted file mode 100755
index de9cf36..0000000
--- a/src/plugin/creativecommons/plugin.xml
+++ /dev/null
@@ -1,48 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="creativecommons"
-   name="Creative Commons Plugins"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-   <runtime>
-      <library name="creativecommons.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.creativecommons.nutch.CCParseFilter"
-              name="Creative Commons Metadata Filter"
-              point="org.apache.nutch.parse.HtmlParseFilter">
-      <implementation id="CCParseFilter"
-                      class="org.creativecommons.nutch.CCParseFilter"/>
-   </extension>
-
-   <extension id="org.creativecommons.nutch.CCIndexingFilter"
-              name="Creative Commons Indexing Filter"
-              point="org.apache.nutch.indexer.IndexingFilter">
-      <implementation id="CCIndexingFilter"
-                      class="org.creativecommons.nutch.CCIndexingFilter"/>
-   </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
deleted file mode 100644
index e7c55c4..0000000
--- a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
+++ /dev/null
@@ -1,124 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.creativecommons.nutch;
-
-import org.apache.nutch.metadata.CreativeCommons;
-
-import org.apache.nutch.parse.Parse;
-
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.hadoop.io.Text;
-
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.metadata.Metadata;
-
-import org.apache.hadoop.conf.Configuration;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.*;
-import java.net.URL;
-import java.net.MalformedURLException;
-
-/** Indexing filter that copies Creative Commons license metadata into the "cc" field. */
-public class CCIndexingFilter implements IndexingFilter {
-  public static final Logger LOG = LoggerFactory
-      .getLogger(CCIndexingFilter.class);
-
-  /** Name of the document field that receives every feature added by this filter. */
-  public static String FIELD = "cc";
-
-  private Configuration conf;
-
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
-      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-
-    Metadata metadata = parse.getData().getParseMeta();
-    // index the license URL, if the parse metadata carries one
-    String licenseUrl = metadata.get(CreativeCommons.LICENSE_URL);
-    if (licenseUrl != null) {
-      if (LOG.isInfoEnabled()) {
-        LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString());
-      }
-
-      // add the entire license URL as the feature "license=<url>"
-      addFeature(doc, "license=" + licenseUrl);
-
-      // index the individual license attributes extracted from the license URL's path
-      addUrlFeatures(doc, licenseUrl);
-    }
-
-    // index the license location as the feature "meta=<location>"
-    String licenseLocation = metadata.get(CreativeCommons.LICENSE_LOCATION);
-    if (licenseLocation != null) {
-      addFeature(doc, "meta=" + licenseLocation);
-    }
-
-    // index the work type; NOTE(review): added verbatim, without a "type=" prefix - confirm intended
-    String workType = metadata.get(CreativeCommons.WORK_TYPE);
-    if (workType != null) {
-      addFeature(doc, workType);
-    }
-
-    return doc;
-  }
-
-  /**
-   * Adds one feature per path component of a license URL. URLs look like
-   * "http://creativecommons.org/licenses/xx-xx/xx/xx"; the path is split on '/'
-   * and '-', the first token is dropped, and a malformed URL is logged at WARN.
-   */
-  public void addUrlFeatures(NutchDocument doc, String urlString) {
-    try {
-      URL url = new URL(urlString);
-
-      // split the URL path into tokens at every slash and dash
-      StringTokenizer names = new StringTokenizer(url.getPath(), "/-");
-
-      if (names.hasMoreTokens())
-        names.nextToken(); // discard the first token (expected to be "licenses")
-
-      // every remaining token becomes its own feature
-      while (names.hasMoreTokens()) {
-        String feature = names.nextToken();
-        addFeature(doc, feature);
-      }
-    } catch (MalformedURLException e) {
-      if (LOG.isWarnEnabled()) {
-        LOG.warn("CC: failed to parse url: " + urlString + " : " + e);
-      }
-    }
-  }
-
-  private void addFeature(NutchDocument doc, String feature) {
-    doc.add(FIELD, feature);
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
deleted file mode 100644
index 1fa951e..0000000
--- a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
+++ /dev/null
@@ -1,300 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.creativecommons.nutch;
-
-import org.apache.nutch.metadata.CreativeCommons;
-import org.apache.nutch.parse.*;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.hadoop.conf.Configuration;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.*;
-import java.io.*;
-import java.net.*;
-import javax.xml.parsers.*;
-import org.xml.sax.InputSource;
-import org.w3c.dom.*;
-
-/** Adds metadata identifying the Creative Commons license used, if any. */
-public class CCParseFilter implements HtmlParseFilter {
-  public static final Logger LOG = LoggerFactory.getLogger(CCParseFilter.class);
-
-  /** Walks DOM tree, looking for RDF in comments and licenses in anchors. */
-  public static class Walker {
-    private URL base; // base url of page
-    private String rdfLicense; // subject url found, if any
-    private URL relLicense; // license url found, if any
-    private URL anchorLicense; // anchor url found, if any
-    private String workType; // work type URI
-
-    private Walker(URL base) {
-      this.base = base;
-    }
-
-    /**
-     * Walks the DOM under doc and adds the license url, where it was found
-     * ("rdf", "rel" or "a", in that order of preference) and the work type to
-     * metadata. Fails if nothing is found and unlicensed pages are excluded.
-     */
-    public static void walk(Node doc, URL base, Metadata metadata,
-        Configuration conf) throws ParseException {
-      Walker found = new Walker(base);
-      found.walk(doc);
-
-      // pick the license source: RDF subject, rel=license anchor, CC anchor
-      String location = null;
-      String url = null;
-      if (found.rdfLicense != null) {
-        location = "rdf";
-        url = found.rdfLicense;
-      } else if (found.relLicense != null) {
-        location = "rel";
-        url = found.relLicense.toString();
-      } else if (found.anchorLicense != null) {
-        location = "a";
-        url = found.anchorLicense.toString();
-      } else if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) {
-        throw new ParseException("No CC license.  Excluding.");
-      }
-
-      if (url != null) {
-        if (LOG.isInfoEnabled()) {
-          LOG.info("CC: found " + url + " in " + location + " of " + base);
-        }
-        metadata.add(CreativeCommons.LICENSE_URL, url);
-        metadata.add(CreativeCommons.LICENSE_LOCATION, location);
-      }
-
-      String type = found.workType;
-      if (type != null) {
-        if (LOG.isInfoEnabled()) {
-          LOG.info("CC: found " + type + " in " + base);
-        }
-        metadata.add(CreativeCommons.WORK_TYPE, type);
-      }
-    }
-
-    /**
-     * Depth-first scan of node and its descendants: elements are checked for
-     * license links, comments for embedded license RDF.
-     */
-    private void walk(Node node) {
-      if (node instanceof Element) {
-        findLicenseUrl((Element) node);
-      }
-      if (node instanceof Comment) {
-        findRdf(((Comment) node).getData());
-      }
-      NodeList kids = node.getChildNodes();
-      if (kids == null) {
-        return; // nothing below this node
-      }
-      for (int k = 0; k < kids.getLength(); k++) {
-        walk(kids.item(k));
-      }
-    }
-
-    /**
-     * Record the CC license link of an anchor element, if it has one. The href
-     * must resolve to http(s)://creativecommons.org/licenses/...; the first
-     * anchor whose rel tokens include "license" sets relLicense, otherwise the
-     * first matching anchor sets anchorLicense.
-     */
-    private void findLicenseUrl(Element element) {
-      // only look in Anchor elements
-      if (!"a".equalsIgnoreCase(element.getTagName()))
-        return;
-      // require an href; DOM getAttribute() returns "" for a missing attribute
-      String href = element.getAttribute("href");
-      if (href == null || href.trim().length() == 0)
-        return;
-
-      try {
-        URL url = new URL(base, href); // resolve the url
-        String proto = url.getProtocol();
-        String path = url.getPath();
-        if (("http".equalsIgnoreCase(proto) || "https".equalsIgnoreCase(proto))
-            && "creativecommons.org".equalsIgnoreCase(url.getHost())
-            && path != null && path.startsWith("/licenses/")
-            && path.length() > "/licenses/".length()) {
-          // rel is a space separated, case-insensitive list of link types
-          String rel = element.getAttribute("rel");
-          boolean viaRel = rel != null
-              && rel.matches("(?is)(.*\\s)?license(\\s.*)?");
-          if (viaRel && this.relLicense == null) {
-            this.relLicense = url; // found rel license
-          } else if (this.anchorLicense == null) {
-            this.anchorLicense = url; // found anchor license
-          }
-        }
-      } catch (MalformedURLException e) { // ignore malformed urls
-      }
-    }
-
-    /**
-     * Namespace aware XML parser factory. The XML it parses comes from crawled
-     * pages, so DOCTYPE declarations are refused to rule out external entity
-     * (XXE) and entity expansion attacks.
-     */
-    private static final DocumentBuilderFactory FACTORY = DocumentBuilderFactory
-        .newInstance();
-    static {
-      FACTORY.setNamespaceAware(true);
-      try {
-        FACTORY.setFeature(
-            "http://apache.org/xml/features/disallow-doctype-decl", true);
-      } catch (ParserConfigurationException e) {
-        // parser does not know the feature: keep working, but say so
-        LOG.warn("CC: XML parser cannot disallow DOCTYPE declarations: " + e);
-      }
-    }
-
-    /** Creative Commons' namespace URI. */
-    private static final String CC_NS = "http://web.resource.org/cc/";
-
-    /** Dublin Core namespace URI. */
-    private static final String DC_NS = "http://purl.org/dc/elements/1.1/";
-
-    /** RDF syntax namespace URI. */
-    private static final String RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
-
-    /**
-     * Look for Creative Commons RDF in the text of an HTML comment. Stores the
-     * rdf:about of the last cc:License that has one in rdfLicense, and the
-     * short name of the last recognised dc:type inside a cc:Work in workType.
-     * A comment that is not well-formed XML, or that does not hold exactly one
-     * rdf:RDF element, is reported at warn level and otherwise ignored.
-     */
-    private void findRdf(String comment) {
-      // first check for likely RDF in comment
-      if (comment.indexOf("RDF") < 0 || comment.indexOf(CC_NS) < 0)
-        return; // no RDF, abort
-
-      // try to parse the XML
-      Document doc;
-      try {
-        DocumentBuilder parser = FACTORY.newDocumentBuilder();
-        doc = parser.parse(new InputSource(new StringReader(comment)));
-      } catch (Exception e) {
-        if (LOG.isWarnEnabled()) {
-          LOG.warn("CC: Failed to parse RDF in " + base + ": " + e);
-        }
-        return;
-      }
-
-      // check that root is rdf:RDF
-      NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF");
-      if (roots.getLength() != 1) {
-        if (LOG.isWarnEnabled()) {
-          LOG.warn("CC: No RDF root in " + base);
-        }
-        return;
-      }
-      Element rdf = (Element) roots.item(0);
-
-      // license is the rdf:about= attribute of a cc:License inside rdf:RDF
-      NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License");
-      for (int i = 0; i < licenses.getLength(); i++) {
-        Element license = (Element) licenses.item(i);
-        Attr about = license.getAttributeNodeNS(RDF_NS, "about");
-        if (about != null) { // getAttributeNodeNS() is null when absent
-          this.rdfLicense = about.getValue();
-        }
-      }
-
-      // get cc:Work nodes from rdf:RDF
-      NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work");
-      for (int i = 0; i < works.getLength(); i++) {
-        // get dc:type nodes from this cc:Work only, not from all of rdf:RDF
-        Element work = (Element) works.item(i);
-        NodeList types = work.getElementsByTagNameNS(DC_NS, "type");
-
-        for (int j = 0; j < types.getLength(); j++) {
-          Element type = (Element) types.item(j);
-          Attr resource = type.getAttributeNodeNS(RDF_NS, "resource");
-          if (resource == null)
-            continue; // dc:type without rdf:resource
-
-          // keep an earlier work type when this URI is not a known one
-          String name = WORK_TYPE_NAMES.get(resource.getValue());
-          if (name != null) {
-            this.workType = name;
-          }
-        }
-      }
-    }
-  }
-
-  private static final HashMap<String, String> WORK_TYPE_NAMES = new HashMap<String, String>();
-  static { // short work type names, keyed by DCMI type URI
-    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video");
-    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image");
-    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio");
-    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
-    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive",
-        "interactive");
-    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software");
-    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
-  }
-
-  private Configuration conf;
-
-  /**
-   * Adds Creative Commons metadata found in the DOM tree of an HTML page to
-   * the parse metadata of that page. When the base url of the content is
-   * malformed, or the walk rejects the page with a {@link ParseException},
-   * the page's entry in the parse result is replaced by an empty parse built
-   * from that exception.
-   *
-   * @param content the page; supplies the page url and the base url
-   * @param parseResult parse result holding the parse of the page
-   * @param metaTags not used by this filter
-   * @param doc DOM tree of the page
-   * @return the given <code>parseResult</code>
-   */
-  public ParseResult filter(Content content, ParseResult parseResult,
-      HTMLMetaTags metaTags, DocumentFragment doc) {
-    String pageUrl = content.getUrl();
-    Parse parse = parseResult.get(pageUrl);
-    Exception failure = null;
-    try {
-      URL base = new URL(content.getBaseUrl());
-      Walker.walk(doc, base, parse.getData().getParseMeta(), getConf());
-    } catch (MalformedURLException e) {
-      failure = e;
-    } catch (ParseException e) {
-      failure = e;
-    }
-
-    if (failure != null) {
-      Parse empty = new ParseStatus(failure).getEmptyParse(getConf());
-      parseResult.put(pageUrl, new ParseText(empty.getText()), empty.getData());
-    }
-    return parseResult;
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf; // read back by getConf(), which filter() passes on
-  }
-
-  public Configuration getConf() {
-    return this.conf; // null until setConf() has been called
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/src/java/org/creativecommons/nutch/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/package.html b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/package.html
deleted file mode 100644
index 0c91293..0000000
--- a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/package.html
+++ /dev/null
@@ -1,5 +0,0 @@
-<html>
-<body>
-<p>Sample plugins that parse and index Creative Commons metadata.</p>
-</body>
-</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java b/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
deleted file mode 100755
index 41be9ed..0000000
--- a/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.creativecommons.nutch;
-
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.protocol.Content;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-import java.io.*;
-
-public class TestCCParseFilter {
-
-  private static final File testDir = new File(System.getProperty("test.input"));
-
-  @Test
-  public void testPages() throws Exception {
-    pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
-        "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
-    // NOTE: Tika reports the location as "a" here, parse-html as "rel"
-    // (the value asserted below). TODO: check why the parsers differ
-    pageTest(new File(testDir, "rel.html"), "http://foo.com/",
-        "http://creativecommons.org/licenses/by-nc/2.0", "rel", null);
-    // NOTE: Tika reports the location as "a" here, parse-html as "rdf"
-    // (the value asserted below). TODO: check why the parsers differ
-    pageTest(new File(testDir, "rdf.html"), "http://foo.com/",
-        "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text");
-  }
-
-  /** Asserts the CC parse metadata of file parsed as url (null = absent). */
-  public void pageTest(File file, String url, String license, String location,
-      String type) throws Exception {
-    ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
-    InputStream in = new FileInputStream(file);
-    try {
-      byte[] buffer = new byte[1024];
-      for (int n = in.read(buffer); n != -1; n = in.read(buffer)) {
-        out.write(buffer, 0, n);
-      }
-    } finally {
-      in.close(); // release the file handle even if reading fails
-    }
-
-    Configuration conf = NutchConfiguration.create();
-    Content content = new Content(url, url, out.toByteArray(), "text/html",
-        new Metadata(), conf);
-    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
-
-    Metadata metadata = parse.getData().getParseMeta();
-    Assert.assertEquals(license, metadata.get("License-Url"));
-    Assert.assertEquals(location, metadata.get("License-Location"));
-    Assert.assertEquals(type, metadata.get("Work-Type"));
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/feed/build.xml b/src/plugin/feed/build.xml
deleted file mode 100644
index 7fe7050..0000000
--- a/src/plugin/feed/build.xml
+++ /dev/null
@@ -1,45 +0,0 @@
-<?xml version="1.0"?>
-<!--
-	Licensed to the Apache Software Foundation (ASF) under one or more
-	contributor license agreements.  See the NOTICE file distributed with
-	this work for additional information regarding copyright ownership.
-	The ASF licenses this file to You under the Apache License, Version 2.0
-	(the "License"); you may not use this file except in compliance with
-	the License.  You may obtain a copy of the License at
-	
-	http://www.apache.org/licenses/LICENSE-2.0
-	
-	Unless required by applicable law or agreed to in writing, software
-	distributed under the License is distributed on an "AS IS" BASIS,
-	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	See the License for the specific language governing permissions and
-	limitations under the License.
--->
-
-<project name="feed" default="jar-core">
-
-    <import file="../build-plugin.xml" />
-    
-    <!-- Build the lib-xml jar before this plugin is compiled -->
-    <target name="deps-jar">
-      <ant target="jar" inheritall="false" dir="../lib-xml"/>
-    </target>
-
-    <!-- Add the lib-xml jars found under the build directory to the classpath -->
-    <path id="plugin.deps">
-      <fileset dir="${nutch.root}/build">
-        <include name="**/lib-xml/*.jar" />
-      </fileset>
-    </path>
-
-    <!-- Deploy the plugins the unit tests depend on -->
-    <target name="deps-test">
-      <ant target="deploy" inheritall="false"
-           dir="../nutch-extensionpoints" />
-      <ant target="deploy" inheritall="false" dir="../protocol-file" />
-    </target>
-    
-    <!-- Sample feed for the junit test. NOTE(review): both tasks sit outside any target -->
-    <mkdir dir="${build.test}/data" />
-    <copy file="sample/rsstest.rss" todir="${build.test}/data" />
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/feed/ivy.xml b/src/plugin/feed/ivy.xml
deleted file mode 100644
index c29bd03..0000000
--- a/src/plugin/feed/ivy.xml
+++ /dev/null
@@ -1,43 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../../ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!-- get the artifact from our module name -->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-    <dependency org="rome" name="rome" rev="0.9" conf="*->master"/>
-    <dependency org="org.jdom" name="jdom" rev="1.1" conf="*->master"/>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/feed/plugin.xml b/src/plugin/feed/plugin.xml
deleted file mode 100644
index 3a68d8d..0000000
--- a/src/plugin/feed/plugin.xml
+++ /dev/null
@@ -1,49 +0,0 @@
-<?xml version="1.0"?>
-<!--
-	Licensed to the Apache Software Foundation (ASF) under one or more
-	contributor license agreements.  See the NOTICE file distributed with
-	this work for additional information regarding copyright ownership.
-	The ASF licenses this file to You under the Apache License, Version 2.0
-	(the "License"); you may not use this file except in compliance with
-	the License.  You may obtain a copy of the License at
-	
-	http://www.apache.org/licenses/LICENSE-2.0
-	
-	Unless required by applicable law or agreed to in writing, software
-	distributed under the License is distributed on an "AS IS" BASIS,
-	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	See the License for the specific language governing permissions and
-	limitations under the License.
--->
-<plugin id="feed" name="Feed Parse/Index/Query Plug-in" version="1.0.0"
-        provider-name="nutch.org">
-  <runtime>
-    <library name="feed.jar">
-      <export name="*"/>
-    </library>
-    <library name="rome-0.9.jar"/>
-    <library name="jdom-1.1.jar"/>
-  </runtime>
-  <!-- Plugins this plugin imports -->
-  <requires>
-    <import plugin="nutch-extensionpoints"/>
-    <import plugin="lib-xml"/>
-  </requires>
-  <!-- Feed parser and the content types / path suffix declared for it -->
-  <extension id="org.apache.nutch.parse.feed" name="Feed Parser"
-             point="org.apache.nutch.parse.Parser">
-    <implementation id="org.apache.nutch.parse.feed.FeedParser"
-                    class="org.apache.nutch.parse.feed.FeedParser">
-      <parameter name="contentType" value="application/rss+xml"/>
-      <parameter name="contentType" value="application/atom+xml"/>
-      <parameter name="contentType" value="text/xml"/>
-      <parameter name="pathSuffix" value="rss"/>
-    </implementation>
-  </extension>
-  <!-- Feed indexing filter -->
-  <extension id="org.apache.nutch.indexer.feed" name="Feed Indexer"
-             point="org.apache.nutch.indexer.IndexingFilter">
-    <implementation id="FeedIndexingFilter"
-                    class="org.apache.nutch.indexer.feed.FeedIndexingFilter"/>
-  </extension>
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/sample/rsstest.rss
----------------------------------------------------------------------
diff --git a/src/plugin/feed/sample/rsstest.rss b/src/plugin/feed/sample/rsstest.rss
deleted file mode 100644
index 758f6a1..0000000
--- a/src/plugin/feed/sample/rsstest.rss
+++ /dev/null
@@ -1,36 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1" ?>
-<!--
-	Licensed to the Apache Software Foundation (ASF) under one or more
-	contributor license agreements.  See the NOTICE file distributed with
-	this work for additional information regarding copyright ownership.
-	The ASF licenses this file to You under the Apache License, Version 2.0
-	(the "License"); you may not use this file except in compliance with
-	the License.  You may obtain a copy of the License at
-	
-	http://www.apache.org/licenses/LICENSE-2.0
-	
-	Unless required by applicable law or agreed to in writing, software
-	distributed under the License is distributed on an "AS IS" BASIS,
-	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-	See the License for the specific language governing permissions and
-	limitations under the License.
--->
-<rss version="0.91">
-    <channel>
-      <title>TestChannel</title>
-      <link>http://test.channel.com/</link> 
-      <description>Sample RSS File for Junit test</description> 
-      <language>en-us</language>
-      
-      <item>
-        <title>Home Page of Chris Mattmann</title>
-        <link>http://www-scf.usc.edu/~mattmann/</link>
-        <description>Chris Mattmann's home page</description>
-      </item>
-      <item>
-        <title>Awesome Open Source Search Engine</title> 
-        <link>http://www.nutch.org/</link> 
-        <description>Yup, that's what it is</description> 
-      </item>
-   </channel>
-</rss>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java b/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
deleted file mode 100644
index 94b440a..0000000
--- a/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
+++ /dev/null
@@ -1,129 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.indexer.feed;
-
-//JDK imports
-import java.util.Date;
-
-//APACHE imports
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.metadata.Feed;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-
-/**
- * @author dogacan
- * @author mattmann
- * @since NUTCH-444
- * 
- *        An {@link IndexingFilter} implementation to pull out the relevant
- *        extracted {@link Metadata} fields from the RSS feeds and into the
- *        index.
- * 
- */
-public class FeedIndexingFilter implements IndexingFilter {
-
-  public static final String dateFormatStr = "yyyyMMddHHmm";
-
-  private Configuration conf;
-
-  private final static String PUBLISHED_DATE = "publishedDate";
-
-  private final static String UPDATED_DATE = "updatedDate";
-
-  /**
-   * Copies the feed metadata found in the parse metadata of a page into its
-   * index document:
-   *
-   * <ul>
-   * <li>every {@link Feed#FEED_AUTHOR} and {@link Feed#FEED_TAGS} value and
-   * the {@link Feed#FEED} value, each under its metadata name</li>
-   * <li>{@link Feed#FEED_PUBLISHED} and {@link Feed#FEED_UPDATED}, read as
-   * milliseconds since the epoch and added as {@link Date} under the fields
-   * "publishedDate" and "updatedDate"</li>
-   * </ul>
-   *
-   * Metadata that is absent is skipped. The url, datum and inlinks arguments
-   * are not used.
-   */
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
-      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-    Metadata meta = parse.getData().getParseMeta();
-    String[] authors = meta.getValues(Feed.FEED_AUTHOR);
-    String[] tags = meta.getValues(Feed.FEED_TAGS);
-    String published = meta.get(Feed.FEED_PUBLISHED);
-    String updated = meta.get(Feed.FEED_UPDATED);
-    String feed = meta.get(Feed.FEED);
-
-    addAll(doc, Feed.FEED_AUTHOR, authors);
-    addAll(doc, Feed.FEED_TAGS, tags);
-
-    if (feed != null) {
-      doc.add(Feed.FEED, feed);
-    }
-    addDate(doc, PUBLISHED_DATE, published);
-    addDate(doc, UPDATED_DATE, updated);
-
-    return doc;
-  }
-
-  /** Adds each of the values, if there are any, to the field name. */
-  private static void addAll(NutchDocument doc, String name, String[] values) {
-    if (values == null) {
-      return;
-    }
-    for (int i = 0; i < values.length; i++) {
-      doc.add(name, values[i]);
-    }
-  }
-
-  /** Adds millis, unless it is null, to the field name as a Date. */
-  private static void addDate(NutchDocument doc, String name, String millis) {
-    if (millis != null) {
-      doc.add(name, new Date(Long.parseLong(millis)));
-    }
-  }
-
-  /**
-   * @return the {@link Configuration} last passed to
-   *         {@link #setConf(Configuration)}; null until it has been called.
-   */
-  public Configuration getConf() {
-    return conf;
-  }
-
-  /**
-   * Stores the {@link Configuration} object used to configure this
-   * {@link IndexingFilter}.
-   *
-   * @param conf
-   *          the {@link Configuration} to hand back from {@link #getConf()};
-   *          it is kept by reference, not copied.
-   */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java b/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java
deleted file mode 100644
index 8f52628..0000000
--- a/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Indexing filter to index meta data from RSS feeds.
- */
-package org.apache.nutch.indexer.feed;
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java b/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
deleted file mode 100644
index 936c885..0000000
--- a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
+++ /dev/null
@@ -1,374 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.parse.feed;
-
-// JDK imports
-import java.io.ByteArrayInputStream;
-import java.io.DataInputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.util.Date;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map.Entry;
-
-// APACHE imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.util.StringUtils;
-// import org.apache.nutch.indexer.anchor.AnchorIndexingFilter; removed as per NUTCH-1078
-import org.apache.nutch.metadata.Feed;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.URLFilters;
-import org.apache.nutch.net.URLNormalizers;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.parse.ParseText;
-import org.apache.nutch.parse.Parser;
-import org.apache.nutch.parse.ParserFactory;
-import org.apache.nutch.parse.ParserNotFound;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.EncodingDetector;
-import org.apache.nutch.util.NutchConfiguration;
-import org.xml.sax.InputSource;
-
-// ROME imports
-import com.sun.syndication.feed.synd.SyndCategory;
-import com.sun.syndication.feed.synd.SyndContent;
-import com.sun.syndication.feed.synd.SyndEntry;
-import com.sun.syndication.feed.synd.SyndFeed;
-import com.sun.syndication.feed.synd.SyndPerson;
-import com.sun.syndication.io.SyndFeedInput;
-
-/**
- * 
- * @author dogacan
- * @author mattmann
- * @since NUTCH-444
- * 
- *        <p>
- *        A new RSS/ATOM Feed{@link Parser} that rapidly parses all referenced
- *        links and content present in the feed.
- *        </p>
- * 
- */
-public class FeedParser implements Parser {
-
-  public static final String CHARSET_UTF8 = "charset=UTF-8";
-
-  public static final String TEXT_PLAIN_CONTENT_TYPE = "text/plain; "
-      + CHARSET_UTF8;
-
-  public static final Logger LOG = LoggerFactory.getLogger(FeedParser.class);
-
-  private Configuration conf;
-
-  private ParserFactory parserFactory;
-
-  private URLNormalizers normalizers;
-
-  private URLFilters filters;
-
-  private String defaultEncoding;
-
-  /**
-   * Parses the given feed and extracts out and parsers all linked items within
-   * the feed, using the underlying ROME feed parsing library.
-   * 
-   * @param content
-   *          A {@link Content} object representing the feed that is being
-   *          parsed by this {@link Parser}.
-   * 
-   * @return A {@link ParseResult} containing all {@link Parse}d feeds that were
-   *         present in the feed file that this {@link Parser} dealt with.
-   * 
-   */
-  public ParseResult getParse(Content content) {
-    SyndFeed feed = null;
-    ParseResult parseResult = new ParseResult(content.getUrl());
-
-    EncodingDetector detector = new EncodingDetector(conf);
-    detector.autoDetectClues(content, true);
-    String encoding = detector.guessEncoding(content, defaultEncoding);
-    try {
-      InputSource input = new InputSource(new ByteArrayInputStream(
-          content.getContent()));
-      input.setEncoding(encoding);
-      SyndFeedInput feedInput = new SyndFeedInput();
-      feed = feedInput.build(input);
-    } catch (Exception e) {
-      // return empty parse
-      LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: "
-          + StringUtils.stringifyException(e));
-      return new ParseStatus(e)
-          .getEmptyParseResult(content.getUrl(), getConf());
-    }
-
-    String feedLink = feed.getLink();
-    try {
-      feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
-      if (feedLink != null)
-        feedLink = filters.filter(feedLink);
-    } catch (Exception e) {
-      feedLink = null;
-    }
-
-    List<?> entries = feed.getEntries();
-    for (Object entry : entries) {
-      addToMap(parseResult, feed, feedLink, (SyndEntry) entry, content);
-    }
-
-    String feedDesc = stripTags(feed.getDescriptionEx());
-    String feedTitle = stripTags(feed.getTitleEx());
-
-    parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
-        new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0],
-        content.getMetadata()));
-
-    return parseResult;
-  }
-
-  /**
-   * 
-   * Sets the {@link Configuration} object for this {@link Parser}. This
-   * {@link Parser} expects the following configuration properties to be set:
-   * 
-   * <ul>
-   * <li>URLNormalizers - properties in the configuration object to set up the
-   * default url normalizers.</li>
-   * <li>URLFilters - properties in the configuration object to set up the
-   * default url filters.</li>
-   * </ul>
-   * 
-   * @param conf
-   *          The Hadoop {@link Configuration} object to use to configure this
-   *          {@link Parser}.
-   * 
-   */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    this.parserFactory = new ParserFactory(conf);
-    this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
-    this.filters = new URLFilters(conf);
-    this.defaultEncoding = conf.get("parser.character.encoding.default",
-        "windows-1252");
-  }
-
-  /**
-   * 
-   * @return The {@link Configuration} object used to configure this
-   *         {@link Parser}.
-   */
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  /**
-   * Runs a command line version of this {@link Parser}.
-   * 
-   * @param args
-   *          A single argument (expected at arg[0]) representing a path on the
-   *          local filesystem that points to a feed file.
-   * 
-   * @throws Exception
-   *           If any error occurs.
-   */
-  public static void main(String[] args) throws Exception {
-    if (args.length != 1) {
-      System.err.println("Usage: FeedParser <feed>");
-      System.exit(1);
-    }
-    String name = args[0];
-    String url = "file:" + name;
-    Configuration conf = NutchConfiguration.create();
-    FeedParser parser = new FeedParser();
-    parser.setConf(conf);
-    File file = new File(name);
-    byte[] bytes = new byte[(int) file.length()];
-    DataInputStream in = new DataInputStream(new FileInputStream(file));
-    in.readFully(bytes);
-    ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
-        "application/rss+xml", new Metadata(), conf));
-    for (Entry<Text, Parse> entry : parseResult) {
-      System.out.println("key: " + entry.getKey());
-      Parse parse = entry.getValue();
-      System.out.println("data: " + parse.getData());
-      System.out.println("text: " + parse.getText() + "\n");
-    }
-  }
-
-  private void addToMap(ParseResult parseResult, SyndFeed feed,
-      String feedLink, SyndEntry entry, Content content) {
-    String link = entry.getLink(), text = null, title = null;
-    Metadata parseMeta = new Metadata(), contentMeta = content.getMetadata();
-    Parse parse = null;
-    SyndContent description = entry.getDescription();
-
-    try {
-      link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK);
-
-      if (link != null)
-        link = filters.filter(link);
-    } catch (Exception e) {
-      e.printStackTrace();
-      return;
-    }
-
-    if (link == null)
-      return;
-
-    title = stripTags(entry.getTitleEx());
-
-    if (feedLink != null)
-      parseMeta.set("feed", feedLink);
-
-    addFields(parseMeta, contentMeta, feed, entry);
-
-    // some item descriptions contain markup text in them,
-    // so we temporarily set their content-type to parse them
-    // with another plugin
-    String contentType = contentMeta.get(Response.CONTENT_TYPE);
-
-    if (description != null)
-      text = description.getValue();
-
-    if (text == null) {
-      List<?> contents = entry.getContents();
-      StringBuilder buf = new StringBuilder();
-      for (Object syndContent : contents) {
-        buf.append(((SyndContent) syndContent).getValue());
-      }
-      text = buf.toString();
-    }
-
-    try {
-      Parser parser = parserFactory.getParsers(contentType, link)[0];
-      parse = parser.getParse(
-          new Content(link, link, text.getBytes(), contentType, contentMeta,
-              conf)).get(link);
-    } catch (ParserNotFound e) { /* ignore */
-    }
-
-    if (parse != null) {
-      ParseData data = parse.getData();
-      data.getContentMeta().remove(Response.CONTENT_TYPE);
-      mergeMetadata(data.getParseMeta(), parseMeta);
-      parseResult.put(link, new ParseText(parse.getText()),
-          new ParseData(ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(),
-              data.getContentMeta(), data.getParseMeta()));
-    } else {
-      contentMeta.remove(Response.CONTENT_TYPE);
-      parseResult.put(link, new ParseText(text), new ParseData(
-          ParseStatus.STATUS_FAILURE, title, new Outlink[0], contentMeta,
-          parseMeta));
-    }
-
-  }
-
-  private static String stripTags(SyndContent c) {
-    if (c == null)
-      return "";
-
-    String value = c.getValue();
-
-    String[] parts = value.split("<[^>]*>");
-    StringBuffer buf = new StringBuffer();
-
-    for (String part : parts)
-      buf.append(part);
-
-    return buf.toString().trim();
-  }
-
-  private void addFields(Metadata parseMeta, Metadata contentMeta,
-      SyndFeed feed, SyndEntry entry) {
-    List<?> authors = entry.getAuthors(), categories = entry.getCategories();
-    Date published = entry.getPublishedDate(), updated = entry.getUpdatedDate();
-    String contentType = null;
-
-    if (authors != null) {
-      for (Object o : authors) {
-        SyndPerson author = (SyndPerson) o;
-        String authorName = author.getName();
-        if (checkString(authorName)) {
-          parseMeta.add(Feed.FEED_AUTHOR, authorName);
-        }
-      }
-    } else {
-      // getAuthors may return null if feed is non-atom
-      // if so, call getAuthor to get Dublin Core module creator.
-      String authorName = entry.getAuthor();
-      if (checkString(authorName)) {
-        parseMeta.set(Feed.FEED_AUTHOR, authorName);
-      }
-    }
-
-    for (Object i : categories) {
-      parseMeta.add(Feed.FEED_TAGS, ((SyndCategory) i).getName());
-    }
-
-    if (published != null) {
-      parseMeta.set(Feed.FEED_PUBLISHED, Long.toString(published.getTime()));
-    }
-    if (updated != null) {
-      parseMeta.set(Feed.FEED_UPDATED, Long.toString(updated.getTime()));
-    }
-
-    SyndContent description = entry.getDescription();
-    if (description != null) {
-      contentType = description.getType();
-    } else {
-      // TODO: What to do if contents.size() > 1?
-      List<?> contents = entry.getContents();
-      if (contents.size() > 0) {
-        contentType = ((SyndContent) contents.get(0)).getType();
-      }
-    }
-
-    if (checkString(contentType)) {
-      // ROME may return content-type as html
-      if (contentType.equals("html"))
-        contentType = "text/html";
-      else if (contentType.equals("xhtml"))
-        contentType = "text/xhtml";
-      contentMeta.set(Response.CONTENT_TYPE, contentType + "; " + CHARSET_UTF8);
-    } else {
-      contentMeta.set(Response.CONTENT_TYPE, TEXT_PLAIN_CONTENT_TYPE);
-    }
-
-  }
-
-  private void mergeMetadata(Metadata first, Metadata second) {
-    for (String name : second.names()) {
-      String[] values = second.getValues(name);
-      for (String value : values) {
-        first.add(name, value);
-      }
-    }
-  }
-
-  private boolean checkString(String s) {
-    return s != null && !s.equals("");
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java b/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java
deleted file mode 100644
index 3b15968..0000000
--- a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Parse RSS feeds.
- */
-package org.apache.nutch.parse.feed;
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java b/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
deleted file mode 100644
index 36c8739..0000000
--- a/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
+++ /dev/null
@@ -1,124 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.feed;
-
-// JDK imports
-import java.util.Iterator;
-import java.util.Map;
-
-import org.junit.Assert;
-import org.junit.Test;
-// APACHE imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.ProtocolNotFound;
-import org.apache.nutch.util.NutchConfiguration;
-
-/**
- * 
- * @author mattmann
- * 
- *         Test Suite for the {@link FeedParser}.
- * 
- */
-public class TestFeedParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
-
-  // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/feed/build.xml during plugin compilation.
-
-  private String[] sampleFiles = { "rsstest.rss" };
-
-  public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class
-      .getName());
-
-  /**
-   * Calls the {@link FeedParser} on a sample RSS file and checks that there are
-   * 3 {@link ParseResult} entries including the below 2 links:
-   * <ul>
-   * <li>http://www-scf.usc.edu/~mattmann/</li>
-   * <li>http://www.nutch.org</li>
-   * </ul>
-   * 
-   * 
-   * @throws ProtocolNotFound
-   *           If the {@link Protocol}Layer cannot be loaded (required to fetch
-   *           the {@link Content} for the RSS file).
-   * @throws ParseException
-   *           If the {@link Parser}Layer cannot be loaded.
-   */
-  @Test
-  public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
-    String urlString;
-    Protocol protocol;
-    Content content;
-    ParseResult parseResult;
-
-    Configuration conf = NutchConfiguration.create();
-    for (int i = 0; i < sampleFiles.length; i++) {
-      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-      urlString = urlString.replace('\\', '/');
-
-      protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-
-      parseResult = new ParseUtil(conf).parseByExtensionId("feed", content);
-
-      Assert.assertEquals(3, parseResult.size());
-
-      boolean hasLink1 = false, hasLink2 = false, hasLink3 = false;
-
-      for (Iterator<Map.Entry<Text, Parse>> j = parseResult.iterator(); j
-          .hasNext();) {
-        Map.Entry<Text, Parse> entry = j.next();
-        if (entry.getKey().toString()
-            .equals("http://www-scf.usc.edu/~mattmann/")) {
-          hasLink1 = true;
-        } else if (entry.getKey().toString().equals("http://www.nutch.org/")) {
-          hasLink2 = true;
-        } else if (entry.getKey().toString().equals(urlString)) {
-          hasLink3 = true;
-        }
-
-        Assert.assertNotNull(entry.getValue());
-        Assert.assertNotNull(entry.getValue().getData());
-      }
-
-      if (!hasLink1 || !hasLink2 || !hasLink3) {
-        Assert.fail("Outlinks read from sample rss file are not correct!");
-      }
-    }
-
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/headings/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/headings/build.xml b/src/plugin/headings/build.xml
deleted file mode 100644
index d334ad1..0000000
--- a/src/plugin/headings/build.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="headings" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/headings/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/headings/ivy.xml b/src/plugin/headings/ivy.xml
deleted file mode 100644
index 5b8393b..0000000
--- a/src/plugin/headings/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-      <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/headings/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/headings/plugin.xml b/src/plugin/headings/plugin.xml
deleted file mode 100644
index 0d7921a..0000000
--- a/src/plugin/headings/plugin.xml
+++ /dev/null
@@ -1,45 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="headings"
-   name="Headings Parse Filter"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-
-   <runtime>
-      <library name="headings.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
-   <extension id="org.apache.nutch.parse.headings"
-              name="Nutch Headings Parse Filter"
-              point="org.apache.nutch.parse.HtmlParseFilter">
-
-      <implementation id="HeadingsParseFilter"
-                      class="org.apache.nutch.parse.headings.HeadingsParseFilter">
-      </implementation>
-
-   </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java b/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
deleted file mode 100644
index 657f260..0000000
--- a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
+++ /dev/null
@@ -1,124 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.headings;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.*;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.parse.HTMLMetaTags;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.HtmlParseFilter;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NodeWalker;
-import org.w3c.dom.*;
-
-/**
- * HtmlParseFilter to retrieve h1 and h2 values from the DOM.
- */
-public class HeadingsParseFilter implements HtmlParseFilter {
-
-  /**
-   * Pattern used to strip surpluss whitespace
-   */
-  protected static Pattern whitespacePattern = Pattern.compile("\\s+");
-
-  private Configuration conf;
-  private String[] headings;
-  private boolean multiValued = false;
-
-  public ParseResult filter(Content content, ParseResult parseResult,
-      HTMLMetaTags metaTags, DocumentFragment doc) {
-    Parse parse = parseResult.get(content.getUrl());
-
-    for (int i = 0; headings != null && i < headings.length; i++) {
-      List<String> discoveredHeadings = getElement(doc, headings[i]);
-
-      if (discoveredHeadings.size() > 0) {
-        for (String heading : discoveredHeadings) {
-          if (heading != null) {
-            heading.trim();
-
-            if (heading.length() > 0) {
-              parse.getData().getParseMeta().add(headings[i], heading);
-            }
-          }
-        }
-      }
-    }
-
-    return parseResult;
-  }
-
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-
-    headings = conf.getStrings("headings");
-    multiValued = conf.getBoolean("headings.multivalued", false);
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  /**
-   * Finds the specified element and returns its value
-   */
-  protected List<String> getElement(DocumentFragment doc, String element) {
-    List<String> headings = new ArrayList<String>();
-    NodeWalker walker = new NodeWalker(doc);
-
-    while (walker.hasNext()) {
-      Node currentNode = walker.nextNode();
-
-      if (currentNode.getNodeType() == Node.ELEMENT_NODE) {
-        if (element.equalsIgnoreCase(currentNode.getNodeName())) {
-          headings.add(getNodeValue(currentNode));
-
-          // Check for multiValued here, if disabled we don't need
-          // to discover more headings.
-          if (!multiValued) {
-            break;
-          }
-        }
-      }
-    }
-
-    return headings;
-  }
-
-  /**
-   * Returns the text value of the specified Node and child nodes
-   */
-  protected static String getNodeValue(Node node) {
-    StringBuilder buffer = new StringBuilder();
-
-    NodeList children = node.getChildNodes();
-
-    for (int i = 0; i < children.getLength(); i++) {
-      if (children.item(i).getNodeType() == Node.TEXT_NODE) {
-        buffer.append(children.item(i).getNodeValue());
-      }
-    }
-
-    // Return with stripped surplus whitespace
-    Matcher matcher = whitespacePattern.matcher(buffer.toString().trim());
-    return matcher.replaceAll(" ").trim();
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java b/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java
deleted file mode 100644
index 363e0b2..0000000
--- a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Parse filter to extract headings (h1, h2, etc.) from DOM parse tree.
- */
-package org.apache.nutch.parse.headings;
-

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-anchor/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-anchor/build.xml b/src/plugin/index-anchor/build.xml
deleted file mode 100644
index 597b532..0000000
--- a/src/plugin/index-anchor/build.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="index-anchor" default="jar-core">
-
-  <import file="../build-plugin.xml" />
-
-</project>
\ No newline at end of file