You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:49:06 UTC
[22/69] [abbrv] [partial] nutch git commit: Re arranged the source
code as per maven conventions for build
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
deleted file mode 100755
index 75ae2e7..0000000
--- a/src/plugin/build.xml
+++ /dev/null
@@ -1,213 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="Nutch" default="deploy-core" basedir=".">
-
- <target name="deploy-core">
- <ant target="compile-core" inheritall="false" dir="../.."/>
- <ant target="deploy"/>
- </target>
-
- <!-- ====================================================== -->
- <!-- Build & deploy all the plugin jars. -->
- <!-- ====================================================== -->
- <target name="deploy">
- <ant dir="creativecommons" target="deploy"/>
- <ant dir="feed" target="deploy"/>
- <ant dir="headings" target="deploy"/>
- <ant dir="index-basic" target="deploy"/>
- <ant dir="index-anchor" target="deploy"/>
- <ant dir="index-geoip" target="deploy"/>
- <ant dir="index-more" target="deploy"/>
- <ant dir="index-replace" target="deploy"/>
- <ant dir="index-static" target="deploy"/>
- <ant dir="index-metadata" target="deploy"/>
- <ant dir="index-links" target="deploy"/>
- <ant dir="mimetype-filter" target="deploy"/>
- <ant dir="indexer-cloudsearch" target="deploy"/>
- <ant dir="indexer-dummy" target="deploy"/>
- <ant dir="indexer-elastic" target="deploy"/>
- <ant dir="indexer-solr" target="deploy"/>
- <ant dir="language-identifier" target="deploy"/>
- <ant dir="lib-http" target="deploy"/>
- <ant dir="lib-nekohtml" target="deploy"/>
- <ant dir="lib-regex-filter" target="deploy"/>
- <ant dir="lib-xml" target="deploy"/>
- <ant dir="microformats-reltag" target="deploy"/>
- <ant dir="nutch-extensionpoints" target="deploy"/>
- <ant dir="protocol-file" target="deploy"/>
- <ant dir="protocol-ftp" target="deploy"/>
- <ant dir="protocol-http" target="deploy"/>
- <ant dir="protocol-httpclient" target="deploy"/>
- <ant dir="lib-htmlunit" target="deploy"/>
- <ant dir="protocol-htmlunit" target="deploy" />
- <ant dir="lib-selenium" target="deploy"/>
- <ant dir="protocol-selenium" target="deploy" />
- <ant dir="protocol-interactiveselenium" target="deploy" />
- <ant dir="parse-ext" target="deploy"/>
- <ant dir="parse-js" target="deploy"/>
- <ant dir="parse-html" target="deploy"/>
- <ant dir="parse-metatags" target="deploy"/>
- <ant dir="parse-swf" target="deploy"/>
- <ant dir="parse-tika" target="deploy"/>
- <ant dir="parse-zip" target="deploy"/>
- <ant dir="scoring-depth" target="deploy"/>
- <ant dir="scoring-opic" target="deploy"/>
- <ant dir="scoring-link" target="deploy"/>
- <ant dir="scoring-similarity" target="deploy"/>
- <ant dir="subcollection" target="deploy"/>
- <ant dir="tld" target="deploy"/>
- <ant dir="urlfilter-automaton" target="deploy"/>
- <ant dir="urlfilter-domain" target="deploy" />
- <ant dir="urlfilter-domainblacklist" target="deploy" />
- <ant dir="urlfilter-prefix" target="deploy"/>
- <ant dir="urlfilter-regex" target="deploy"/>
- <ant dir="urlfilter-suffix" target="deploy"/>
- <ant dir="urlfilter-validator" target="deploy"/>
- <ant dir="urlfilter-ignoreexempt" target="deploy"/>
- <ant dir="parsefilter-naivebayes" target="deploy"/>
- <ant dir="parsefilter-regex" target="deploy"/>
- <ant dir="urlmeta" target="deploy"/>
- <ant dir="urlnormalizer-ajax" target="deploy"/>
- <ant dir="urlnormalizer-basic" target="deploy"/>
- <ant dir="urlnormalizer-host" target="deploy"/>
- <ant dir="urlnormalizer-pass" target="deploy"/>
- <ant dir="urlnormalizer-protocol" target="deploy"/>
- <ant dir="urlnormalizer-querystring" target="deploy"/>
- <ant dir="urlnormalizer-regex" target="deploy"/>
- <ant dir="urlnormalizer-slash" target="deploy"/>
- </target>
-
- <!-- ====================================================== -->
- <!-- Test all of the plugins. -->
- <!-- ====================================================== -->
- <target name="test">
- <parallel threadCount="2">
- <ant dir="creativecommons" target="test"/>
- <ant dir="index-basic" target="test"/>
- <ant dir="index-anchor" target="test"/>
- <ant dir="index-geoip" target="test"/>
- <ant dir="index-more" target="test"/>
- <ant dir="index-static" target="test"/>
- <ant dir="index-replace" target="test"/>
- <ant dir="index-links" target="test"/>
- <ant dir="mimetype-filter" target="test"/>
- <ant dir="language-identifier" target="test"/>
- <ant dir="lib-http" target="test"/>
- <ant dir="protocol-file" target="test"/>
- <ant dir="protocol-http" target="test"/>
- <ant dir="protocol-httpclient" target="test"/>
- <!--ant dir="parse-ext" target="test"/-->
- <ant dir="feed" target="test"/>
- <ant dir="parse-html" target="test"/>
- <ant dir="parse-metatags" target="test"/>
- <ant dir="parse-swf" target="test"/>
- <ant dir="parse-tika" target="test"/>
- <ant dir="parse-zip" target="test"/>
- <ant dir="parsefilter-regex" target="test"/>
- <ant dir="subcollection" target="test"/>
- <ant dir="urlfilter-automaton" target="test"/>
- <ant dir="urlfilter-domain" target="test"/>
- <ant dir="urlfilter-domainblacklist" target="test"/>
- <ant dir="urlfilter-prefix" target="test"/>
- <ant dir="urlfilter-regex" target="test"/>
- <ant dir="urlfilter-suffix" target="test"/>
- <ant dir="urlfilter-validator" target="test"/>
- <ant dir="urlfilter-ignoreexempt" target="test"/>
- <ant dir="urlnormalizer-ajax" target="test"/>
- <ant dir="urlnormalizer-basic" target="test"/>
- <ant dir="urlnormalizer-host" target="test"/>
- <ant dir="urlnormalizer-pass" target="test"/>
- <ant dir="urlnormalizer-protocol" target="test"/>
- <ant dir="urlnormalizer-querystring" target="test"/>
- <ant dir="urlnormalizer-regex" target="test"/>
- <ant dir="urlnormalizer-slash" target="test"/>
- </parallel>
- </target>
-
- <!-- ====================================================== -->
- <!-- Clean all of the plugins. -->
- <!-- ====================================================== -->
- <target name="clean">
- <ant dir="creativecommons" target="clean"/>
- <ant dir="feed" target="clean"/>
- <ant dir="headings" target="clean"/>
- <ant dir="index-basic" target="clean"/>
- <ant dir="index-anchor" target="clean"/>
- <ant dir="index-geoip" target="clean"/>
- <ant dir="index-more" target="clean"/>
- <ant dir="index-static" target="clean"/>
- <ant dir="index-replace" target="clean"/>
- <ant dir="index-metadata" target="clean"/>
- <ant dir="index-links" target="clean"/>
- <ant dir="mimetype-filter" target="clean"/>
- <ant dir="indexer-cloudsearch" target="clean"/>
- <ant dir="indexer-dummy" target="clean"/>
- <ant dir="indexer-elastic" target="clean"/>
- <ant dir="indexer-solr" target="clean"/>
- <ant dir="language-identifier" target="clean"/>
- <!-- <ant dir="lib-commons-httpclient" target="clean"/> -->
- <ant dir="lib-http" target="clean"/>
- <!-- <ant dir="lib-lucene-analyzers" target="clean"/>-->
- <ant dir="lib-nekohtml" target="clean"/>
- <ant dir="lib-regex-filter" target="clean"/>
- <ant dir="lib-xml" target="clean"/>
- <ant dir="microformats-reltag" target="clean"/>
- <ant dir="nutch-extensionpoints" target="clean"/>
- <ant dir="protocol-file" target="clean"/>
- <ant dir="protocol-ftp" target="clean"/>
- <ant dir="protocol-http" target="clean"/>
- <ant dir="protocol-httpclient" target="clean"/>
- <ant dir="lib-htmlunit" target="clean"/>
- <ant dir="protocol-htmlunit" target="clean" />
- <ant dir="lib-selenium" target="clean"/>
- <ant dir="protocol-selenium" target="clean" />
- <ant dir="protocol-interactiveselenium" target="clean" />
- <ant dir="parse-ext" target="clean"/>
- <ant dir="parse-js" target="clean"/>
- <ant dir="parse-html" target="clean"/>
- <ant dir="parse-metatags" target="clean"/>
- <ant dir="parse-swf" target="clean"/>
- <ant dir="parse-tika" target="clean"/>
- <ant dir="parse-zip" target="clean"/>
- <ant dir="parsefilter-regex" target="clean"/>
- <ant dir="scoring-depth" target="clean"/>
- <ant dir="scoring-opic" target="clean"/>
- <ant dir="scoring-link" target="clean"/>
- <ant dir="scoring-similarity" target="clean"/>
- <ant dir="subcollection" target="clean"/>
- <ant dir="tld" target="clean"/>
- <ant dir="urlfilter-automaton" target="clean"/>
- <ant dir="urlfilter-domain" target="clean" />
- <ant dir="urlfilter-domainblacklist" target="clean" />
- <ant dir="urlfilter-prefix" target="clean"/>
- <ant dir="urlfilter-regex" target="clean"/>
- <ant dir="urlfilter-suffix" target="clean"/>
- <ant dir="urlfilter-validator" target="clean"/>
- <ant dir="urlfilter-ignoreexempt" target="clean"/>
- <ant dir="parsefilter-naivebayes" target="clean" />
- <ant dir="urlmeta" target="clean"/>
- <ant dir="urlnormalizer-ajax" target="clean"/>
- <ant dir="urlnormalizer-basic" target="clean"/>
- <ant dir="urlnormalizer-host" target="clean"/>
- <ant dir="urlnormalizer-pass" target="clean"/>
- <ant dir="urlnormalizer-protocol" target="clean"/>
- <ant dir="urlnormalizer-querystring" target="clean"/>
- <ant dir="urlnormalizer-regex" target="clean"/>
- <ant dir="urlnormalizer-slash" target="clean"/>
- </target>
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/README.txt
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/README.txt b/src/plugin/creativecommons/README.txt
deleted file mode 100644
index d4d7b65..0000000
--- a/src/plugin/creativecommons/README.txt
+++ /dev/null
@@ -1 +0,0 @@
-Support for crawling and searching Creative-Commons licensed content.
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/build.xml b/src/plugin/creativecommons/build.xml
deleted file mode 100755
index 6443d7f..0000000
--- a/src/plugin/creativecommons/build.xml
+++ /dev/null
@@ -1,28 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="creativecommons" default="jar-core">
-
- <import file="../build-plugin.xml"/>
-
- <!-- Deploy Unit test dependencies -->
- <target name="deps-test">
- <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
- <!-- <ant target="deploy" inheritall="false" dir="../parse-html"/> -->
- </target>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/conf/crawl-urlfilter.txt
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/conf/crawl-urlfilter.txt b/src/plugin/creativecommons/conf/crawl-urlfilter.txt
deleted file mode 100644
index 324617f..0000000
--- a/src/plugin/creativecommons/conf/crawl-urlfilter.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-# Creative Commons crawl filter
-
-# Each non-comment, non-blank line contains a regular expression
-# prefixed by '+' or '-'. The first matching pattern in the file
-# determines whether a URL is included or ignored. If no pattern
-# matches, the URL is ignored.
-
-# skip file:, ftp:, mailto:, & https: urls
--^(file|ftp|mailto|https):
-
-# skip image and other suffixes we can't yet parse
--\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|rtf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|mp3|rss|xml|doc|pdf|txt|DOC|PDF|TXT)$
-
-# skip URLs containing certain characters as probable queries, etc.
--[?*!@=]
-
-# accept anything else
-+.
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/conf/nutch-site.xml
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/conf/nutch-site.xml b/src/plugin/creativecommons/conf/nutch-site.xml
deleted file mode 100644
index 71e344b..0000000
--- a/src/plugin/creativecommons/conf/nutch-site.xml
+++ /dev/null
@@ -1,50 +0,0 @@
-<?xml version="1.0"?>
-<?xml-stylesheet type="text/xsl" href="nutch-conf.xsl"?>
-
-<!-- Creative Commons' Nutch configuration -->
-
-<nutch-conf>
-
-<property>
- <name>http.agent.name</name>
- <value>CreativeCommons</value>
- <description>Our HTTP 'User-Agent' request header.</description>
-</property>
-
-<property>
- <name>http.robots.agents</name>
- <value>CreativeCommons,Nutch,*</value>
- <description>The agent strings we'll look for in robots.txt files,
- comma-separated, in decreasing order of precedence.</description>
-</property>
-
-<property>
- <name>fetcher.server.delay</name>
- <value>2.0</value>
- <description>We need to be more polite than when crawling an
- intranet that we control.</description>
-</property>
-
-<property>
- <name>http.max.delays</name>
- <value>3</value>
- <description>The CC crawl visits a large number of different
- hosts, so we should not need to delay much.</description>
-</property>
-
-<property>
- <name>creativecommons.exclude.unlicensed</name>
- <value>true</value>
- <description>Exclude HTML content which does not contain a CC license.
- </description>
-</property>
-
-<property>
- <name>plugin.excludes</name>
- <value>parse-(?!html).*</value>
- <description>Exclude non-HTML content, since we don't know how to
- find a CC license in anything but HTML.
- </description>
-</property>
-
-</nutch-conf>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/data/anchor.html
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/data/anchor.html b/src/plugin/creativecommons/data/anchor.html
deleted file mode 100755
index 90b5227..0000000
--- a/src/plugin/creativecommons/data/anchor.html
+++ /dev/null
@@ -1,9 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/1999/REC-html401-19991224/loose.dtd">
-<html>
-<head>
-</head>
-<body>
-<p><a href="http://creativecommons.org/licenses/by-nc-sa/1.0"><img alt="Creative Commons License" src="http://creativecommons.org/images/public/somerights.gif" align="right"></a>This file is licensed under a
-<a href="http://creativecommons.org/licenses/by-nc-sa/1.0">Creative Commons License</a>.</p>
-</body>
-</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/data/rdf.html
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/data/rdf.html b/src/plugin/creativecommons/data/rdf.html
deleted file mode 100755
index fb2c34d..0000000
--- a/src/plugin/creativecommons/data/rdf.html
+++ /dev/null
@@ -1,35 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<html>
- <head>
- </head>
- <body>
-
-<!-- Creative Commons License -->
-<p><a href="http://creativecommons.org/licenses/by-nc/1.0"><img alt="Creative Commons License" border="0" src="http://creativecommons.org/images/public/somerights.gif" /></a><br />
-This work is licensed under a
-<a href="http://creativecommons.org/licenses/by-nc/1.0">Creative Commons License</a>.
-<!-- end Creative Commons License -->
-
- <!--
-<rdf:RDF xmlns="http://web.resource.org/cc/"
- xmlns:dc="http://purl.org/dc/elements/1.1/"
- xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
-<Work rdf:about="http://boingboing.net">
- <dc:type rdf:resource="http://purl.org/dc/dcmitype/Text" />
- <license rdf:resource="http://creativecommons.org/licenses/by-nc/1.0" />
-</Work>
-
-<License rdf:about="http://creativecommons.org/licenses/by-nc/1.0">
- <requires rdf:resource="http://web.resource.org/cc/Attribution" />
- <permits rdf:resource="http://web.resource.org/cc/DerivativeWorks" />
- <permits rdf:resource="http://web.resource.org/cc/Reproduction" />
- <permits rdf:resource="http://web.resource.org/cc/Distribution" />
- <prohibits rdf:resource="http://web.resource.org/cc/CommercialUse" />
- <requires rdf:resource="http://web.resource.org/cc/Notice" />
-</License>
-
-</rdf:RDF>
-
--->
- </body>
-</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/data/rel.html
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/data/rel.html b/src/plugin/creativecommons/data/rel.html
deleted file mode 100755
index 413d52f..0000000
--- a/src/plugin/creativecommons/data/rel.html
+++ /dev/null
@@ -1,6 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml" lang="en"><head>
-</head><body>
-<a rel="license" href="http://creativecommons.org/licenses/by-nc/2.0">CC by-nc</a>
-</body>
-</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/ivy.xml b/src/plugin/creativecommons/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/creativecommons/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
- <info organisation="org.apache.nutch" module="${ant.project.name}">
- <license name="Apache 2.0"/>
- <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
- <description>
- Apache Nutch
- </description>
- </info>
-
- <configurations>
- <include file="../../..//ivy/ivy-configurations.xml"/>
- </configurations>
-
- <publications>
- <!--get the artifact from our module name-->
- <artifact conf="master"/>
- </publications>
-
- <dependencies>
- </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/plugin.xml b/src/plugin/creativecommons/plugin.xml
deleted file mode 100755
index de9cf36..0000000
--- a/src/plugin/creativecommons/plugin.xml
+++ /dev/null
@@ -1,48 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
- id="creativecommons"
- name="Creative Commons Plugins"
- version="1.0.0"
- provider-name="nutch.org">
-
- <runtime>
- <library name="creativecommons.jar">
- <export name="*"/>
- </library>
- </runtime>
-
- <requires>
- <import plugin="nutch-extensionpoints"/>
- </requires>
-
- <extension id="org.creativecommons.nutch.CCParseFilter"
- name="Creative Commons Metadata Filter"
- point="org.apache.nutch.parse.HtmlParseFilter">
- <implementation id="CCParseFilter"
- class="org.creativecommons.nutch.CCParseFilter"/>
- </extension>
-
- <extension id="org.creativecommons.nutch.CCIndexingFilter"
- name="Creative Commons Indexing Filter"
- point="org.apache.nutch.indexer.IndexingFilter">
- <implementation id="CCIndexingFilter"
- class="org.creativecommons.nutch.CCIndexingFilter"/>
- </extension>
-
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
deleted file mode 100644
index e7c55c4..0000000
--- a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
+++ /dev/null
@@ -1,124 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.creativecommons.nutch;
-
-import org.apache.nutch.metadata.CreativeCommons;
-
-import org.apache.nutch.parse.Parse;
-
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.hadoop.io.Text;
-
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.metadata.Metadata;
-
-import org.apache.hadoop.conf.Configuration;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.*;
-import java.net.URL;
-import java.net.MalformedURLException;
-
-/** Adds Creative Commons license metadata (license URL, license location, work type) to the document field named by {@link #FIELD}. */
-public class CCIndexingFilter implements IndexingFilter {
- public static final Logger LOG = LoggerFactory
- .getLogger(CCIndexingFilter.class);
-
- /** The name of the document field we use. */
- public static String FIELD = "cc";
-
- private Configuration conf;
-
- public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
- CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-
- Metadata metadata = parse.getData().getParseMeta();
- // index the license
- String licenseUrl = metadata.get(CreativeCommons.LICENSE_URL);
- if (licenseUrl != null) {
- if (LOG.isInfoEnabled()) {
- LOG.info("CC: indexing " + licenseUrl + " for: " + url.toString());
- }
-
- // add the entire license as cc:license=xxx
- addFeature(doc, "license=" + licenseUrl);
-
- // index license attributes extracted from the license url
- addUrlFeatures(doc, licenseUrl);
- }
-
- // index the license location as cc:meta=xxx
- String licenseLocation = metadata.get(CreativeCommons.LICENSE_LOCATION);
- if (licenseLocation != null) {
- addFeature(doc, "meta=" + licenseLocation);
- }
-
- // index the work type cc:type=xxx
- String workType = metadata.get(CreativeCommons.WORK_TYPE);
- if (workType != null) {
- addFeature(doc, workType);
- }
-
- return doc;
- }
-
- /**
- * Add the features represented by a license URL. Urls are of the form
- * "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a
- * license feature.
- */
- public void addUrlFeatures(NutchDocument doc, String urlString) {
- try {
- URL url = new URL(urlString);
-
- // tokenize the path of the url, breaking at slashes and dashes
- StringTokenizer names = new StringTokenizer(url.getPath(), "/-");
-
- if (names.hasMoreTokens())
- names.nextToken(); // throw away "licenses"
-
- // add a feature per component after "licenses"
- while (names.hasMoreTokens()) {
- String feature = names.nextToken();
- addFeature(doc, feature);
- }
- } catch (MalformedURLException e) {
- if (LOG.isWarnEnabled()) {
- LOG.warn("CC: failed to parse url: " + urlString + " : " + e);
- }
- }
- }
-
- private void addFeature(NutchDocument doc, String feature) {
- doc.add(FIELD, feature);
- }
-
- public void setConf(Configuration conf) {
- this.conf = conf;
- }
-
- public Configuration getConf() {
- return this.conf;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
deleted file mode 100644
index 1fa951e..0000000
--- a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
+++ /dev/null
@@ -1,300 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.creativecommons.nutch;
-
-import org.apache.nutch.metadata.CreativeCommons;
-import org.apache.nutch.parse.*;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.hadoop.conf.Configuration;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.*;
-import java.io.*;
-import java.net.*;
-import javax.xml.parsers.*;
-import org.xml.sax.InputSource;
-import org.w3c.dom.*;
-
-/** Adds metadata identifying the Creative Commons license used, if any. */
-public class CCParseFilter implements HtmlParseFilter {
- public static final Logger LOG = LoggerFactory.getLogger(CCParseFilter.class);
-
- /** Walks DOM tree, looking for RDF in comments and licenses in anchors. */
- public static class Walker {
- private URL base; // base url of page
- private String rdfLicense; // subject url found, if any
- private URL relLicense; // license url found, if any
- private URL anchorLicense; // anchor url found, if any
- private String workType; // work type URI
-
-    /** Creates a walker that resolves relative hrefs against {@code base}. */
-    private Walker(URL base) {
-      this.base = base;
-    }
-
-    /**
-     * Scans the document and records what was found in {@code metadata}: the
-     * license URL, where it was found ("rdf", "rel" or "a") and the work type.
-     *
-     * @throws ParseException
-     *           if no license was found and the
-     *           "creativecommons.exclude.unlicensed" property is true
-     */
-    public static void walk(Node doc, URL base, Metadata metadata,
-        Configuration conf) throws ParseException {
-
-      // collect license candidates from the whole DOM tree
-      Walker scanner = new Walker(base);
-      scanner.walk(doc);
-
-      // choose by source priority: RDF subject, rel=license anchor, CC anchor
-      final String location;
-      final String license;
-      if (scanner.rdfLicense != null) {
-        location = "rdf";
-        license = scanner.rdfLicense;
-      } else if (scanner.relLicense != null) {
-        location = "rel";
-        license = scanner.relLicense.toString();
-      } else if (scanner.anchorLicense != null) {
-        location = "a";
-        license = scanner.anchorLicense.toString();
-      } else {
-        if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) {
-          throw new ParseException("No CC license. Excluding.");
-        }
-        location = null;
-        license = null;
-      }
-
-      // record the license, if any
-      if (license != null) {
-        if (LOG.isInfoEnabled()) {
-          LOG.info("CC: found " + license + " in " + location + " of " + base);
-        }
-        metadata.add(CreativeCommons.LICENSE_URL, license);
-        metadata.add(CreativeCommons.LICENSE_LOCATION, location);
-      }
-
-      // record the work type, if any
-      String foundType = scanner.workType;
-      if (foundType != null) {
-        if (LOG.isInfoEnabled()) {
-          LOG.info("CC: found " + foundType + " in " + base);
-        }
-        metadata.add(CreativeCommons.WORK_TYPE, foundType);
-      }
-    }
-
-    /**
-     * Visits {@code node} and, recursively, all of its descendants: elements
-     * are checked for license anchors, comments for embedded license RDF.
-     */
-    private void walk(Node node) {
-
-      // license URL candidates live in elements
-      if (node instanceof Element) {
-        findLicenseUrl((Element) node);
-      }
-
-      // license RDF is embedded in comments
-      if (node instanceof Comment) {
-        Comment comment = (Comment) node;
-        findRdf(comment.getData());
-      }
-
-      // descend into the children, if there are any
-      NodeList kids = node.getChildNodes();
-      if (kids == null) {
-        return;
-      }
-      for (int idx = 0; idx < kids.getLength(); idx++) {
-        walk(kids.item(idx));
-      }
-    }
-
-    /**
-     * Extract license url from element, if any. These are the href attribute
-     * of anchor elements that point to http://creativecommons.org/licenses/.
-     * The first such anchor with rel="license" is remembered as the rel
-     * license; the first other matching anchor is remembered as the anchor
-     * license. Malformed hrefs are ignored.
-     */
-    private void findLicenseUrl(Element element) {
-      // only look in Anchor elements
-      if (!"a".equalsIgnoreCase(element.getTagName()))
-        return;
-
-      // require an href. Element.getAttribute returns "" (not null) when the
-      // attribute is absent, and an empty href would resolve to the page's
-      // own base URL, so blank values are rejected as well.
-      String href = element.getAttribute("href");
-      if (href == null || href.trim().isEmpty())
-        return;
-
-      try {
-        URL url = new URL(base, href); // resolve the url
-
-        // check that it's a CC license URL
-        if ("http".equalsIgnoreCase(url.getProtocol())
-            && "creativecommons.org".equalsIgnoreCase(url.getHost())
-            && url.getPath() != null && url.getPath().startsWith("/licenses/")
-            && url.getPath().length() > "/licenses/".length()) {
-
-          // check rel="license"
-          String rel = element.getAttribute("rel");
-          if (rel != null && "license".equals(rel) && this.relLicense == null) {
-            this.relLicense = url; // found rel license
-          } else if (this.anchorLicense == null) {
-            this.anchorLicense = url; // found anchor license
-          }
-        }
-      } catch (MalformedURLException e) { // ignore malformed urls
-      }
-    }
-
-    /**
-     * Namespace-aware XML parser factory used for RDF found in page comments.
-     * The input comes from crawled pages, so secure processing is requested
-     * and DOCTYPE declarations are rejected to rule out external-entity and
-     * entity-expansion attacks. A document that is rejected makes
-     * {@code parse} throw, which the caller already handles as a parse
-     * failure.
-     */
-    private static final DocumentBuilderFactory FACTORY = DocumentBuilderFactory
-        .newInstance();
-    static {
-      FACTORY.setNamespaceAware(true);
-
-      String[] hardening = {
-          javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING,
-          "http://apache.org/xml/features/disallow-doctype-decl" };
-      for (String feature : hardening) {
-        try {
-          FACTORY.setFeature(feature, true);
-        } catch (ParserConfigurationException e) {
-          // best effort: not every parser implementation knows every feature
-          if (LOG.isWarnEnabled()) {
-            LOG.warn("CC: XML parser does not support " + feature + ": " + e);
-          }
-        }
-      }
-    }
-
- /** Creative Commons' namespace URI. */
- private static final String CC_NS = "http://web.resource.org/cc/";
-
- /** Dublin Core namespace URI. */
- private static final String DC_NS = "http://purl.org/dc/elements/1.1/";
-
- /** RDF syntax namespace URI. */
- private static final String RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
-
-    /**
-     * Looks for Creative Commons RDF inside the text of an HTML comment. When
-     * the comment parses as XML with exactly one rdf:RDF element, the
-     * rdf:about of its cc:License elements becomes the RDF license and the
-     * dc:type of its cc:Work elements becomes the work type (mapped through
-     * {@code WORK_TYPE_NAMES}). If several candidates are present the last
-     * usable one wins. Comments that are not RDF, or do not parse, leave the
-     * walker unchanged.
-     *
-     * @param comment
-     *          text content of the comment node
-     */
-    private void findRdf(String comment) {
-      // cheap textual pre-check before paying for an XML parse
-      if (comment.indexOf("RDF") < 0 || comment.indexOf(CC_NS) < 0)
-        return; // no RDF, abort
-
-      // try to parse the XML
-      Document doc;
-      try {
-        DocumentBuilder parser = FACTORY.newDocumentBuilder();
-        doc = parser.parse(new InputSource(new StringReader(comment)));
-      } catch (Exception e) {
-        if (LOG.isWarnEnabled()) {
-          LOG.warn("CC: Failed to parse RDF in " + base + ": " + e);
-        }
-        return;
-      }
-
-      // check that root is rdf:RDF
-      NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF");
-      if (roots.getLength() != 1) {
-        if (LOG.isWarnEnabled()) {
-          LOG.warn("CC: No RDF root in " + base);
-        }
-        return;
-      }
-      Element rdf = (Element) roots.item(0);
-
-      // license is the rdf:about attribute of cc:License nodes inside rdf:RDF
-      NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License");
-      for (int i = 0; i < licenses.getLength(); i++) {
-        Element license = (Element) licenses.item(i);
-
-        // getAttributeNS yields "" for a missing attribute, where
-        // getAttributeNodeNS(...).getValue() would throw a
-        // NullPointerException
-        String about = license.getAttributeNS(RDF_NS, "about");
-        if (about != null && !about.isEmpty()) {
-          this.rdfLicense = about;
-        }
-      }
-
-      // work type is the rdf:resource of dc:type nodes inside each cc:Work
-      NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work");
-      for (int i = 0; i < works.getLength(); i++) {
-        Element work = (Element) works.item(i);
-        NodeList types = work.getElementsByTagNameNS(DC_NS, "type");
-
-        for (int j = 0; j < types.getLength(); j++) {
-          Element type = (Element) types.item(j);
-          String workUri = type.getAttributeNS(RDF_NS, "resource");
-
-          // an unknown or missing type must not erase one already found
-          String workName = WORK_TYPE_NAMES.get(workUri);
-          if (workName != null) {
-            this.workType = workName;
-          }
-        }
-      }
-    }
- }
-
- private static final HashMap<String, String> WORK_TYPE_NAMES = new HashMap<String, String>();
- static {
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video");
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image");
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio");
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive",
- "interactive");
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software");
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
- }
-
- private Configuration conf;
-
-  /**
-   * Adds Creative Commons metadata to the parse of an HTML document, given
-   * the DOM tree of the page. If the content's base URL is malformed, or the
-   * DOM walk raises a {@link ParseException}, the entry for the content URL
-   * is replaced by the empty parse of the corresponding failure status.
-   *
-   * @return the same {@code parseResult} instance that was passed in
-   */
-  public ParseResult filter(Content content, ParseResult parseResult,
-      HTMLMetaTags metaTags, DocumentFragment doc) {
-
-    // parse registered for this content's url
-    Parse parse = parseResult.get(content.getUrl());
-
-    // base url used to resolve relative links
-    final URL base;
-    try {
-      base = new URL(content.getBaseUrl());
-    } catch (MalformedURLException e) {
-      return replaceWithEmptyParse(content, parseResult, e);
-    }
-
-    // extract license metadata into the parse metadata
-    try {
-      Walker.walk(doc, base, parse.getData().getParseMeta(), getConf());
-    } catch (ParseException e) {
-      return replaceWithEmptyParse(content, parseResult, e);
-    }
-
-    return parseResult;
-  }
-
-  /** Replaces the entry for the content url by the empty parse of a failure. */
-  private ParseResult replaceWithEmptyParse(Content content,
-      ParseResult parseResult, Throwable failure) {
-    Parse emptyParse = new ParseStatus(failure).getEmptyParse(getConf());
-    parseResult.put(content.getUrl(), new ParseText(emptyParse.getText()),
-        emptyParse.getData());
-    return parseResult;
-  }
-
-  /** Stores the configuration handed to this filter. */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-  /** @return the configuration previously passed to {@code setConf} */
-  public Configuration getConf() {
-    return conf;
-  }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/src/java/org/creativecommons/nutch/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/package.html b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/package.html
deleted file mode 100644
index 0c91293..0000000
--- a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/package.html
+++ /dev/null
@@ -1,5 +0,0 @@
-<html>
-<body>
-<p>Sample plugins that parse and index Creative Commons metadata.</p>
-</body>
-</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java b/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
deleted file mode 100755
index 41be9ed..0000000
--- a/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.creativecommons.nutch;
-
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.protocol.Content;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.junit.Assert;
-import org.junit.Test;
-
-import java.io.*;
-
-/**
- * Parses sample pages from the directory named by the "test.input" system
- * property and checks the "License-Url", "License-Location" and "Work-Type"
- * parse metadata of each.
- */
-public class TestCCParseFilter {
-
-  private static final File testDir = new File(System.getProperty("test.input"));
-
-  @Test
-  public void testPages() throws Exception {
-    pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
-        "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
-    // Tika returns <a> whereas parse-html returns <rel>
-    // check later
-    pageTest(new File(testDir, "rel.html"), "http://foo.com/",
-        "http://creativecommons.org/licenses/by-nc/2.0", "rel", null);
-    // Tika returns <a> whereas parse-html returns <rdf>
-    // check later
-    pageTest(new File(testDir, "rdf.html"), "http://foo.com/",
-        "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text");
-  }
-
-  /**
-   * Parses {@code file} as text/html content fetched from {@code url} and
-   * asserts the resulting license metadata.
-   *
-   * @param file
-   *          page to parse
-   * @param url
-   *          url and base url given to the content
-   * @param license
-   *          expected "License-Url" value, or null
-   * @param location
-   *          expected "License-Location" value, or null
-   * @param type
-   *          expected "Work-Type" value, or null
-   */
-  public void pageTest(File file, String url, String license, String location,
-      String type) throws Exception {
-
-    String contentType = "text/html";
-    byte[] bytes = readFully(file);
-    Configuration conf = NutchConfiguration.create();
-
-    Content content = new Content(url, url, bytes, contentType, new Metadata(),
-        conf);
-    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
-
-    Metadata metadata = parse.getData().getParseMeta();
-    Assert.assertEquals(license, metadata.get("License-Url"));
-    Assert.assertEquals(location, metadata.get("License-Location"));
-    Assert.assertEquals(type, metadata.get("Work-Type"));
-  }
-
-  /** Reads a whole file into memory; the stream is closed even on failure. */
-  private static byte[] readFully(File file) throws IOException {
-    InputStream in = new FileInputStream(file);
-    try {
-      ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
-      byte[] buffer = new byte[1024];
-      int count;
-      while ((count = in.read(buffer)) != -1) {
-        out.write(buffer, 0, count);
-      }
-      return out.toByteArray();
-    } finally {
-      in.close();
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/feed/build.xml b/src/plugin/feed/build.xml
deleted file mode 100644
index 7fe7050..0000000
--- a/src/plugin/feed/build.xml
+++ /dev/null
@@ -1,45 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<project name="feed" default="jar-core">
-
- <import file="../build-plugin.xml" />
-
- <!-- Build compilation dependencies: runs the "jar" target of the sibling lib-xml plugin -->
- <target name="deps-jar">
- <ant target="jar" inheritall="false" dir="../lib-xml"/>
- </target>
-
- <!-- Add compilation dependencies to classpath: every jar in a lib-xml directory below ${nutch.root}/build -->
- <path id="plugin.deps">
- <fileset dir="${nutch.root}/build">
- <include name="**/lib-xml/*.jar" />
- </fileset>
- </path>
-
- <!-- Deploy Unit test dependencies: runs the "deploy" target of nutch-extensionpoints and protocol-file -->
- <target name="deps-test">
- <ant target="deploy" inheritall="false"
- dir="../nutch-extensionpoints" />
- <ant target="deploy" inheritall="false" dir="../protocol-file" />
- </target>
-
- <!-- for junit test: copies sample/rsstest.rss into ${build.test}/data; these two tasks sit outside any target -->
- <mkdir dir="${build.test}/data" />
- <copy file="sample/rsstest.rss" todir="${build.test}/data" />
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/feed/ivy.xml b/src/plugin/feed/ivy.xml
deleted file mode 100644
index c29bd03..0000000
--- a/src/plugin/feed/ivy.xml
+++ /dev/null
@@ -1,43 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <!-- Module identity; the module name is taken from the Ant project name. -->
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <!-- shared configurations from the ivy directory three levels up -->
-    <include file="../../../ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <!-- third-party libraries of the feed plugin -->
-  <dependencies>
-    <dependency org="rome" name="rome" rev="0.9" conf="*->master"/>
-    <dependency org="org.jdom" name="jdom" rev="1.1" conf="*->master"/>
-  </dependencies>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/feed/plugin.xml b/src/plugin/feed/plugin.xml
deleted file mode 100644
index 3a68d8d..0000000
--- a/src/plugin/feed/plugin.xml
+++ /dev/null
@@ -1,49 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin id="feed" name="Feed Parse/Index/Query Plug-in" version="1.0.0"
- provider-name="nutch.org">
- <runtime> <!-- jars of this plugin: its own feed.jar plus the rome and jdom libraries -->
- <library name="feed.jar">
- <export name="*" />
- </library>
- <library name="rome-0.9.jar" /> <!-- same version as the rome revision in this plugin's ivy.xml -->
- <library name="jdom-1.1.jar" /> <!-- same version as the jdom revision in this plugin's ivy.xml -->
- </runtime>
-
- <requires> <!-- plugins imported by this one -->
- <import plugin="nutch-extensionpoints" />
- <import plugin="lib-xml" />
- </requires>
-
- <extension id="org.apache.nutch.parse.feed" name="Feed Parser"
- point="org.apache.nutch.parse.Parser"> <!-- registers FeedParser at the Parser extension point -->
-
- <implementation id="org.apache.nutch.parse.feed.FeedParser"
- class="org.apache.nutch.parse.feed.FeedParser"> <!-- parameters below: three content types and one path suffix -->
- <parameter name="contentType" value="application/rss+xml" />
- <parameter name="contentType" value="application/atom+xml" />
- <parameter name="contentType" value="text/xml" />
- <parameter name="pathSuffix" value="rss" />
- </implementation>
- </extension>
- <extension id="org.apache.nutch.indexer.feed" name="Feed Indexer"
- point="org.apache.nutch.indexer.IndexingFilter"> <!-- registers FeedIndexingFilter at the IndexingFilter extension point -->
- <implementation id="FeedIndexingFilter"
- class="org.apache.nutch.indexer.feed.FeedIndexingFilter" />
- </extension>
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/sample/rsstest.rss
----------------------------------------------------------------------
diff --git a/src/plugin/feed/sample/rsstest.rss b/src/plugin/feed/sample/rsstest.rss
deleted file mode 100644
index 758f6a1..0000000
--- a/src/plugin/feed/sample/rsstest.rss
+++ /dev/null
@@ -1,36 +0,0 @@
-<?xml version="1.0" encoding="ISO-8859-1" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<rss version="0.91">
- <channel>
- <title>TestChannel</title>
- <link>http://test.channel.com/</link>
- <description>Sample RSS File for Junit test</description>
- <language>en-us</language>
-
- <item>
- <title>Home Page of Chris Mattmann</title>
- <link>http://www-scf.usc.edu/~mattmann/</link>
- <description>Chris Mattmann's home page</description>
- </item>
- <item>
- <title>Awesome Open Source Search Engine</title>
- <link>http://www.nutch.org/</link>
- <description>Yup, that's what it is</description>
- </item>
- </channel>
-</rss>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java b/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
deleted file mode 100644
index 94b440a..0000000
--- a/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
+++ /dev/null
@@ -1,129 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.indexer.feed;
-
-//JDK imports
-import java.util.Date;
-
-//APACHE imports
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.IndexingFilter;
-import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.metadata.Feed;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-
-/**
- * An {@link IndexingFilter} implementation to pull out the relevant extracted
- * {@link Metadata} fields from the RSS feeds and into the index.
- *
- * @author dogacan
- * @author mattmann
- * @since NUTCH-444
- */
-public class FeedIndexingFilter implements IndexingFilter {
-
-  /** Date pattern constant; not referenced within this class. */
-  public static final String dateFormatStr = "yyyyMMddHHmm";
-
-  private Configuration conf;
-
-  /** Document field receiving the feed's published date. */
-  private final static String PUBLISHED_DATE = "publishedDate";
-
-  /** Document field receiving the feed's updated date. */
-  private final static String UPDATED_DATE = "updatedDate";
-
-  /**
-   * Copies the feed fields of the parse metadata into the document:
-   *
-   * <ul>
-   * <li>FEED_AUTHOR (every value)</li>
-   * <li>FEED_TAGS (every value)</li>
-   * <li>FEED</li>
-   * <li>FEED_PUBLISHED, as a {@link Date} in the "publishedDate" field</li>
-   * <li>FEED_UPDATED, as a {@link Date} in the "updatedDate" field</li>
-   * </ul>
-   *
-   * Fields missing from the metadata are skipped.
-   *
-   * @return the same document that was passed in
-   * @throws IndexingException
-   *           if a published or updated value is not a number of
-   *           milliseconds
-   */
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
-      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
-    ParseData parseData = parse.getData();
-    Metadata parseMeta = parseData.getParseMeta();
-
-    addAll(doc, Feed.FEED_AUTHOR, parseMeta.getValues(Feed.FEED_AUTHOR));
-    addAll(doc, Feed.FEED_TAGS, parseMeta.getValues(Feed.FEED_TAGS));
-
-    String feed = parseMeta.get(Feed.FEED);
-    if (feed != null) {
-      doc.add(Feed.FEED, feed);
-    }
-
-    String published = parseMeta.get(Feed.FEED_PUBLISHED);
-    if (published != null) {
-      doc.add(PUBLISHED_DATE, toDate(Feed.FEED_PUBLISHED, published));
-    }
-
-    String updated = parseMeta.get(Feed.FEED_UPDATED);
-    if (updated != null) {
-      doc.add(UPDATED_DATE, toDate(Feed.FEED_UPDATED, updated));
-    }
-
-    return doc;
-  }
-
-  /** Adds every value to the named document field; null means no values. */
-  private static void addAll(NutchDocument doc, String field, String[] values) {
-    if (values == null) {
-      return;
-    }
-    for (String value : values) {
-      doc.add(field, value);
-    }
-  }
-
-  /**
-   * Converts a millisecond timestamp string to a date. A malformed value is
-   * reported through the declared checked exception rather than escaping as
-   * an unchecked NumberFormatException.
-   */
-  private static Date toDate(String field, String millis)
-      throws IndexingException {
-    try {
-      return new Date(Long.parseLong(millis));
-    } catch (NumberFormatException e) {
-      throw new IndexingException("Feed metadata " + field
-          + " is not a millisecond timestamp: " + millis, e);
-    }
-  }
-
-  /**
-   * @return the {@link Configuration} object used to configure this
-   *         {@link IndexingFilter}.
-   */
-  public Configuration getConf() {
-    return conf;
-  }
-
-  /**
-   * Sets the {@link Configuration} object used to configure this
-   * {@link IndexingFilter}.
-   *
-   * @param conf
-   *          The {@link Configuration} object used to configure this
-   *          {@link IndexingFilter}.
-   */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java b/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java
deleted file mode 100644
index 8f52628..0000000
--- a/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Indexing filter to index meta data from RSS feeds.
- */
-package org.apache.nutch.indexer.feed;
-
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java b/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
deleted file mode 100644
index 936c885..0000000
--- a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
+++ /dev/null
@@ -1,374 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.parse.feed;
-
-// JDK imports
-import java.io.ByteArrayInputStream;
-import java.io.DataInputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.util.Date;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map.Entry;
-
-// APACHE imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.util.StringUtils;
-// import org.apache.nutch.indexer.anchor.AnchorIndexingFilter; removed as per NUTCH-1078
-import org.apache.nutch.metadata.Feed;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.URLFilters;
-import org.apache.nutch.net.URLNormalizers;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.parse.Outlink;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.parse.ParseStatus;
-import org.apache.nutch.parse.ParseText;
-import org.apache.nutch.parse.Parser;
-import org.apache.nutch.parse.ParserFactory;
-import org.apache.nutch.parse.ParserNotFound;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.EncodingDetector;
-import org.apache.nutch.util.NutchConfiguration;
-import org.xml.sax.InputSource;
-
-// ROME imports
-import com.sun.syndication.feed.synd.SyndCategory;
-import com.sun.syndication.feed.synd.SyndContent;
-import com.sun.syndication.feed.synd.SyndEntry;
-import com.sun.syndication.feed.synd.SyndFeed;
-import com.sun.syndication.feed.synd.SyndPerson;
-import com.sun.syndication.io.SyndFeedInput;
-
-/**
- * An RSS/Atom feed {@link Parser} built on the ROME feed parsing library.
- *
- * <p>
- * The feed document itself, plus every entry whose link survives URL
- * normalization and filtering, is added to the returned {@link ParseResult}.
- * The text of each entry is handed to the parser that the
- * {@link ParserFactory} selects for the entry's declared content type.
- * </p>
- *
- * @author dogacan
- * @author mattmann
- * @since NUTCH-444
- */
-public class FeedParser implements Parser {
-
-  /** Charset parameter appended to every content type this parser assigns. */
-  public static final String CHARSET_UTF8 = "charset=UTF-8";
-
-  /** Content type assigned to entries that declare no usable type. */
-  public static final String TEXT_PLAIN_CONTENT_TYPE = "text/plain; "
-      + CHARSET_UTF8;
-
-  public static final Logger LOG = LoggerFactory.getLogger(FeedParser.class);
-
-  /**
-   * Charset used to encode entry text before it is re-parsed. It has to agree
-   * with {@link #CHARSET_UTF8}, which is the charset the entry content type
-   * declares.
-   */
-  private static final java.nio.charset.Charset ENTRY_CHARSET =
-      java.nio.charset.Charset.forName("UTF-8");
-
-  private Configuration conf;
-
-  private ParserFactory parserFactory;
-
-  private URLNormalizers normalizers;
-
-  private URLFilters filters;
-
-  private String defaultEncoding;
-
-  /**
-   * Parses the given feed and all items linked from it, using the underlying
-   * ROME feed parsing library.
-   *
-   * @param content
-   *          A {@link Content} object representing the feed that is being
-   *          parsed by this {@link Parser}.
-   *
-   * @return A {@link ParseResult} containing one {@link Parse} for the feed
-   *         itself and one for every accepted entry, or an empty parse
-   *         result carrying the failure status if the feed cannot be read.
-   */
-  public ParseResult getParse(Content content) {
-    SyndFeed feed = null;
-    ParseResult parseResult = new ParseResult(content.getUrl());
-
-    EncodingDetector detector = new EncodingDetector(conf);
-    detector.autoDetectClues(content, true);
-    String encoding = detector.guessEncoding(content, defaultEncoding);
-    try {
-      InputSource input = new InputSource(new ByteArrayInputStream(
-          content.getContent()));
-      input.setEncoding(encoding);
-      SyndFeedInput feedInput = new SyndFeedInput();
-      feed = feedInput.build(input);
-    } catch (Exception e) {
-      // return empty parse
-      LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: "
-          + StringUtils.stringifyException(e));
-      return new ParseStatus(e)
-          .getEmptyParseResult(content.getUrl(), getConf());
-    }
-
-    // The feed link is optional metadata: if it cannot be normalized or is
-    // filtered out, entries are still parsed, just without the "feed" field.
-    String feedLink = feed.getLink();
-    try {
-      feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
-      if (feedLink != null)
-        feedLink = filters.filter(feedLink);
-    } catch (Exception e) {
-      feedLink = null;
-    }
-
-    List<?> entries = feed.getEntries();
-    for (Object entry : entries) {
-      addToMap(parseResult, feed, feedLink, (SyndEntry) entry, content);
-    }
-
-    String feedDesc = stripTags(feed.getDescriptionEx());
-    String feedTitle = stripTags(feed.getTitleEx());
-
-    parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(
-        new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0],
-        content.getMetadata()));
-
-    return parseResult;
-  }
-
-  /**
-   * Sets the {@link Configuration} object for this {@link Parser} and builds
-   * the helpers that depend on it. The configuration is used to set up:
-   *
-   * <ul>
-   * <li>the {@link ParserFactory} used to parse entry text,</li>
-   * <li>the {@link URLNormalizers} for the outlink scope,</li>
-   * <li>the {@link URLFilters},</li>
-   * <li>the fallback encoding, read from
-   * <code>parser.character.encoding.default</code> (<code>windows-1252</code>
-   * if unset).</li>
-   * </ul>
-   *
-   * @param conf
-   *          The Hadoop {@link Configuration} object to use to configure this
-   *          {@link Parser}.
-   */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-    this.parserFactory = new ParserFactory(conf);
-    this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
-    this.filters = new URLFilters(conf);
-    this.defaultEncoding = conf.get("parser.character.encoding.default",
-        "windows-1252");
-  }
-
-  /**
-   * @return The {@link Configuration} object used to configure this
-   *         {@link Parser}.
-   */
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  /**
-   * Runs a command line version of this {@link Parser}.
-   *
-   * @param args
-   *          A single argument (expected at args[0]) representing a path on
-   *          the local filesystem that points to a feed file.
-   *
-   * @throws Exception
-   *           If any error occurs.
-   */
-  public static void main(String[] args) throws Exception {
-    if (args.length != 1) {
-      System.err.println("Usage: FeedParser <feed>");
-      System.exit(1);
-    }
-    String name = args[0];
-    String url = "file:" + name;
-    Configuration conf = NutchConfiguration.create();
-    FeedParser parser = new FeedParser();
-    parser.setConf(conf);
-    File file = new File(name);
-    byte[] bytes = new byte[(int) file.length()];
-    DataInputStream in = new DataInputStream(new FileInputStream(file));
-    try {
-      in.readFully(bytes);
-    } finally {
-      // release the file handle even if the read fails
-      in.close();
-    }
-    ParseResult parseResult = parser.getParse(new Content(url, url, bytes,
-        "application/rss+xml", new Metadata(), conf));
-    for (Entry<Text, Parse> entry : parseResult) {
-      System.out.println("key: " + entry.getKey());
-      Parse parse = entry.getValue();
-      System.out.println("data: " + parse.getData());
-      System.out.println("text: " + parse.getText() + "\n");
-    }
-  }
-
-  /**
-   * Parses a single feed entry and adds the outcome to the parse result under
-   * the entry's normalized link. Entries whose link cannot be normalized, or
-   * is rejected by the URL filters, are skipped.
-   *
-   * @param parseResult
-   *          The result the entry is added to.
-   * @param feed
-   *          The feed the entry belongs to.
-   * @param feedLink
-   *          The normalized feed link, or null if there is none; stored in the
-   *          entry's parse metadata under "feed" when present.
-   * @param entry
-   *          The entry to parse.
-   * @param content
-   *          The content of the feed; its metadata object is reused (and
-   *          modified) as the content metadata of the entry.
-   */
-  private void addToMap(ParseResult parseResult, SyndFeed feed,
-      String feedLink, SyndEntry entry, Content content) {
-    String link = entry.getLink(), text = null, title = null;
-    Metadata parseMeta = new Metadata(), contentMeta = content.getMetadata();
-    Parse parse = null;
-    SyndContent description = entry.getDescription();
-
-    try {
-      link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK);
-
-      if (link != null)
-        link = filters.filter(link);
-    } catch (Exception e) {
-      // report through the class logger, like the failure path of getParse
-      LOG.warn("Skipping feed entry, cannot normalize/filter link: "
-          + entry.getLink() + ", exception: "
-          + StringUtils.stringifyException(e));
-      return;
-    }
-
-    if (link == null)
-      return;
-
-    title = stripTags(entry.getTitleEx());
-
-    if (feedLink != null)
-      parseMeta.set("feed", feedLink);
-
-    addFields(parseMeta, contentMeta, feed, entry);
-
-    // some item descriptions contain markup text in them,
-    // so we temporarily set their content-type to parse them
-    // with another plugin
-    String contentType = contentMeta.get(Response.CONTENT_TYPE);
-
-    if (description != null)
-      text = description.getValue();
-
-    if (text == null) {
-      List<?> contents = entry.getContents();
-      StringBuilder buf = new StringBuilder();
-      if (contents != null) {
-        for (Object syndContent : contents) {
-          String value = ((SyndContent) syndContent).getValue();
-          // appending a null String would insert the literal text "null"
-          if (value != null)
-            buf.append(value);
-        }
-      }
-      text = buf.toString();
-    }
-
-    try {
-      Parser parser = parserFactory.getParsers(contentType, link)[0];
-      // The content type set by addFields always declares UTF-8, so the
-      // bytes must be UTF-8 too rather than in the platform default charset.
-      parse = parser.getParse(
-          new Content(link, link, text.getBytes(ENTRY_CHARSET), contentType,
-              contentMeta, conf)).get(link);
-    } catch (ParserNotFound e) {
-      // no parser for this content type: the raw text is stored below
-    }
-
-    if (parse != null) {
-      ParseData data = parse.getData();
-      data.getContentMeta().remove(Response.CONTENT_TYPE);
-      mergeMetadata(data.getParseMeta(), parseMeta);
-      parseResult.put(link, new ParseText(parse.getText()),
-          new ParseData(ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(),
-              data.getContentMeta(), data.getParseMeta()));
-    } else {
-      contentMeta.remove(Response.CONTENT_TYPE);
-      parseResult.put(link, new ParseText(text), new ParseData(
-          ParseStatus.STATUS_FAILURE, title, new Outlink[0], contentMeta,
-          parseMeta));
-    }
-
-  }
-
-  /**
-   * Removes everything that looks like a markup tag from the value of the
-   * given content and trims the result.
-   *
-   * @param c
-   *          The content to clean, may be null.
-   * @return The text without tags; the empty string if the content or its
-   *         value is null.
-   */
-  private static String stripTags(SyndContent c) {
-    if (c == null)
-      return "";
-
-    String value = c.getValue();
-    if (value == null)
-      return "";
-
-    return value.replaceAll("<[^>]*>", "").trim();
-  }
-
-  /**
-   * Copies author, category and date information of an entry into the parse
-   * metadata, and stores the content type of the entry text in the content
-   * metadata.
-   *
-   * @param parseMeta
-   *          Receives the {@link Feed} fields.
-   * @param contentMeta
-   *          Receives the {@link Response#CONTENT_TYPE} of the entry text.
-   * @param feed
-   *          The feed the entry belongs to (not read here).
-   * @param entry
-   *          The entry to read the fields from.
-   */
-  private void addFields(Metadata parseMeta, Metadata contentMeta,
-      SyndFeed feed, SyndEntry entry) {
-    List<?> authors = entry.getAuthors(), categories = entry.getCategories();
-    Date published = entry.getPublishedDate(), updated = entry.getUpdatedDate();
-    String contentType = null;
-
-    if (authors != null && !authors.isEmpty()) {
-      for (Object o : authors) {
-        SyndPerson author = (SyndPerson) o;
-        String authorName = author.getName();
-        if (checkString(authorName)) {
-          parseMeta.add(Feed.FEED_AUTHOR, authorName);
-        }
-      }
-    } else {
-      // getAuthors may return null if feed is non-atom
-      // if so, call getAuthor to get Dublin Core module creator.
-      // An empty author list is treated the same way as a missing one.
-      String authorName = entry.getAuthor();
-      if (checkString(authorName)) {
-        parseMeta.set(Feed.FEED_AUTHOR, authorName);
-      }
-    }
-
-    if (categories != null) {
-      for (Object i : categories) {
-        parseMeta.add(Feed.FEED_TAGS, ((SyndCategory) i).getName());
-      }
-    }
-
-    if (published != null) {
-      parseMeta.set(Feed.FEED_PUBLISHED, Long.toString(published.getTime()));
-    }
-    if (updated != null) {
-      parseMeta.set(Feed.FEED_UPDATED, Long.toString(updated.getTime()));
-    }
-
-    SyndContent description = entry.getDescription();
-    if (description != null) {
-      contentType = description.getType();
-    } else {
-      // TODO: What to do if contents.size() > 1?
-      List<?> contents = entry.getContents();
-      if (contents != null && !contents.isEmpty()) {
-        contentType = ((SyndContent) contents.get(0)).getType();
-      }
-    }
-
-    if (checkString(contentType)) {
-      // ROME may return content-type as html
-      if (contentType.equals("html"))
-        contentType = "text/html";
-      else if (contentType.equals("xhtml"))
-        contentType = "text/xhtml";
-      contentMeta.set(Response.CONTENT_TYPE, contentType + "; " + CHARSET_UTF8);
-    } else {
-      contentMeta.set(Response.CONTENT_TYPE, TEXT_PLAIN_CONTENT_TYPE);
-    }
-
-  }
-
-  /**
-   * Adds every value of every name in <code>second</code> to
-   * <code>first</code>; values already present in <code>first</code> are kept.
-   */
-  private void mergeMetadata(Metadata first, Metadata second) {
-    for (String name : second.names()) {
-      for (String value : second.getValues(name)) {
-        first.add(name, value);
-      }
-    }
-  }
-
-  /**
-   * @return true if the string is neither null nor empty.
-   */
-  private boolean checkString(String s) {
-    return s != null && !s.isEmpty();
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java b/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java
deleted file mode 100644
index 3b15968..0000000
--- a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Parse RSS feeds.
- */
-package org.apache.nutch.parse.feed;
-
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java b/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
deleted file mode 100644
index 36c8739..0000000
--- a/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
+++ /dev/null
@@ -1,124 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.feed;
-
-// JDK imports
-import java.util.Iterator;
-import java.util.Map;
-
-import org.junit.Assert;
-import org.junit.Test;
-// APACHE imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.ProtocolNotFound;
-import org.apache.nutch.util.NutchConfiguration;
-
-/**
- * Test suite for the {@link FeedParser}.
- *
- * @author mattmann
- */
-public class TestFeedParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-
-  // Directory holding the sample feeds. The "test.data" system property is
-  // defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
-
-  // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/feed/build.xml during plugin compilation.
-  private String[] sampleFiles = { "rsstest.rss" };
-
-  public static final Logger LOG = LoggerFactory.getLogger(TestFeedParser.class
-      .getName());
-
-  /**
-   * Runs the {@link FeedParser} over each sample RSS file and verifies that
-   * the {@link ParseResult} holds exactly 3 entries, among them these 2 links
-   * and the URL of the feed file itself:
-   * <ul>
-   * <li>http://www-scf.usc.edu/~mattmann/</li>
-   * <li>http://www.nutch.org</li>
-   * </ul>
-   *
-   * @throws ProtocolNotFound
-   *           If the {@link Protocol} layer cannot be loaded (required to
-   *           fetch the {@link Content} for the RSS file).
-   * @throws ParseException
-   *           If the parser layer cannot be loaded.
-   */
-  @Test
-  public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
-    Configuration conf = NutchConfiguration.create();
-
-    for (String sampleFile : sampleFiles) {
-      String urlString = ("file:" + sampleDir + fileSeparator + sampleFile)
-          .replace('\\', '/');
-
-      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      Content content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-
-      ParseResult parseResult = new ParseUtil(conf).parseByExtensionId("feed",
-          content);
-
-      Assert.assertEquals(3, parseResult.size());
-
-      boolean sawAuthorPage = false;
-      boolean sawNutchSite = false;
-      boolean sawFeedItself = false;
-
-      for (Map.Entry<Text, Parse> entry : parseResult) {
-        String key = entry.getKey().toString();
-
-        if (key.equals("http://www-scf.usc.edu/~mattmann/")) {
-          sawAuthorPage = true;
-        } else if (key.equals("http://www.nutch.org/")) {
-          sawNutchSite = true;
-        } else if (key.equals(urlString)) {
-          sawFeedItself = true;
-        }
-
-        Assert.assertNotNull(entry.getValue());
-        Assert.assertNotNull(entry.getValue().getData());
-      }
-
-      // all three expected keys must have been seen
-      Assert.assertTrue("Outlinks read from sample rss file are not correct!",
-          sawAuthorPage && sawNutchSite && sawFeedItself);
-    }
-
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/headings/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/headings/build.xml b/src/plugin/headings/build.xml
deleted file mode 100644
index d334ad1..0000000
--- a/src/plugin/headings/build.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="headings" default="jar-core">
- <!-- No targets are defined here; the build logic, including the default jar-core target, is expected to come from the imported ../build-plugin.xml. -->
- <import file="../build-plugin.xml"/>
-
-</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/headings/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/headings/ivy.xml b/src/plugin/headings/ivy.xml
deleted file mode 100644
index 5b8393b..0000000
--- a/src/plugin/headings/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
-<ivy-module version="1.0">
-
-  <!-- The module name is taken from the name of the Ant project being built. -->
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <!-- Configurations are included from the shared file under ${nutch.root}. -->
-  <configurations>
-    <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!-- get the artifact from our module name -->
-    <artifact conf="master"/>
-  </publications>
-
-  <!-- This plugin declares no dependencies of its own. -->
-  <dependencies/>
-
-</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/headings/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/headings/plugin.xml b/src/plugin/headings/plugin.xml
deleted file mode 100644
index 0d7921a..0000000
--- a/src/plugin/headings/plugin.xml
+++ /dev/null
@@ -1,45 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin id="headings"
-        name="Headings Parse Filter"
-        version="1.0.0"
-        provider-name="nutch.org">
-
-  <!-- Jar that carries the plugin classes; the export name is the wildcard "*". -->
-  <runtime>
-    <library name="headings.jar">
-      <export name="*"/>
-    </library>
-  </runtime>
-
-  <!-- Depends on the nutch-extensionpoints plugin (presumably where the
-       HtmlParseFilter extension point used below is declared). -->
-  <requires>
-    <import plugin="nutch-extensionpoints"/>
-  </requires>
-
-  <!-- Registers HeadingsParseFilter at the HtmlParseFilter extension point. -->
-  <extension id="org.apache.nutch.parse.headings"
-             name="Nutch Headings Parse Filter"
-             point="org.apache.nutch.parse.HtmlParseFilter">
-    <implementation id="HeadingsParseFilter"
-                    class="org.apache.nutch.parse.headings.HeadingsParseFilter"/>
-  </extension>
-
-</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
----------------------------------------------------------------------
diff --git a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java b/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
deleted file mode 100644
index 657f260..0000000
--- a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
+++ /dev/null
@@ -1,124 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.parse.headings;
-
-import java.util.ArrayList;
-import java.util.List;
-import java.util.regex.*;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.parse.HTMLMetaTags;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.HtmlParseFilter;
-import org.apache.nutch.parse.ParseResult;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.NodeWalker;
-import org.w3c.dom.*;
-
-/**
- * {@link HtmlParseFilter} that copies the text of configured heading elements
- * (for example h1 and h2) from the DOM into the parse metadata.
- *
- * <p>
- * The element names to look for are read from the <code>headings</code>
- * configuration property. <code>headings.multivalued</code> decides whether
- * only the first or every occurrence of each element is recorded.
- * </p>
- */
-public class HeadingsParseFilter implements HtmlParseFilter {
-
-  /**
-   * Pattern used to strip surplus whitespace
-   */
-  protected static Pattern whitespacePattern = Pattern.compile("\\s+");
-
-  private Configuration conf;
-  private String[] headings;
-  private boolean multiValued = false;
-
-  /**
-   * Adds the text of every configured heading element found in the document
-   * to the parse metadata of the page, using the element name as the key.
-   *
-   * @param content
-   *          The fetched content; its URL selects the {@link Parse} to
-   *          annotate.
-   * @param parseResult
-   *          The parse result to annotate.
-   * @param metaTags
-   *          The meta tags of the page (not used by this filter).
-   * @param doc
-   *          The DOM of the page.
-   * @return The given <code>parseResult</code>; it is returned unchanged if
-   *         no headings are configured or it holds no parse for the URL.
-   */
-  public ParseResult filter(Content content, ParseResult parseResult,
-      HTMLMetaTags metaTags, DocumentFragment doc) {
-    Parse parse = parseResult.get(content.getUrl());
-
-    if (headings == null || parse == null) {
-      // nothing configured, or nothing to attach the metadata to
-      return parseResult;
-    }
-
-    for (String element : headings) {
-      for (String heading : getElement(doc, element)) {
-        if (heading == null) {
-          continue;
-        }
-
-        // String.trim() returns a new string; the result has to be kept
-        String trimmed = heading.trim();
-
-        if (trimmed.length() > 0) {
-          parse.getData().getParseMeta().add(element, trimmed);
-        }
-      }
-    }
-
-    return parseResult;
-  }
-
-  /**
-   * Stores the configuration and reads the <code>headings</code> (element
-   * names) and <code>headings.multivalued</code> (default false) properties.
-   */
-  public void setConf(Configuration conf) {
-    this.conf = conf;
-
-    headings = conf.getStrings("headings");
-    multiValued = conf.getBoolean("headings.multivalued", false);
-  }
-
-  public Configuration getConf() {
-    return this.conf;
-  }
-
-  /**
-   * Finds the elements with the given name, compared case-insensitively, and
-   * returns their text values.
-   *
-   * @param doc
-   *          The DOM to search.
-   * @param element
-   *          The element name to look for.
-   * @return The values in the order the walker visits them; at most one value
-   *         unless multi-valued mode is enabled.
-   */
-  protected List<String> getElement(DocumentFragment doc, String element) {
-    List<String> found = new ArrayList<String>();
-    NodeWalker walker = new NodeWalker(doc);
-
-    while (walker.hasNext()) {
-      Node currentNode = walker.nextNode();
-
-      if (currentNode.getNodeType() == Node.ELEMENT_NODE
-          && element.equalsIgnoreCase(currentNode.getNodeName())) {
-        found.add(getNodeValue(currentNode));
-
-        // Check for multiValued here, if disabled we don't need
-        // to discover more headings.
-        if (!multiValued) {
-          break;
-        }
-      }
-    }
-
-    return found;
-  }
-
-  /**
-   * Returns the concatenated values of the direct text children of the given
-   * node, with every run of whitespace collapsed into a single space. Text
-   * nested inside child elements is not included.
-   */
-  protected static String getNodeValue(Node node) {
-    StringBuilder buffer = new StringBuilder();
-
-    NodeList children = node.getChildNodes();
-
-    for (int i = 0; i < children.getLength(); i++) {
-      Node child = children.item(i);
-      if (child.getNodeType() == Node.TEXT_NODE) {
-        buffer.append(child.getNodeValue());
-      }
-    }
-
-    // Return with stripped surplus whitespace
-    Matcher matcher = whitespacePattern.matcher(buffer.toString().trim());
-    return matcher.replaceAll(" ").trim();
-  }
-}
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java
----------------------------------------------------------------------
diff --git a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java b/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java
deleted file mode 100644
index 363e0b2..0000000
--- a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Parse filter to extract headings (h1, h2, etc.) from DOM parse tree.
- */
-package org.apache.nutch.parse.headings;
-
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/index-anchor/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/index-anchor/build.xml b/src/plugin/index-anchor/build.xml
deleted file mode 100644
index 597b532..0000000
--- a/src/plugin/index-anchor/build.xml
+++ /dev/null
@@ -1,22 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="index-anchor" default="jar-core">
- <!-- No targets are defined here; the build logic, including the default jar-core target, is expected to come from the imported ../build-plugin.xml. -->
- <import file="../build-plugin.xml" />
-
-</project>
\ No newline at end of file