You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2021/06/12 09:59:27 UTC
[nutch] branch master updated: NUTCH-2869 Add @Override annotations
to Nutch plugins - add/complete @Override annotations for methods
implementing interfaces - plugins implementing the ScoringFilter interface:
extend AbstractScoringFilter and get rid of default method implementations
- URL filters/normalizers: remove unused methods including a CrawlDatum
parameter - improve Javadoc and documentation in build and config files
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 0e3e021 NUTCH-2869 Add @Override annotations to Nutch plugins - add/complete @Override annotations for methods implementing interfaces - plugins implementing the ScoringFilter interface: extend AbstractScoringFilter and get rid of default method implementations - URL filters/normalizers: remove unused methods including a CrawlDatum parameter - improve Javadoc and documentation in build and config files
new 41bf0a1 Merge pull request #650 from sebastian-nagel/NUTCH-2869-plugins-override-annotation
0e3e021 is described below
commit 0e3e021d088b03b83de07963cc0c363c90aaacda
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Thu Jun 10 14:50:31 2021 +0200
NUTCH-2869 Add @Override annotations to Nutch plugins
- add/complete @Override annotations for methods
implementing interfaces
- plugins implementing the ScoringFilter interface:
extend AbstractScoringFilter and get rid of default
method implementations
- URL filters/normalizers: remove unused methods
including a CrawlDatum parameter
- improve Javadoc and documentation in build and config files
---
build.xml | 2 +-
conf/nutch-default.xml | 6 ++-
src/java/org/apache/nutch/net/URLFilter.java | 14 +++--
src/java/org/apache/nutch/net/URLFilters.java | 19 ++++---
.../nutch/scoring/AbstractScoringFilter.java | 11 +++-
.../creativecommons/nutch/CCIndexingFilter.java | 3 ++
.../org/creativecommons/nutch/CCParseFilter.java | 3 ++
.../nutch/indexer/feed/FeedIndexingFilter.java | 3 ++
.../org/apache/nutch/parse/feed/FeedParser.java | 3 ++
.../nutch/parse/headings/HeadingsParseFilter.java | 3 ++
.../nutch/indexer/anchor/AnchorIndexingFilter.java | 3 ++
.../nutch/indexer/basic/BasicIndexingFilter.java | 3 ++
.../nutch/indexer/links/LinksIndexingFilter.java | 2 +
.../nutch/indexer/metadata/MetadataIndexer.java | 3 ++
.../nutch/indexer/more/MoreIndexingFilter.java | 3 ++
.../nutch/indexer/replace/ReplaceIndexer.java | 12 ++---
.../indexer/staticfield/StaticFieldIndexer.java | 3 ++
.../nutch/analysis/lang/HTMLLanguageParser.java | 3 ++
.../analysis/lang/LanguageIndexingFilter.java | 3 ++
.../apache/nutch/protocol/http/api/HttpBase.java | 3 ++
.../nutch/urlfilter/api/RegexURLFilterBase.java | 3 ++
.../microformats/reltag/RelTagIndexingFilter.java | 14 ++---
.../nutch/microformats/reltag/RelTagParser.java | 3 ++
.../java/org/apache/nutch/parse/ext/ExtParser.java | 3 ++
.../org/apache/nutch/parse/html/HtmlParser.java | 1 +
.../org/apache/nutch/parse/js/JSParseFilter.java | 2 +
.../nutch/parse/metatags/MetaTagsParser.java | 3 ++
.../org/apache/nutch/parse/tika/TikaParser.java | 3 ++
.../java/org/apache/nutch/parse/zip/ZipParser.java | 3 ++
.../naivebayes/NaiveBayesParseFilter.java | 2 +
.../nutch/parsefilter/regex/RegexParseFilter.java | 3 ++
.../java/org/apache/nutch/protocol/file/File.java | 3 ++
.../java/org/apache/nutch/protocol/ftp/Ftp.java | 4 ++
.../org/apache/nutch/protocol/htmlunit/Http.java | 4 +-
.../nutch/protocol/htmlunit/HttpResponse.java | 5 ++
.../java/org/apache/nutch/protocol/http/Http.java | 2 +
.../org/apache/nutch/protocol/httpclient/Http.java | 2 +
.../org/apache/nutch/protocol/okhttp/OkHttp.java | 2 +
.../nutch/scoring/depth/DepthScoringFilter.java | 1 +
.../scoring/link/LinkAnalysisScoringFilter.java | 36 ++++---------
.../scoring/metadata/MetadataScoringFilter.java | 4 ++
.../nutch/scoring/opic/OPICScoringFilter.java | 10 ++++
.../nutch/scoring/orphan/OrphanScoringFilter.java | 3 ++
.../org/apache/nutch/collection/Subcollection.java | 1 +
.../subcollection/SubcollectionIndexingFilter.java | 3 ++
.../nutch/indexer/tld/TLDIndexingFilter.java | 7 +--
.../apache/nutch/scoring/tld/TLDScoringFilter.java | 61 ++--------------------
.../nutch/urlfilter/domain/DomainURLFilter.java | 3 ++
.../domaindenylist/DomainDenylistURLFilter.java | 3 ++
.../nutch/urlfilter/prefix/PrefixURLFilter.java | 3 ++
.../nutch/urlfilter/suffix/SuffixURLFilter.java | 3 ++
.../nutch/urlfilter/validator/UrlValidator.java | 3 ++
.../indexer/urlmeta/URLMetaIndexingFilter.java | 4 +-
.../scoring/urlmeta/URLMetaScoringFilter.java | 49 +++--------------
.../net/urlnormalizer/ajax/AjaxURLNormalizer.java | 3 ++
.../net/urlnormalizer/host/HostURLNormalizer.java | 3 ++
.../net/urlnormalizer/pass/PassURLNormalizer.java | 3 ++
.../protocol/ProtocolURLNormalizer.java | 7 ++-
.../querystring/QuerystringURLNormalizer.java | 3 ++
.../urlnormalizer/regex/RegexURLNormalizer.java | 2 +
.../urlnormalizer/slash/SlashURLNormalizer.java | 7 ++-
61 files changed, 216 insertions(+), 173 deletions(-)
diff --git a/build.xml b/build.xml
index dcb7b94..1180dea 100644
--- a/build.xml
+++ b/build.xml
@@ -647,7 +647,7 @@
<typefound uri="antlib:org.apache.ivy.ant" name="cleancache" />
</not>
</condition>
- You need Apache Ivy 2.0 or later from http://ant.apache.org/
+ You need Apache Ivy 2.5.0 or later from https://ant.apache.org/
It could not be loaded from ${ivy.repo.url}
</fail>
</target>
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 3e867e6..1e89745 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -2635,7 +2635,11 @@ Add scoring-metadata to the list of active plugins
<name>publisher.order</name>
<value></value>
<description>
- The order in which the publisher queues would be loaded
+ The order in which the publisher queues would be loaded. If
+ empty, all available publishers (see properties plugin-includes
+ and plugin-excludes) are loaded and applied in system defined
+ order. If not empty, only named publishers are loaded and applied
+ in the given order.
</description>
</property>
diff --git a/src/java/org/apache/nutch/net/URLFilter.java b/src/java/org/apache/nutch/net/URLFilter.java
index afbd1e0..6767b98 100644
--- a/src/java/org/apache/nutch/net/URLFilter.java
+++ b/src/java/org/apache/nutch/net/URLFilter.java
@@ -21,17 +21,23 @@ import org.apache.hadoop.conf.Configurable;
import org.apache.nutch.plugin.Pluggable;
/**
- * Interface used to limit which URLs enter Nutch. Used by the injector and the
- * db updater.
+ * Interface used to limit which URLs enter Nutch. Used per default by injector,
+ * fetcher and parser for all URLs seen first (seeds, outlinks, redirects). URL
+ * filters can be optionally enabled for many more Nutch tools.
*/
-
public interface URLFilter extends Pluggable, Configurable {
+
/** The name of the extension point. */
public final static String X_POINT_ID = URLFilter.class.getName();
- /*
+ /**
* Interface for a filter that transforms a URL: it can pass the original URL
* through or "delete" the URL by returning null
+ *
+ * @param urlString
+ * the URL string the filter is applied on
+ * @return the original URL string if the URL is accepted by the filter or
+ * null in case the URL is rejected
*/
public String filter(String urlString);
}
diff --git a/src/java/org/apache/nutch/net/URLFilters.java b/src/java/org/apache/nutch/net/URLFilters.java
index ed58650..4404626 100644
--- a/src/java/org/apache/nutch/net/URLFilters.java
+++ b/src/java/org/apache/nutch/net/URLFilters.java
@@ -19,7 +19,11 @@ package org.apache.nutch.net;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.plugin.PluginRepository;
-/** Creates and caches {@link URLFilter} implementing plugins. */
+/**
+ * Creates and caches plugins implementing {@link URLFilter}. Filters URLs using
+ * the active filters defined by the properties "plugin.includes",
+ * "plugin.excludes" and "urlfilter.order".
+ */
public class URLFilters {
public static final String URLFILTER_ORDER = "urlfilter.order";
@@ -34,12 +38,15 @@ public class URLFilters {
return this.filters;
}
- /**
- * Run all defined filters. Assume logical AND.
- * @param urlString to execute filters on
+ /**
+ * Run all defined filters. Assume logical AND. To control performance, the
+ * URLFilter classes can be ordered by the property "urlfilter.order".
+ *
+ * @param urlString
+ * to execute filters on
* @return filtered result
- * @throws URLFilterException if there is an issue executing
- * any URLFilter implementations.
+ * @throws URLFilterException
+ * if there is an issue executing any URLFilter implementations.
*/
public String filter(String urlString) throws URLFilterException {
for (int i = 0; i < this.filters.length; i++) {
diff --git a/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java b/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java
index 94e1732..e6ee206 100644
--- a/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java
+++ b/src/java/org/apache/nutch/scoring/AbstractScoringFilter.java
@@ -28,48 +28,55 @@ import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.scoring.ScoringFilter;
-import org.apache.nutch.scoring.ScoringFilterException;
public abstract class AbstractScoringFilter implements ScoringFilter {
private Configuration conf;
+ @Override
public Configuration getConf() {
return conf;
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
}
+ @Override
public void injectedScore(Text url, CrawlDatum datum)
throws ScoringFilterException {
}
+ @Override
public void initialScore(Text url, CrawlDatum datum)
throws ScoringFilterException {
}
+ @Override
public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
throws ScoringFilterException {
return initSort;
}
+ @Override
public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
throws ScoringFilterException {
}
+ @Override
public void passScoreAfterParsing(Text url, Content content, Parse parse)
throws ScoringFilterException {
}
+ @Override
public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
CrawlDatum adjust, int allCount) throws ScoringFilterException {
return adjust;
}
+ @Override
public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
List<CrawlDatum> inlinked) throws ScoringFilterException {
}
diff --git a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
index e0a4253..bb3560d 100644
--- a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
+++ b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
@@ -49,6 +49,7 @@ public class CCIndexingFilter implements IndexingFilter {
private Configuration conf;
+ @Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
@@ -115,10 +116,12 @@ public class CCIndexingFilter implements IndexingFilter {
doc.add(FIELD, feature);
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
}
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
index ba10432..9e7676d 100644
--- a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
+++ b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
@@ -277,6 +277,7 @@ public class CCParseFilter implements HtmlParseFilter {
* Adds metadata or otherwise modifies a parse of an HTML document, given the
* DOM tree of a page.
*/
+ @Override
public ParseResult filter(Content content, ParseResult parseResult,
HTMLMetaTags metaTags, DocumentFragment doc) {
@@ -307,10 +308,12 @@ public class CCParseFilter implements HtmlParseFilter {
return parseResult;
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
}
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java b/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
index 5a2fa77..901caa6 100644
--- a/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
+++ b/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
@@ -64,6 +64,7 @@ public class FeedIndexingFilter implements IndexingFilter {
* And sends them to the {@link org.apache.nutch.indexer Indexer} for indexing within the Nutch index.
*
*/
+ @Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
ParseData parseData = parse.getData();
@@ -107,6 +108,7 @@ public class FeedIndexingFilter implements IndexingFilter {
* @return the {@link Configuration} object used to configure this
* {@link IndexingFilter}.
*/
+ @Override
public Configuration getConf() {
return conf;
}
@@ -119,6 +121,7 @@ public class FeedIndexingFilter implements IndexingFilter {
* The {@link Configuration} object used to configure this
* {@link IndexingFilter}.
*/
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
}
diff --git a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java b/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
index 646c4f9..cecd366 100644
--- a/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
+++ b/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
@@ -100,6 +100,7 @@ public class FeedParser implements Parser {
* present in the feed file that this {@link Parser} dealt with.
*
*/
+ @Override
public ParseResult getParse(Content content) {
SyndFeed feed = null;
ParseResult parseResult = new ParseResult(content.getUrl());
@@ -162,6 +163,7 @@ public class FeedParser implements Parser {
* {@link Parser}.
*
*/
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
this.parserFactory = new ParserFactory(conf);
@@ -176,6 +178,7 @@ public class FeedParser implements Parser {
* @return The {@link Configuration} object used to configure this
* {@link Parser}.
*/
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java b/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
index 4b446bb..57d6de9 100644
--- a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
+++ b/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
@@ -45,6 +45,7 @@ public class HeadingsParseFilter implements HtmlParseFilter {
private String[] headings;
private boolean multiValued = false;
+ @Override
public ParseResult filter(Content content, ParseResult parseResult,
HTMLMetaTags metaTags, DocumentFragment doc) {
Parse parse = parseResult.get(content.getUrl());
@@ -68,6 +69,7 @@ public class HeadingsParseFilter implements HtmlParseFilter {
return parseResult;
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
@@ -75,6 +77,7 @@ public class HeadingsParseFilter implements HtmlParseFilter {
multiValued = conf.getBoolean("headings.multivalued", false);
}
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
index 2b280d5..7493c31 100644
--- a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
+++ b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
@@ -46,6 +46,7 @@ public class AnchorIndexingFilter implements IndexingFilter {
/**
* Set the {@link Configuration} object
*/
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
@@ -56,6 +57,7 @@ public class AnchorIndexingFilter implements IndexingFilter {
/**
* Get the {@link Configuration} object
*/
+ @Override
public Configuration getConf() {
return this.conf;
}
@@ -77,6 +79,7 @@ public class AnchorIndexingFilter implements IndexingFilter {
* The {@link Inlinks} containing anchor text
* @return filtered NutchDocument
*/
+ @Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
diff --git a/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java b/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
index 94cd1fc..0eab1a7 100644
--- a/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
+++ b/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
@@ -68,6 +68,7 @@ public class BasicIndexingFilter implements IndexingFilter {
* The {@link Inlinks} containing anchor text
* @return filtered NutchDocument
*/
+ @Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
@@ -135,6 +136,7 @@ public class BasicIndexingFilter implements IndexingFilter {
/**
* Set the {@link Configuration} object
*/
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
@@ -145,6 +147,7 @@ public class BasicIndexingFilter implements IndexingFilter {
/**
* Get the {@link Configuration} object
*/
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java b/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
index 35370f2..4833237 100644
--- a/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
+++ b/src/plugin/index-links/src/java/org/apache/nutch/indexer/links/LinksIndexingFilter.java
@@ -156,6 +156,7 @@ public class LinksIndexingFilter implements IndexingFilter {
}
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
filterOutlinks = conf.getBoolean(LINKS_OUTLINKS_HOST, false);
@@ -164,6 +165,7 @@ public class LinksIndexingFilter implements IndexingFilter {
indexHost = conf.getBoolean(LINKS_ONLY_HOSTS, false);
}
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
index e2f722c..a8eb8ef 100644
--- a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
+++ b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
@@ -49,6 +49,7 @@ public class MetadataIndexer implements IndexingFilter {
private static final String separator_CONF_PROPERTY = "index.metadata.separator";
private static final String mvfields_CONF_PROPERTY = "index.metadata.multivalued.fields";
+ @Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
@@ -105,6 +106,7 @@ public class MetadataIndexer implements IndexingFilter {
}
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
dbFieldnames = conf.getStrings(db_CONF_PROPERTY);
@@ -119,6 +121,7 @@ public class MetadataIndexer implements IndexingFilter {
}
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
index 2a475c5..6f40359 100644
--- a/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
+++ b/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
@@ -105,6 +105,7 @@ public class MoreIndexingFilter implements IndexingFilter {
"yyyy-MM-dd'T'HH:mm:ssXXX" };
private String[] dateStyles = null;
+ @Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
@@ -311,6 +312,7 @@ public class MoreIndexingFilter implements IndexingFilter {
return doc;
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
MIME = new MimeUtil(conf);
@@ -352,6 +354,7 @@ public class MoreIndexingFilter implements IndexingFilter {
}
}
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java b/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
index 503310a..8dde66f 100644
--- a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
+++ b/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
@@ -102,9 +102,7 @@ public class ReplaceIndexer implements IndexingFilter {
private Configuration conf;
- /**
- * {@inheritDoc}
- */
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
FIELDREPLACERS_BY_HOST.clear();
@@ -116,9 +114,7 @@ public class ReplaceIndexer implements IndexingFilter {
}
}
- /**
- * {@inheritDoc}
- */
+ @Override
public Configuration getConf() {
return this.conf;
}
@@ -233,9 +229,7 @@ public class ReplaceIndexer implements IndexingFilter {
}
}
- /**
- * {@inheritDoc}
- */
+ @Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
diff --git a/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java b/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
index bd68dd1..c022ca7 100644
--- a/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
+++ b/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
@@ -59,6 +59,7 @@ public class StaticFieldIndexer implements IndexingFilter {
* The {@link Inlinks} containing anchor text
* @return filtered NutchDocument
*/
+ @Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
@@ -99,6 +100,7 @@ public class StaticFieldIndexer implements IndexingFilter {
/**
* Set the {@link Configuration} object
*/
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
@@ -117,6 +119,7 @@ public class StaticFieldIndexer implements IndexingFilter {
/**
* Get the {@link Configuration} object
*/
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
index 28878dc..41fe099 100644
--- a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
+++ b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
@@ -85,6 +85,7 @@ public class HTMLLanguageParser implements HtmlParseFilter {
* -html.shtml#language) <li>3. meta http-equiv (content-language)
* (http://www.w3.org/TR/REC-html40/struct/global.html#h-7.4.4.2) <br></ul>
*/
+ @Override
public ParseResult filter(Content content, ParseResult parseResult,
HTMLMetaTags metaTags, DocumentFragment doc) {
String lang = null;
@@ -300,6 +301,7 @@ public class HTMLLanguageParser implements HtmlParseFilter {
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
contentMaxlength = conf.getInt("lang.analyze.max.length", -1);
@@ -314,6 +316,7 @@ public class HTMLLanguageParser implements HtmlParseFilter {
}
}
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
index 10289e5..ed9362e 100644
--- a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
+++ b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
@@ -60,6 +60,7 @@ public class LanguageIndexingFilter implements IndexingFilter {
}
// Inherited JavaDoc
+ @Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
@@ -84,11 +85,13 @@ public class LanguageIndexingFilter implements IndexingFilter {
return doc;
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
indexLangs = new HashSet<>(conf.getStringCollection("lang.index.languages"));
}
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 58dfbfe..ce999b3 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -203,6 +203,7 @@ public abstract class HttpBase implements Protocol {
robots = new HttpRobotRulesParser();
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
this.proxyHost = conf.get("http.proxy.host");
@@ -373,10 +374,12 @@ public abstract class HttpBase implements Protocol {
logConf();
}
+ @Override
public Configuration getConf() {
return this.conf;
}
+ @Override
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
String urlString = url.toString();
diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
index af54c00..0ddb698 100644
--- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
+++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
@@ -168,6 +168,7 @@ public abstract class RegexURLFilterBase implements URLFilter {
protected abstract Reader getRulesReader(Configuration conf)
throws IOException;
+ @Override
public String filter(String url) {
String host = null;
String domain = null;
@@ -205,6 +206,7 @@ public abstract class RegexURLFilterBase implements URLFilter {
return null;
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
Reader reader = null;
@@ -226,6 +228,7 @@ public abstract class RegexURLFilterBase implements URLFilter {
}
}
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
index e0fcfa7..b2121d9 100644
--- a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
+++ b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
@@ -38,7 +38,7 @@ public class RelTagIndexingFilter implements IndexingFilter {
private Configuration conf;
- // Inherited JavaDoc
+ @Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
@@ -54,22 +54,14 @@ public class RelTagIndexingFilter implements IndexingFilter {
return doc;
}
- /*
- * ----------------------------- * <implementation:Configurable> *
- * -----------------------------
- */
-
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
}
+ @Override
public Configuration getConf() {
return this.conf;
}
- /*
- * ------------------------------ * </implementation:Configurable> *
- * ------------------------------
- */
-
}
diff --git a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
index 0efcbb3..3d96a7b 100644
--- a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
+++ b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
@@ -58,6 +58,7 @@ public class RelTagParser implements HtmlParseFilter {
/**
* Scan the HTML document looking at possible rel-tags
*/
+ @Override
public ParseResult filter(Content content, ParseResult parseResult,
HTMLMetaTags metaTags, DocumentFragment doc) {
@@ -136,10 +137,12 @@ public class RelTagParser implements HtmlParseFilter {
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
}
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java b/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
index dfebb53..525cfc1 100644
--- a/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
+++ b/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
@@ -66,6 +66,7 @@ public class ExtParser implements Parser {
public ExtParser() {
}
+ @Override
public ParseResult getParse(Content content) {
String contentType = content.getContentType();
@@ -141,6 +142,7 @@ public class ExtParser implements Parser {
parseData));
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
Extension[] extensions = PluginRepository.get(conf)
@@ -177,6 +179,7 @@ public class ExtParser implements Parser {
}
}
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
index 5852b14..6a6d49d 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
@@ -143,6 +143,7 @@ public class HtmlParser implements Parser {
private String cachingPolicy;
+ @Override
public ParseResult getParse(Content content) {
HTMLMetaTags metaTags = new HTMLMetaTags();
diff --git a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
index e6527e2..c27ef4a 100644
--- a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
+++ b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
@@ -302,10 +302,12 @@ public class JSParseFilter implements HtmlParseFilter, Parser {
System.out.println(" - " + links[i]);
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
}
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
index 8deaf18..6cef438 100644
--- a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
+++ b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
@@ -48,6 +48,7 @@ public class MetaTagsParser implements HtmlParseFilter {
private Set<String> metatagset = new HashSet<String>();
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
// specify whether we want a specific subset of metadata
@@ -58,6 +59,7 @@ public class MetaTagsParser implements HtmlParseFilter {
}
}
+ @Override
public Configuration getConf() {
return this.conf;
}
@@ -95,6 +97,7 @@ public class MetaTagsParser implements HtmlParseFilter {
}
}
+ @Override
public ParseResult filter(Content content, ParseResult parseResult,
HTMLMetaTags metaTags, DocumentFragment doc) {
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index d97e8b4..4b79eee 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -80,6 +80,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
private String boilerpipeExtractorName;
private Set<String> boilerpipeMimeTypes;
+ @Override
public ParseResult getParse(Content content) {
HTMLDocumentImpl doc = new HTMLDocumentImpl();
doc.setErrorChecking(false);
@@ -257,6 +258,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
return filteredParse;
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
this.tikaConfig = null;
@@ -324,6 +326,7 @@ public class TikaParser implements org.apache.nutch.parse.Parser {
parseEmbedded = conf.getBoolean("tika.parse.embedded", true);
}
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
index c4b953e..a605f3b 100644
--- a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
+++ b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
@@ -54,6 +54,7 @@ public class ZipParser implements Parser {
public ZipParser() {
}
+ @Override
public ParseResult getParse(final Content content) {
String resultText = null;
@@ -109,10 +110,12 @@ public class ZipParser implements Parser {
resultText, parseData));
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
}
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
index 25354bd..76821a2 100644
--- a/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
+++ b/src/plugin/parsefilter-naivebayes/src/java/org/apache/nutch/parsefilter/naivebayes/NaiveBayesParseFilter.java
@@ -104,6 +104,7 @@ public class NaiveBayesParseFilter implements HtmlParseFilter {
return false;
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
inputFilePath = conf.get(TRAINFILE_MODELFILTER);
@@ -150,6 +151,7 @@ public class NaiveBayesParseFilter implements HtmlParseFilter {
}
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
index 6e86fc6..bc17eb0 100644
--- a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
+++ b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
@@ -56,6 +56,7 @@ public class RegexParseFilter implements HtmlParseFilter {
private static final Map<String,RegexRule> rules = new HashMap<>();
+ @Override
public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
Parse parse = parseResult.get(content.getUrl());
String html = new String(content.getContent());
@@ -87,6 +88,7 @@ public class RegexParseFilter implements HtmlParseFilter {
return parseResult;
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
@@ -139,6 +141,7 @@ public class RegexParseFilter implements HtmlParseFilter {
}
}
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
index 4120cbb..d55e42e 100644
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
+++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
@@ -70,6 +70,7 @@ public class File implements Protocol {
/**
* Set the {@link Configuration} object
*/
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
this.maxContentLength = conf.getInt("file.content.limit", 1024 * 1024);
@@ -81,6 +82,7 @@ public class File implements Protocol {
/**
* Get the {@link Configuration} object
*/
+ @Override
public Configuration getConf() {
return this.conf;
}
@@ -105,6 +107,7 @@ public class File implements Protocol {
* @return {@link ProtocolOutput} object for the content of the file indicated
* by url
*/
+ @Override
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
String urlString = url.toString();
try {
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
index 470e151..2a47b63 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
@@ -140,6 +140,7 @@ public class Ftp implements Protocol {
*
* @return {@link ProtocolOutput} object for the url
*/
+ @Override
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
String urlString = url.toString();
try {
@@ -186,6 +187,7 @@ public class Ftp implements Protocol {
}
}
+ @Override
protected void finalize() {
try {
if (this.client != null && this.client.isConnected()) {
@@ -272,6 +274,7 @@ public class Ftp implements Protocol {
/**
* Set the {@link Configuration} object
*/
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
this.maxContentLength = conf.getInt("ftp.content.limit", 1024 * 1024);
@@ -287,6 +290,7 @@ public class Ftp implements Protocol {
/**
* Get the {@link Configuration} object
*/
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
index b093e5c..40a6941 100644
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
@@ -48,6 +48,7 @@ public class Http extends HttpBase {
*
* @param conf a popultaed {@link Configuration}
*/
+ @Override
public void setConf(Configuration conf) {
super.setConf(conf);
}
@@ -57,7 +58,8 @@ public class Http extends HttpBase {
http.setConf(NutchConfiguration.create());
main(http, args);
}
-
+
+ @Override
protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
throws ProtocolException, IOException {
return new HttpResponse(this, url, datum);
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
index 58e809a..ae876e0 100644
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -305,22 +305,27 @@ public class HttpResponse implements Response {
* -------------------------
*/
+ @Override
public URL getUrl() {
return url;
}
+ @Override
public int getCode() {
return code;
}
+ @Override
public String getHeader(String name) {
return headers.get(name);
}
+ @Override
public Metadata getHeaders() {
return headers;
}
+ @Override
public byte[] getContent() {
return content;
}
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
index b85c47a..cc10221 100644
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
+++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
@@ -48,6 +48,7 @@ public class Http extends HttpBase {
*
* @param conf a populated {@link Configuration}
*/
+ @Override
public void setConf(Configuration conf) {
super.setConf(conf);
}
@@ -58,6 +59,7 @@ public class Http extends HttpBase {
main(http, args);
}
+ @Override
protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
throws ProtocolException, IOException {
return new HttpResponse(this, url, datum);
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
index 2247f5e..5942486 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
@@ -128,6 +128,7 @@ public class Http extends HttpBase {
* @param conf
* Configuration
*/
+ @Override
public void setConf(Configuration conf) {
super.setConf(conf);
Http.conf = conf;
@@ -174,6 +175,7 @@ public class Http extends HttpBase {
* Follow redirects if and only if true
* @return HTTP response
*/
+ @Override
protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
throws ProtocolException, IOException {
resolveCredentials(url);
diff --git a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
index 9fbe9fa..65cb2d3 100644
--- a/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
+++ b/src/plugin/protocol-okhttp/src/java/org/apache/nutch/protocol/okhttp/OkHttp.java
@@ -106,6 +106,7 @@ public class OkHttp extends HttpBase {
super(LOG);
}
+ @Override
public void setConf(Configuration conf) {
super.setConf(conf);
@@ -328,6 +329,7 @@ public class OkHttp extends HttpBase {
return client;
}
+ @Override
protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
throws ProtocolException, IOException {
return new OkHttpResponse(this, url, datum);
diff --git a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
index 29b119b..e6aa7a6 100644
--- a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
+++ b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
@@ -138,6 +138,7 @@ public class DepthScoringFilter extends Configured implements ScoringFilter {
return initSort * (1 + mul);
}
+ @Override
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
throws ScoringFilterException {
diff --git a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java b/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
index c98ccce..41895ea 100644
--- a/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
+++ b/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
@@ -16,9 +16,7 @@
*/
package org.apache.nutch.scoring.link;
-import java.util.Collection;
import java.util.List;
-import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
@@ -27,41 +25,31 @@ import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.AbstractScoringFilter;
import org.apache.nutch.scoring.ScoringFilterException;
-public class LinkAnalysisScoringFilter implements ScoringFilter {
+public class LinkAnalysisScoringFilter extends AbstractScoringFilter {
- private Configuration conf;
private float normalizedScore = 1.00f;
private float initialScore = 0.0f;
public LinkAnalysisScoringFilter() {
-
- }
-
- public Configuration getConf() {
- return conf;
}
+ @Override
public void setConf(Configuration conf) {
- this.conf = conf;
+ super.setConf(conf);
normalizedScore = conf.getFloat("link.analyze.normalize.score", 1.00f);
}
- public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
- ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
- CrawlDatum adjust, int allCount) throws ScoringFilterException {
- return adjust;
- }
-
+ @Override
public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
throws ScoringFilterException {
return datum.getScore() * initSort;
}
+ @Override
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
throws ScoringFilterException {
@@ -71,29 +59,23 @@ public class LinkAnalysisScoringFilter implements ScoringFilter {
return (normalizedScore * dbDatum.getScore());
}
+ @Override
public void initialScore(Text url, CrawlDatum datum)
throws ScoringFilterException {
datum.setScore(initialScore);
}
- public void injectedScore(Text url, CrawlDatum datum)
- throws ScoringFilterException {
- }
-
+ @Override
public void passScoreAfterParsing(Text url, Content content, Parse parse)
throws ScoringFilterException {
parse.getData().getContentMeta()
.set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY));
}
+ @Override
public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
throws ScoringFilterException {
content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore());
}
- public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
- List<CrawlDatum> inlinked) throws ScoringFilterException {
- // nothing to do
- }
-
}
diff --git a/src/plugin/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/MetadataScoringFilter.java b/src/plugin/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/MetadataScoringFilter.java
index e3ad56e..489491c 100644
--- a/src/plugin/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/MetadataScoringFilter.java
+++ b/src/plugin/scoring-metadata/src/java/org/apache/nutch/scoring/metadata/MetadataScoringFilter.java
@@ -58,6 +58,7 @@ public class MetadataScoringFilter extends AbstractScoringFilter {
*
* @see ScoringFilter#distributeScoreToOutlinks
*/
+ @Override
public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
CrawlDatum adjust, int allCount) throws ScoringFilterException {
@@ -90,6 +91,7 @@ public class MetadataScoringFilter extends AbstractScoringFilter {
* @see ScoringFilter#passScoreBeforeParsing
* @see MetadataScoringFilter#passScoreAfterParsing
*/
+ @Override
public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) {
if (datumMetadata == null || content == null || datum == null)
return;
@@ -112,6 +114,7 @@ public class MetadataScoringFilter extends AbstractScoringFilter {
* @see MetadataScoringFilter#passScoreBeforeParsing
* @see ScoringFilter#passScoreAfterParsing
*/
+ @Override
public void passScoreAfterParsing(Text url, Content content, Parse parse) {
if (contentMetadata == null || content == null || parse == null)
return;
@@ -130,6 +133,7 @@ public class MetadataScoringFilter extends AbstractScoringFilter {
* handles conf assignment and pulls the value assignment from the
* "scoring.db.md", "scoring.content.md" and "scoring.parse.md" properties.
*/
+ @Override
public void setConf(Configuration conf) {
super.setConf(conf);
diff --git a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
index 4c6c36b..54e2fe5 100644
--- a/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
+++ b/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
@@ -58,10 +58,12 @@ public class OPICScoringFilter implements ScoringFilter {
private float externalScoreFactor;
private boolean countFiltered;
+ @Override
public Configuration getConf() {
return conf;
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
scorePower = conf.getFloat("indexer.score.power", 0.5f);
@@ -70,6 +72,7 @@ public class OPICScoringFilter implements ScoringFilter {
countFiltered = conf.getBoolean("db.score.count.filtered", false);
}
+ @Override
public void injectedScore(Text url, CrawlDatum datum)
throws ScoringFilterException {
}
@@ -78,18 +81,21 @@ public class OPICScoringFilter implements ScoringFilter {
* Set to 0.0f (unknown value) - inlink contributions will bring it to a
* correct level. Newly discovered pages have at least one inlink.
*/
+ @Override
public void initialScore(Text url, CrawlDatum datum)
throws ScoringFilterException {
datum.setScore(0.0f);
}
/** Use {@link CrawlDatum#getScore()}. */
+ @Override
public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
throws ScoringFilterException {
return datum.getScore() * initSort;
}
/** Increase the score by a sum of inlinked scores. */
+ @Override
public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
List<CrawlDatum> inlinked) throws ScoringFilterException {
float adjust = 0.0f;
@@ -103,11 +109,13 @@ public class OPICScoringFilter implements ScoringFilter {
}
/** Store a float value of CrawlDatum.getScore() under Fetcher.SCORE_KEY. */
+ @Override
public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) {
content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore());
}
/** Copy the value from Content metadata under Fetcher.SCORE_KEY to parseData. */
+ @Override
public void passScoreAfterParsing(Text url, Content content, Parse parse) {
parse.getData().getContentMeta()
.set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY));
@@ -117,6 +125,7 @@ public class OPICScoringFilter implements ScoringFilter {
* Get a float value from Fetcher.SCORE_KEY, divide it by the number of
* outlinks and apply.
*/
+ @Override
public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
CrawlDatum adjust, int allCount) throws ScoringFilterException {
@@ -163,6 +172,7 @@ public class OPICScoringFilter implements ScoringFilter {
}
/** Dampen the boost value by scorePower. */
+ @Override
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
throws ScoringFilterException {
diff --git a/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java b/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java
index 3471a95..a0ab439 100644
--- a/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java
+++ b/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java
@@ -46,6 +46,7 @@ public class OrphanScoringFilter extends AbstractScoringFilter {
private long markGoneAfter = DEFAULT_GONE_TIME;
private long markOrphanAfter = DEFAULT_ORPHAN_TIME;
+ @Override
public void setConf(Configuration conf) {
markGoneAfter = conf.getInt("scoring.orphan.mark.gone.after",
DEFAULT_GONE_TIME);
@@ -71,6 +72,7 @@ public class OrphanScoringFilter extends AbstractScoringFilter {
* @param inlinks
* list of inlinked CrawlDatums
*/
+ @Override
public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
List<CrawlDatum> inlinks) throws ScoringFilterException {
@@ -86,6 +88,7 @@ public class OrphanScoringFilter extends AbstractScoringFilter {
}
}
+ @Override
public void orphanedScore(Text url, CrawlDatum datum) {
// Already has an orphaned time?
if (datum.getMetaData().containsKey(ORPHAN_KEY_WRITABLE)) {
diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java b/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
index 007eeae..b82ffd6 100644
--- a/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
+++ b/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
@@ -180,6 +180,7 @@ public class Subcollection extends Configured implements URLFilter {
*
* @see org.apache.nutch.net.URLFilter#filter(java.lang.String)
*/
+ @Override
public String filter(String urlString) {
// first the blacklist
Iterator<String> i = blackList.iterator();
diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
index c7ba54e..6aaa452 100644
--- a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
+++ b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
@@ -49,6 +49,7 @@ public class SubcollectionIndexingFilter extends Configured implements
/**
* @param conf A populated {@link Configuration}
*/
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
fieldName = conf.get("subcollection.default.fieldname", "subcollection");
@@ -60,6 +61,7 @@ public class SubcollectionIndexingFilter extends Configured implements
/**
* @return Configuration
*/
+ @Override
public Configuration getConf() {
return this.conf;
}
@@ -91,6 +93,7 @@ public class SubcollectionIndexingFilter extends Configured implements
}
}
+ @Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
// Check for subcollection overrride in HTML metadata
diff --git a/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java b/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
index 4f3a92c..296124d 100644
--- a/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
+++ b/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
@@ -33,9 +33,7 @@ import org.apache.nutch.util.URLUtil;
import org.apache.nutch.util.domain.DomainSuffix;
/**
- * Adds the Top level domain extensions to the index
- *
- * @author Enis Soztutar <enis.soz.nutch@gmail.com>
+ * Adds the top-level domain extensions to the index
*/
public class TLDIndexingFilter implements IndexingFilter {
private static final Logger LOG = LoggerFactory
@@ -43,6 +41,7 @@ public class TLDIndexingFilter implements IndexingFilter {
private Configuration conf;
+ @Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
@@ -59,10 +58,12 @@ public class TLDIndexingFilter implements IndexingFilter {
return doc;
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
}
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java b/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
index 95891dd..5f30809 100644
--- a/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
+++ b/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
@@ -16,10 +16,6 @@
*/
package org.apache.nutch.scoring.tld;
-import java.util.List;
-import java.util.Collection;
-import java.util.Map.Entry;
-
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
@@ -27,27 +23,23 @@ import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.NutchField;
import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseData;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.AbstractScoringFilter;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.util.domain.DomainSuffix;
import org.apache.nutch.util.domain.DomainSuffixes;
/**
- * Scoring filter to boost tlds.
- *
- * @author Enis Soztutar <enis.soz.nutch@gmail.com>
+ * Scoring filter to boost top-level domains (TLDs).
*/
-public class TLDScoringFilter implements ScoringFilter {
+public class TLDScoringFilter extends AbstractScoringFilter {
- private Configuration conf;
private DomainSuffixes tldEntries;
public TLDScoringFilter() {
tldEntries = DomainSuffixes.getInstance();
}
+ @Override
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
throws ScoringFilterException {
@@ -65,49 +57,4 @@ public class TLDScoringFilter implements ScoringFilter {
return initScore * boost;
}
- public CrawlDatum distributeScoreToOutlink(Text fromUrl, Text toUrl,
- ParseData parseData, CrawlDatum target, CrawlDatum adjust, int allCount,
- int validCount) throws ScoringFilterException {
- return adjust;
- }
-
- public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
- throws ScoringFilterException {
- return initSort;
- }
-
- public void initialScore(Text url, CrawlDatum datum)
- throws ScoringFilterException {
- }
-
- public void injectedScore(Text url, CrawlDatum datum)
- throws ScoringFilterException {
- }
-
- public void passScoreAfterParsing(Text url, Content content, Parse parse)
- throws ScoringFilterException {
- }
-
- public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
- throws ScoringFilterException {
- }
-
- public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
- List<CrawlDatum> inlinked) throws ScoringFilterException {
- }
-
- public Configuration getConf() {
- return conf;
- }
-
- public void setConf(Configuration conf) {
- this.conf = conf;
- }
-
- public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
- ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
- CrawlDatum adjust, int allCount) throws ScoringFilterException {
- return adjust;
- }
-
}
diff --git a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
index f629262..c68750c 100644
--- a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
+++ b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
@@ -101,6 +101,7 @@ public class DomainURLFilter implements URLFilter {
/**
* Sets the configuration.
*/
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
@@ -149,10 +150,12 @@ public class DomainURLFilter implements URLFilter {
}
}
+ @Override
public Configuration getConf() {
return this.conf;
}
+ @Override
public String filter(String url) {
// https://issues.apache.org/jira/browse/NUTCH-2189
if (domainSet.size() == 0) return url;
diff --git a/src/plugin/urlfilter-domaindenylist/src/java/org/apache/nutch/urlfilter/domaindenylist/DomainDenylistURLFilter.java b/src/plugin/urlfilter-domaindenylist/src/java/org/apache/nutch/urlfilter/domaindenylist/DomainDenylistURLFilter.java
index 58e3754..7b38bfc 100644
--- a/src/plugin/urlfilter-domaindenylist/src/java/org/apache/nutch/urlfilter/domaindenylist/DomainDenylistURLFilter.java
+++ b/src/plugin/urlfilter-domaindenylist/src/java/org/apache/nutch/urlfilter/domaindenylist/DomainDenylistURLFilter.java
@@ -101,6 +101,7 @@ public class DomainDenylistURLFilter implements URLFilter {
/**
* Sets the configuration.
*/
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
@@ -150,10 +151,12 @@ public class DomainDenylistURLFilter implements URLFilter {
}
}
+ @Override
public Configuration getConf() {
return this.conf;
}
+ @Override
public String filter(String url) {
try {
// match for suffix, domain, and host in that order. more general will
diff --git a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
index c54740a..ccba29c 100644
--- a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
+++ b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
@@ -66,6 +66,7 @@ public class PrefixURLFilter implements URLFilter {
trie = readConfiguration(new StringReader(stringRules));
}
+ @Override
public String filter(String url) {
if (trie.shortestMatch(url) == null)
return null;
@@ -115,6 +116,7 @@ public class PrefixURLFilter implements URLFilter {
}
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
@@ -165,6 +167,7 @@ public class PrefixURLFilter implements URLFilter {
}
}
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
index ff3826a..dd8605f 100644
--- a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
+++ b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
@@ -141,6 +141,7 @@ public class SuffixURLFilter implements URLFilter {
readConfiguration(reader);
}
+ @Override
public String filter(String url) {
if (url == null)
return null;
@@ -249,6 +250,7 @@ public class SuffixURLFilter implements URLFilter {
}
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
@@ -293,6 +295,7 @@ public class SuffixURLFilter implements URLFilter {
}
}
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java b/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java
index 84d516b..14fed8a 100644
--- a/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java
+++ b/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java
@@ -132,14 +132,17 @@ public class UrlValidator implements URLFilter {
private Configuration conf;
+ @Override
public String filter(String urlString) {
return isValid(urlString) ? urlString : null;
}
+ @Override
public Configuration getConf() {
return conf;
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
}
diff --git a/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java b/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
index e34e087..557b8eb 100644
--- a/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
+++ b/src/plugin/urlmeta/src/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
@@ -75,6 +75,7 @@ public class URLMetaIndexingFilter implements IndexingFilter {
*
* @see IndexingFilter#filter
*/
+ @Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
if (conf != null)
@@ -93,7 +94,7 @@ public class URLMetaIndexingFilter implements IndexingFilter {
return doc;
}
- /** Boilerplate */
+ @Override
public Configuration getConf() {
return conf;
}
@@ -102,6 +103,7 @@ public class URLMetaIndexingFilter implements IndexingFilter {
* handles conf assignment and pulls the value assignment from the
* "urlmeta.tags" property
*/
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
diff --git a/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java b/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java
index 1b179ba..cb7e1b0 100644
--- a/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java
+++ b/src/plugin/urlmeta/src/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java
@@ -17,19 +17,16 @@
package org.apache.nutch.scoring.urlmeta;
import java.util.Collection;
-import java.util.Map.Entry;
import java.util.Iterator;
-import java.util.List;
+import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.AbstractScoringFilter;
import org.apache.nutch.scoring.ScoringFilter;
import org.apache.nutch.scoring.ScoringFilterException;
@@ -38,11 +35,10 @@ import org.apache.nutch.scoring.ScoringFilterException;
*
* {@link org.apache.nutch.scoring.urlmeta}
*/
-public class URLMetaScoringFilter extends Configured implements ScoringFilter {
+public class URLMetaScoringFilter extends AbstractScoringFilter {
private static final String CONF_PROPERTY = "urlmeta.tags";
private static String[] urlMetaTags;
- private Configuration conf;
/**
* This will take the metatags that you have listed in your "urlmeta.tags"
@@ -52,6 +48,7 @@ public class URLMetaScoringFilter extends Configured implements ScoringFilter {
*
* @see ScoringFilter#distributeScoreToOutlinks
*/
+ @Override
public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
CrawlDatum adjust, int allCount) throws ScoringFilterException {
@@ -84,6 +81,7 @@ public class URLMetaScoringFilter extends Configured implements ScoringFilter {
* @see ScoringFilter#passScoreBeforeParsing
* @see URLMetaScoringFilter#passScoreAfterParsing
*/
+ @Override
public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) {
if (urlMetaTags == null || content == null || datum == null)
return;
@@ -105,6 +103,7 @@ public class URLMetaScoringFilter extends Configured implements ScoringFilter {
* @see URLMetaScoringFilter#passScoreBeforeParsing
* @see ScoringFilter#passScoreAfterParsing
*/
+ @Override
public void passScoreAfterParsing(Text url, Content content, Parse parse) {
if (urlMetaTags == null || content == null || parse == null)
return;
@@ -119,41 +118,11 @@ public class URLMetaScoringFilter extends Configured implements ScoringFilter {
}
}
- /** Boilerplate */
- public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
- throws ScoringFilterException {
- return initSort;
- }
-
- /** Boilerplate */
- public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
- CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
- throws ScoringFilterException {
- return initScore;
- }
-
- /** Boilerplate */
- public void initialScore(Text url, CrawlDatum datum)
- throws ScoringFilterException {
- return;
- }
-
- /** Boilerplate */
- public void injectedScore(Text url, CrawlDatum datum)
- throws ScoringFilterException {
- return;
- }
-
- /** Boilerplate */
- public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
- List<CrawlDatum> inlinked) throws ScoringFilterException {
- return;
- }
-
/**
* handles conf assignment and pulls the value assignment from the
* "urlmeta.tags" property
*/
+ @Override
public void setConf(Configuration conf) {
super.setConf(conf);
@@ -163,8 +132,4 @@ public class URLMetaScoringFilter extends Configured implements ScoringFilter {
urlMetaTags = conf.getStrings(CONF_PROPERTY);
}
- /** Boilerplate */
- public Configuration getConf() {
- return conf;
- }
}
diff --git a/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java b/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
index b596400..7c55cd7 100644
--- a/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
+++ b/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
@@ -60,6 +60,7 @@ public class AjaxURLNormalizer implements URLNormalizer {
* @return String
* @throws MalformedURLException if the urlString is malformed
*/
+ @Override
public String normalize(String urlString, String scope) throws MalformedURLException {
LOG.info(scope + " // " + urlString);
@@ -224,6 +225,7 @@ public class AjaxURLNormalizer implements URLNormalizer {
/**
* @param conf a populated {@link Configuration}
*/
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
}
@@ -231,6 +233,7 @@ public class AjaxURLNormalizer implements URLNormalizer {
/**
* @return Configuration
*/
+ @Override
public Configuration getConf() {
return this.conf;
}
diff --git a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
index 3a3c8a4..537868b 100644
--- a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
+++ b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
@@ -77,10 +77,12 @@ public class HostURLNormalizer implements URLNormalizer {
}
}
+ @Override
public Configuration getConf() {
return conf;
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
@@ -137,6 +139,7 @@ public class HostURLNormalizer implements URLNormalizer {
}
}
+ @Override
public String normalize(String urlString, String scope)
throws MalformedURLException {
String host = new URL(urlString).getHost();
diff --git a/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java b/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java
index 717471c..18e5fc4 100644
--- a/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java
+++ b/src/plugin/urlnormalizer-pass/src/java/org/apache/nutch/net/urlnormalizer/pass/PassURLNormalizer.java
@@ -32,15 +32,18 @@ public class PassURLNormalizer implements URLNormalizer {
private Configuration conf;
+ @Override
public String normalize(String urlString, String scope)
throws MalformedURLException {
return urlString;
}
+ @Override
public Configuration getConf() {
return conf;
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
}
diff --git a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
index e1afde8..d747858 100644
--- a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
+++ b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
@@ -122,10 +122,12 @@ public class ProtocolURLNormalizer implements URLNormalizer {
protocolsMap.size(), domainProtocolsMap.size());
}
+ @Override
public Configuration getConf() {
return conf;
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
@@ -185,11 +187,8 @@ public class ProtocolURLNormalizer implements URLNormalizer {
}
}
+ @Override
public String normalize(String url, String scope) throws MalformedURLException {
- return normalize(url, null, scope);
- }
-
- public String normalize(String url, CrawlDatum crawlDatum, String scope) throws MalformedURLException {
// Get URL repr.
URL u = new URL(url);
diff --git a/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java b/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java
index 60ec55e..f8a547b 100644
--- a/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java
+++ b/src/plugin/urlnormalizer-querystring/src/java/org/apache/nutch/net/urlnormalizer/querystring/QuerystringURLNormalizer.java
@@ -39,14 +39,17 @@ public class QuerystringURLNormalizer implements URLNormalizer {
public QuerystringURLNormalizer() {
}
+ @Override
public Configuration getConf() {
return conf;
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
}
+ @Override
public String normalize(String urlString, String scope)
throws MalformedURLException {
URL url = new URL(urlString);
diff --git a/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java b/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
index 885944e..c86d55a 100644
--- a/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
+++ b/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
@@ -123,6 +123,7 @@ public class RegexURLNormalizer extends Configured implements URLNormalizer {
}
}
+ @Override
public void setConf(Configuration conf) {
super.setConf(conf);
if (conf == null)
@@ -202,6 +203,7 @@ public class RegexURLNormalizer extends Configured implements URLNormalizer {
return urlString;
}
+ @Override
public String normalize(String urlString, String scope)
throws MalformedURLException {
return regexNormalize(urlString, scope);
diff --git a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
index 2570427..ce3128d 100644
--- a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
+++ b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
@@ -88,10 +88,12 @@ public class SlashURLNormalizer implements URLNormalizer {
}
}
+ @Override
public Configuration getConf() {
return conf;
}
+ @Override
public void setConf(Configuration conf) {
this.conf = conf;
@@ -150,11 +152,8 @@ public class SlashURLNormalizer implements URLNormalizer {
}
}
+ @Override
public String normalize(String url, String scope) throws MalformedURLException {
- return normalize(url, null, scope);
- }
-
- public String normalize(String url, CrawlDatum crawlDatum, String scope) throws MalformedURLException {
// Get URL repr.
URL u = new URL(url);