You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2021/02/11 17:28:06 UTC
[nutch] branch master updated: NUTCH-2842 Fix Javadoc warnings,
errors and add Javadoc check to Github Action and Jenkins (#568)
This is an automated email from the ASF dual-hosted git repository.
lewismc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 64bf638 NUTCH-2842 Fix Javadoc warnings, errors and add Javadoc check to Github Action and Jenkins (#568)
64bf638 is described below
commit 64bf63811f741c97f5ee5516dd64da0087348a97
Author: Lewis John McGibbney <le...@gmail.com>
AuthorDate: Thu Feb 11 09:27:59 2021 -0800
NUTCH-2842 Fix Javadoc warnings, errors and add Javadoc check to Github Action and Jenkins (#568)
* NUTCH-2842 Fix Javadoc warnings and add Javadoc check to Github Action and Jenkins
---
build.xml | 16 +++--
default.properties | 6 +-
src/java/org/apache/nutch/crawl/CrawlDatum.java | 25 +++++--
src/java/org/apache/nutch/crawl/CrawlDbMerger.java | 4 +-
src/java/org/apache/nutch/crawl/FetchSchedule.java | 6 +-
.../apache/nutch/crawl/FetchScheduleFactory.java | 10 ++-
src/java/org/apache/nutch/crawl/Generator.java | 80 ++++++++++++++++++++--
src/java/org/apache/nutch/crawl/Inlink.java | 8 ++-
src/java/org/apache/nutch/crawl/Inlinks.java | 3 +-
src/java/org/apache/nutch/crawl/LinkDbMerger.java | 4 +-
.../org/apache/nutch/crawl/SignatureFactory.java | 9 ++-
src/java/org/apache/nutch/fetcher/FetchItem.java | 22 +++++-
.../org/apache/nutch/fetcher/FetchItemQueues.java | 2 +-
src/java/org/apache/nutch/fetcher/Fetcher.java | 6 +-
.../apache/nutch/fetcher/FetcherThreadEvent.java | 20 +++---
src/java/org/apache/nutch/hostdb/HostDatum.java | 5 +-
.../org/apache/nutch/hostdb/ResolverThread.java | 8 ++-
.../apache/nutch/hostdb/UpdateHostDbMapper.java | 16 ++---
.../apache/nutch/hostdb/UpdateHostDbReducer.java | 8 +--
src/java/org/apache/nutch/indexer/IndexWriter.java | 5 +-
.../org/apache/nutch/indexer/IndexWriters.java | 4 +-
.../org/apache/nutch/indexer/IndexerMapReduce.java | 2 +-
.../org/apache/nutch/indexer/IndexingFilter.java | 2 +-
.../org/apache/nutch/indexer/IndexingFilters.java | 14 +++-
.../org/apache/nutch/metadata/MetaWrapper.java | 36 +++++-----
.../org/apache/nutch/net/URLExemptionFilters.java | 9 ++-
src/java/org/apache/nutch/net/URLFilters.java | 8 ++-
.../apache/nutch/net/protocols/HttpDateFormat.java | 2 +
.../nutch/net/protocols/ProtocolLogUtil.java | 3 +
.../org/apache/nutch/net/protocols/Response.java | 26 +++++--
src/java/org/apache/nutch/parse/HTMLMetaTags.java | 30 +++++---
.../org/apache/nutch/parse/HtmlParseFilter.java | 8 +++
.../org/apache/nutch/parse/HtmlParseFilters.java | 12 +++-
src/java/org/apache/nutch/parse/Outlink.java | 9 ++-
.../org/apache/nutch/parse/OutlinkExtractor.java | 10 ++-
src/java/org/apache/nutch/parse/Parse.java | 11 ++-
src/java/org/apache/nutch/parse/ParseData.java | 35 +++++-----
src/java/org/apache/nutch/parse/ParseResult.java | 5 +-
src/java/org/apache/nutch/parse/ParseSegment.java | 4 +-
src/java/org/apache/nutch/parse/ParseStatus.java | 33 ++++++---
src/java/org/apache/nutch/parse/ParseUtil.java | 4 +-
src/java/org/apache/nutch/parse/ParserFactory.java | 2 +
src/java/org/apache/nutch/plugin/Extension.java | 14 ++--
.../org/apache/nutch/plugin/ExtensionPoint.java | 3 +-
src/java/org/apache/nutch/plugin/Plugin.java | 5 +-
.../org/apache/nutch/plugin/PluginClassLoader.java | 7 +-
.../org/apache/nutch/plugin/PluginDescriptor.java | 61 ++++++++++-------
.../apache/nutch/plugin/PluginManifestParser.java | 2 +
.../org/apache/nutch/plugin/PluginRepository.java | 25 ++++---
src/java/org/apache/nutch/protocol/Content.java | 26 ++++---
src/java/org/apache/nutch/protocol/Protocol.java | 5 +-
.../apache/nutch/protocol/RobotRulesParser.java | 2 +
.../org/apache/nutch/publisher/NutchPublisher.java | 6 +-
.../org/apache/nutch/scoring/ScoringFilter.java | 29 ++++++--
.../apache/nutch/scoring/webgraph/LinkDumper.java | 6 ++
.../apache/nutch/scoring/webgraph/LinkRank.java | 6 +-
.../apache/nutch/scoring/webgraph/NodeDumper.java | 28 +++++---
.../apache/nutch/scoring/webgraph/NodeReader.java | 3 +
.../nutch/scoring/webgraph/ScoreUpdater.java | 4 +-
.../apache/nutch/scoring/webgraph/WebGraph.java | 4 ++
.../org/apache/nutch/segment/SegmentChecker.java | 23 +++++++
.../apache/nutch/segment/SegmentMergeFilter.java | 9 ++-
.../apache/nutch/segment/SegmentMergeFilters.java | 9 ++-
.../org/apache/nutch/segment/SegmentMerger.java | 6 +-
src/java/org/apache/nutch/segment/SegmentPart.java | 6 +-
.../org/apache/nutch/segment/SegmentReader.java | 9 ++-
src/java/org/apache/nutch/service/JobManager.java | 2 +-
.../apache/nutch/service/impl/ConfManagerImpl.java | 2 +-
.../org/apache/nutch/service/impl/JobWorker.java | 7 +-
.../service/impl/NutchServerPoolExecutor.java | 20 +++---
.../nutch/service/model/request/JobConfig.java | 4 ++
.../nutch/service/model/response/JobInfo.java | 2 -
.../nutch/service/resources/AdminResource.java | 6 +-
.../nutch/service/resources/ConfigResource.java | 2 +-
.../nutch/service/resources/JobResource.java | 10 +--
.../nutch/service/resources/ReaderResouce.java | 28 ++++----
.../nutch/service/resources/SeedResource.java | 6 +-
.../apache/nutch/tools/CommonCrawlDataDumper.java | 10 ++-
.../org/apache/nutch/tools/CommonCrawlFormat.java | 34 +++++----
src/java/org/apache/nutch/tools/DmozParser.java | 15 ++++
src/java/org/apache/nutch/tools/FileDumper.java | 9 +--
src/java/org/apache/nutch/tools/ResolveUrls.java | 2 +
src/java/org/apache/nutch/tools/WARCUtils.java | 1 +
.../org/apache/nutch/tools/arc/ArcInputFormat.java | 5 +-
.../apache/nutch/tools/arc/ArcRecordReader.java | 5 +-
.../apache/nutch/tools/arc/ArcSegmentCreator.java | 16 ++---
src/java/org/apache/nutch/util/CommandRunner.java | 18 +----
src/java/org/apache/nutch/util/DeflateUtils.java | 10 ++-
src/java/org/apache/nutch/util/DomUtil.java | 16 +++--
.../org/apache/nutch/util/EncodingDetector.java | 2 +
src/java/org/apache/nutch/util/GZIPUtils.java | 10 ++-
src/java/org/apache/nutch/util/HadoopFSUtil.java | 11 ++-
src/java/org/apache/nutch/util/NodeWalker.java | 6 +-
.../org/apache/nutch/util/NutchConfiguration.java | 2 +
src/java/org/apache/nutch/util/NutchJob.java | 14 ++++
src/java/org/apache/nutch/util/NutchTool.java | 24 +++++--
.../org/apache/nutch/util/PrefixStringMatcher.java | 5 +-
src/java/org/apache/nutch/util/StringUtil.java | 32 ++++++---
.../org/apache/nutch/util/SuffixStringMatcher.java | 2 +
src/java/org/apache/nutch/util/TableUtil.java | 2 +-
src/java/org/apache/nutch/util/TimingUtil.java | 2 +
.../org/apache/nutch/util/TrieStringMatcher.java | 19 ++++-
src/java/org/apache/nutch/util/URLUtil.java | 53 ++++++++++----
.../apache/nutch/util/domain/DomainSuffixes.java | 7 +-
.../org/apache/nutch/webui/client/NutchClient.java | 4 +-
.../webui/pages/components/CpmIteratorAdapter.java | 1 -
.../apache/nutch/any23/Any23IndexingFilter.java | 4 +-
.../org/apache/nutch/any23/Any23ParseFilter.java | 4 +-
.../creativecommons/nutch/CCIndexingFilter.java | 2 +
.../org/creativecommons/nutch/CCParseFilter.java | 12 +++-
.../nutch/parse/headings/HeadingsParseFilter.java | 5 ++
.../nutch/indexer/geoip/GeoIPDocumentCreator.java | 15 +++-
.../nutch/indexer/metadata/MetadataIndexer.java | 5 +-
.../nutch/indexer/replace/FieldReplacer.java | 10 ++-
.../indexer/staticfield/StaticFieldIndexer.java | 2 +
.../cloudsearch/CloudSearchIndexWriter.java | 2 +-
.../indexwriter/cloudsearch/CloudSearchUtils.java | 19 ++---
.../nutch/indexwriter/csv/CSVIndexWriter.java | 2 +-
.../nutch/indexwriter/dummy/DummyIndexWriter.java | 2 +-
.../indexwriter/elastic/ElasticIndexWriter.java | 19 ++---
.../indexwriter/rabbit/RabbitIndexWriter.java | 2 +-
.../nutch/indexwriter/solr/SolrIndexWriter.java | 3 +-
.../apache/nutch/protocol/http/api/HttpBase.java | 11 +--
.../protocol/http/api/HttpRobotRulesParser.java | 6 +-
.../org/apache/nutch/rabbitmq/RabbitMQClient.java | 6 +-
.../nutch/urlfilter/api/RegexURLFilterBase.java | 46 ++++++-------
.../indexer/filter/MimeTypeIndexingFilter.java | 12 +---
.../org/apache/nutch/parse/html/DOMBuilder.java | 10 +++
.../apache/nutch/parse/html/DOMContentUtils.java | 24 ++++++-
.../apache/nutch/parse/html/HTMLMetaProcessor.java | 4 ++
.../org/apache/nutch/parse/js/JSParseFilter.java | 4 +-
.../java/org/apache/nutch/parse/swf/SWFParser.java | 4 +-
.../apache/nutch/parse/tika/DOMContentUtils.java | 19 ++++-
.../apache/nutch/parse/tika/HTMLMetaProcessor.java | 4 ++
.../apache/nutch/parse/zip/ZipTextExtractor.java | 5 +-
.../java/org/apache/nutch/protocol/file/File.java | 4 ++
.../apache/nutch/protocol/file/FileResponse.java | 28 +++++---
.../java/org/apache/nutch/protocol/ftp/Client.java | 78 +++++++++++----------
.../java/org/apache/nutch/protocol/ftp/Ftp.java | 34 +++++++--
.../org/apache/nutch/protocol/ftp/FtpResponse.java | 11 ++-
.../protocol/htmlunit/DummyX509TrustManager.java | 11 +++
.../org/apache/nutch/protocol/htmlunit/Http.java | 2 +-
.../nutch/protocol/htmlunit/HttpResponse.java | 11 +--
.../nutch/protocol/http/DummyX509TrustManager.java | 11 +++
.../java/org/apache/nutch/protocol/http/Http.java | 2 +-
.../apache/nutch/protocol/http/HttpResponse.java | 11 +--
.../protocol/httpclient/DummyX509TrustManager.java | 11 +++
.../org/apache/nutch/protocol/httpclient/Http.java | 2 +
.../httpclient/HttpBasicAuthentication.java | 4 ++
.../interactiveselenium/DummyX509TrustManager.java | 11 +++
.../handlers/DefaultClickAllAjaxLinksHandler.java | 2 +-
.../protocol/selenium/DummyX509TrustManager.java | 11 +++
.../org/apache/nutch/protocol/selenium/Http.java | 2 -
.../nutch/scoring/orphan/OrphanScoringFilter.java | 3 +-
.../nutch/scoring/similarity/cosine/Model.java | 1 +
.../similarity/util/LuceneAnalyzerUtil.java | 20 +++++-
.../scoring/similarity/util/LuceneTokenizer.java | 20 ++++--
.../apache/nutch/collection/CollectionManager.java | 15 ++--
.../org/apache/nutch/collection/Subcollection.java | 22 +++---
.../subcollection/SubcollectionIndexingFilter.java | 2 +-
.../nutch/urlfilter/suffix/SuffixURLFilter.java | 4 +-
.../net/urlnormalizer/ajax/AjaxURLNormalizer.java | 16 +++--
.../net/urlnormalizer/basic/package-info.java | 2 +-
.../urlnormalizer/regex/RegexURLNormalizer.java | 20 +++++-
164 files changed, 1317 insertions(+), 573 deletions(-)
diff --git a/build.xml b/build.xml
index 57ec4fa..ec003c3 100644
--- a/build.xml
+++ b/build.xml
@@ -185,9 +185,11 @@
windowtitle="${name} ${version} API"
doctitle="${name} ${version} API"
bottom="Copyright &copy; ${year} The Apache Software Foundation"
+ failonerror="true"
>
<arg value="${javadoc.proxy.host}"/>
<arg value="${javadoc.proxy.port}"/>
+ <arg value="--allow-script-in-comments"/>
<packageset dir="${src.dir}"/>
<packageset dir="${plugins.dir}/any23/src/java/" />
@@ -273,7 +275,7 @@
<classpath refid="classpath"/>
<classpath>
- <fileset dir="${plugins.dir}" >
+ <fileset dir="${build.plugins}" >
<include name="**/*.jar"/>
</fileset>
</classpath>
@@ -292,6 +294,7 @@
<group title="Indexer Plugins" packages="${plugins.indexer}"/>
<group title="Misc. Plugins" packages="${plugins.misc}"/>
</javadoc>
+
<jar jarfile="${maven-javadoc-jar}">
<fileset dir="${release.dir}/javadoc" />
</jar>
@@ -704,6 +707,7 @@
</condition>
</fail>
<mkdir dir="${build.javadoc}"/>
+ <mkdir dir="${build.javadoc}/resources"/>
<javadoc
overview="${src.dir}/overview.html"
destdir="${build.javadoc}"
@@ -713,9 +717,11 @@
windowtitle="${name} ${version} API"
doctitle="${name} ${version} API"
bottom="Copyright &copy; ${year} The Apache Software Foundation"
+ failonerror="true"
>
<arg value="${javadoc.proxy.host}"/>
<arg value="${javadoc.proxy.port}"/>
+ <arg value="--allow-script-in-comments"/>
<packageset dir="${src.dir}"/>
<packageset dir="${plugins.dir}/any23/src/java/" />
@@ -795,13 +801,13 @@
<link href="${javadoc.link.java}"/>
<link href="${javadoc.link.hadoop}"/>
- <link href="${javadoc.link.lucene.core}"/>
+ <!--link href="${javadoc.link.lucene.core}"/>
<link href="${javadoc.link.lucene.analyzers-common}"/>
- <link href="${javadoc.link.solr-solrj}"/>
+ <link href="${javadoc.link.solr-solrj}"/-->
<classpath refid="classpath"/>
<classpath>
- <fileset dir="${plugins.dir}" >
+ <fileset dir="${build.plugins}" >
<include name="**/*.jar"/>
</fileset>
</classpath>
@@ -829,7 +835,7 @@
<copy file="${conf.dir}/configuration.xsl" todir="${build.javadoc}/resources/"/>
</target>
- <!-- ================================================================== -->
+ <!-- ================================================================== -->
<!-- D I S T R I B U T I O N -->
<!-- ================================================================== -->
<!-- -->
diff --git a/default.properties b/default.properties
index 48bdb43..f250904 100644
--- a/default.properties
+++ b/default.properties
@@ -45,9 +45,9 @@ javadoc.proxy.host=-J-DproxyHost=
javadoc.proxy.port=-J-DproxyPort=
javadoc.link.java=https://docs.oracle.com/javase/8/docs/api/
javadoc.link.hadoop=https://hadoop.apache.org/docs/r3.1.3/api/
-javadoc.link.lucene.core=https://lucene.apache.org/core/8_5_1/core/
-javadoc.link.lucene.analyzers-common=https://lucene.apache.org/core/8_5_1/analyzers-common/
-javadoc.link.solr-solrj=https://lucene.apache.org/solr/8_5_1/solr-solrj/
+#javadoc.link.lucene.core=https://lucene.apache.org/core/8_4_1/core/
+#javadoc.link.lucene.analyzers-common=https://lucene.apache.org/core/8_4_1/analyzers-common/
+#javadoc.link.solr-solrj=https://lucene.apache.org/solr/8_4_1/solr-solrj/
javadoc.packages=org.apache.nutch.*
dist.dir=./dist
diff --git a/src/java/org/apache/nutch/crawl/CrawlDatum.java b/src/java/org/apache/nutch/crawl/CrawlDatum.java
index b18eda3..bf51eb0 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDatum.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDatum.java
@@ -199,8 +199,10 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
}
/**
- * Returns either the time of the last fetch, or the next fetch time,
- * depending on whether Fetcher or CrawlDbReducer set the time.
+ * Get the fetch time.
+ * @return long value indicating either the time of the last
+ * fetch, or the next fetch time, depending on whether Fetcher
+ * or CrawlDbReducer set the time.
*/
public long getFetchTime() {
return fetchTime;
@@ -209,6 +211,7 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
/**
* Sets either the time of the last fetch or the next fetch time, depending on
* whether Fetcher or CrawlDbReducer set the time.
+ * @param fetchTime the fetch time to set.
*/
public void setFetchTime(long fetchTime) {
this.fetchTime = fetchTime;
@@ -278,7 +281,9 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
}
/**
- * returns a MapWritable if it was set or read in @see readFields(DataInput),
+ * Get CrawlDatum metadata
+ * @see CrawlDatum#readFields(DataInput)
+ * @return a MapWritable if it was set or read in #readFields(DataInput),
* returns empty map in case CrawlDatum was freshly created (lazily
* instantiated).
*/
@@ -379,7 +384,10 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
}
}
- /** Copy the contents of another instance into this instance. */
+ /**
+ * Copy the contents of another instance into this instance.
+ * @param that an existing {@link CrawlDatum}
+ */
public void set(CrawlDatum that) {
this.status = that.status;
this.fetchTime = that.fetchTime;
@@ -400,7 +408,14 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
// compare methods
//
- /** Sort by decreasing score. */
+ /**
+ * Sort two {@link CrawlDatum} objects by decreasing score.
+ * @param that an existing {@link CrawlDatum}
+ * @return 1 if any one field (score, status, fetchTime, retries,
+ * fetchInterval or modifiedTime) of the new {@link CrawlDatum}
+ * minus the corresponding field of the existing {@link CrawlDatum}
+ * is greater than 0, otherwise return -1.
+ */
public int compareTo(CrawlDatum that) {
if (that.score != this.score)
return (that.score - this.score) > 0 ? 1 : -1;
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
index 6cf2809..6fef03a 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
@@ -188,7 +188,9 @@ public class CrawlDbMerger extends Configured implements Tool {
}
/**
- * @param args
+ * Run the tool.
+ * @param args job parameters
+ * @throws Exception if there is an issue executing this job
*/
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new CrawlDbMerger(),
diff --git a/src/java/org/apache/nutch/crawl/FetchSchedule.java b/src/java/org/apache/nutch/crawl/FetchSchedule.java
index 384c2d6..616ded8 100644
--- a/src/java/org/apache/nutch/crawl/FetchSchedule.java
+++ b/src/java/org/apache/nutch/crawl/FetchSchedule.java
@@ -114,7 +114,9 @@ public interface FetchSchedule extends Configurable {
*
* @param datum
* datum instance to be adjusted.
- *
+ * @param prevFetchTime previous fetch time.
+ * @param prevModifiedTime previous modified time.
+ * @param fetchTime current fetch time.
* @return adjusted page information, including all original information.
* NOTE: this may be a different instance than @see CrawlDatum, but
* implementations should make sure that it contains at least all
@@ -153,7 +155,7 @@ public interface FetchSchedule extends Configurable {
/**
* Calculates last fetch time of the given CrawlDatum.
- *
+ * @param datum page information.
* @return the date as a long.
*/
public long calculateLastFetchTime(CrawlDatum datum);
diff --git a/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java b/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java
index e07d771..9cad94c 100644
--- a/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java
+++ b/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java
@@ -30,9 +30,15 @@ public class FetchScheduleFactory {
.getLogger(MethodHandles.lookup().lookupClass());
private FetchScheduleFactory() {
- } // no public ctor
+ } // no public constructor
- /** Return the FetchSchedule implementation. */
+ /**
+ * Return the FetchSchedule implementation specified within
+ * the given {@link Configuration}, or {@link DefaultFetchSchedule}
+ * by default.
+ * @param conf a populated {@link Configuration} object
+ * @return a synchronized, static {@link FetchSchedule}
+ */
public synchronized static FetchSchedule getFetchSchedule(Configuration conf) {
String clazz = conf.get("db.fetch.schedule.class",
DefaultFetchSchedule.class.getName());
diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index c3f4469..dcba9bf 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -35,6 +35,7 @@ import org.apache.hadoop.conf.Configurable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.commons.jexl3.JexlExpression;
+import org.antlr.v4.parse.ANTLRParser.throwsSpec_return;
import org.apache.commons.jexl3.JexlContext;
import org.apache.commons.jexl3.MapContext;
import org.apache.hadoop.mapreduce.Counter;
@@ -708,9 +709,34 @@ public class Generator extends NutchTool implements Tool {
}
/**
- * old signature used for compatibility - does not specify whether or not to
+ * This is an old signature used for compatibility - does not specify whether or not to
* normalise and set the number of segments to 1
+ * @param dbDir
+ * Crawl database directory
+ * @param segments
+ * Segments directory
+ * @param numLists
+ * Number of reduce tasks
+ * @param topN
+ * Number of top URLs to be selected
+ * @param curTime
+ * Current time in milliseconds
+ * @param filter whether to apply filtering operation
+ * @param force if true, and the target lockfile exists, consider it valid. If false
+ * and the target file exists, throw an IOException.
+ * @deprecated since 1.19 use
+ * {@link #generate(Path, Path, int, long, long, boolean, boolean, boolean, int, String, String)}
+ * or {@link #generate(Path, Path, int, long, long, boolean, boolean, boolean, int, String)}
+ * in the instance that no hostdb is available
+ * @throws IOException if an I/O exception occurs.
+ * @see LockUtil#createLockFile(Configuration, Path, boolean)
+ * @throws InterruptedException if a thread is waiting, sleeping, or
+ * otherwise occupied, and the thread is interrupted, either before or
+ * during the activity.
+ * @throws ClassNotFoundException if runtime class(es) are not available
+ * @return Path to generated segment or null if no entries were selected
**/
+ @Deprecated
public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
long curTime, boolean filter, boolean force)
throws IOException, InterruptedException, ClassNotFoundException {
@@ -718,6 +744,37 @@ public class Generator extends NutchTool implements Tool {
force, 1, null);
}
+ /**
+ * This signature should be used in the instance that no hostdb is available.
+ * Generate fetchlists in one or more segments. Whether to filter URLs or not
+ * is read from the "generate.filter" property set for the job from
+ * command-line. If the property is not found, the URLs are filtered. Same for
+ * the normalisation.
+ * @param dbDir
+ * Crawl database directory
+ * @param segments
+ * Segments directory
+ * @param numLists
+ * Number of reduce tasks
+ * @param topN
+ * Number of top URLs to be selected
+ * @param curTime
+ * Current time in milliseconds
+ * @param filter whether to apply filtering operation
+ * @param norm whether to apply normalization operation
+ * @param force if true, and the target lockfile exists, consider it valid. If false
+ * and the target file exists, throw an IOException.
+ * @param maxNumSegments maximum number of segments to generate
+ * @param expr a Jexl expression to use in the Generator job.
+ * @see JexlUtil#parseExpression(String)
+ * @throws IOException if an I/O exception occurs.
+ * @see LockUtil#createLockFile(Configuration, Path, boolean)
+ * @throws InterruptedException if a thread is waiting, sleeping, or
+ * otherwise occupied, and the thread is interrupted, either before or
+ * during the activity.
+ * @throws ClassNotFoundException if runtime class(es) are not available
+ * @return Path to generated segment or null if no entries were selected
+ **/
public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
long curTime, boolean filter, boolean norm, boolean force,
int maxNumSegments, String expr)
@@ -742,11 +799,22 @@ public class Generator extends NutchTool implements Tool {
* Number of top URLs to be selected
* @param curTime
* Current time in milliseconds
- *
+ * @param filter whether to apply filtering operation
+ * @param norm whether to apply normalization operation
+ * @param force if true, and the target lockfile exists, consider it valid. If false
+ * and the target file exists, throw an IOException.
+ * @param maxNumSegments maximum number of segments to generate
+ * @param expr a Jexl expression to use in the Generator job.
+ * @param hostdb name of a hostdb from which to execute Jexl expressions in a bid
+ * to determine the maximum URL count and/or fetch delay per host.
+ * @see JexlUtil#parseExpression(String)
+ * @throws IOException if an I/O exception occurs.
+ * @see LockUtil#createLockFile(Configuration, Path, boolean)
+ * @throws InterruptedException if a thread is waiting, sleeping, or
+ * otherwise occupied, and the thread is interrupted, either before or
+ * during the activity.
+ * @throws ClassNotFoundException if runtime class(es) are not available
* @return Path to generated segment or null if no entries were selected
- *
- * @throws IOException
- * When an I/O error occurs
*/
public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
long curTime, boolean filter, boolean norm, boolean force,
@@ -993,6 +1061,8 @@ public class Generator extends NutchTool implements Tool {
/**
* Generate a fetchlist from the crawldb.
+ * @param args array of arguments for this job
+ * @throws Exception if there is an error running the job
*/
public static void main(String args[]) throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new Generator(),
diff --git a/src/java/org/apache/nutch/crawl/Inlink.java b/src/java/org/apache/nutch/crawl/Inlink.java
index 6010d07..d303882 100644
--- a/src/java/org/apache/nutch/crawl/Inlink.java
+++ b/src/java/org/apache/nutch/crawl/Inlink.java
@@ -23,7 +23,7 @@ import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
-/* An incoming link to a page. */
+/** An incoming link to a page. */
public class Inlink implements Writable {
private String fromUrl;
@@ -42,7 +42,11 @@ public class Inlink implements Writable {
anchor = Text.readString(in);
}
- /** Skips over one Inlink in the input. */
+ /**
+ * Skips over one Inlink in the input.
+ * @param in the tuple containing the fromUrl and anchor data
+ * @throws IOException if there is an error reading the Inlink tuple
+ */
public static void skip(DataInput in) throws IOException {
Text.skip(in); // skip fromUrl
Text.skip(in); // skip anchor
diff --git a/src/java/org/apache/nutch/crawl/Inlinks.java b/src/java/org/apache/nutch/crawl/Inlinks.java
index 40ac6e2..ae84138 100644
--- a/src/java/org/apache/nutch/crawl/Inlinks.java
+++ b/src/java/org/apache/nutch/crawl/Inlinks.java
@@ -82,8 +82,9 @@ public class Inlinks implements Writable {
}
/**
- * Return the set of anchor texts. Only a single anchor with a given text is
+ * Get all anchor texts. Only a single anchor with a given text is
* permitted from a given domain.
+ * @return the set of anchor texts.
*/
public String[] getAnchors() {
HashMap<String, Set<String>> domainToAnchors = new HashMap<>();
diff --git a/src/java/org/apache/nutch/crawl/LinkDbMerger.java b/src/java/org/apache/nutch/crawl/LinkDbMerger.java
index 059dbcd..6b93329 100644
--- a/src/java/org/apache/nutch/crawl/LinkDbMerger.java
+++ b/src/java/org/apache/nutch/crawl/LinkDbMerger.java
@@ -174,7 +174,9 @@ public class LinkDbMerger extends Configured implements Tool {
}
/**
- * @param args
+ * Run the job
+ * @param args input arguments for the job
+ * @throws Exception if there is an error running the job
*/
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new LinkDbMerger(),
diff --git a/src/java/org/apache/nutch/crawl/SignatureFactory.java b/src/java/org/apache/nutch/crawl/SignatureFactory.java
index e605ec5..6037603 100644
--- a/src/java/org/apache/nutch/crawl/SignatureFactory.java
+++ b/src/java/org/apache/nutch/crawl/SignatureFactory.java
@@ -37,9 +37,14 @@ public class SignatureFactory {
.getLogger(MethodHandles.lookup().lookupClass());
private SignatureFactory() {
- } // no public ctor
+ } // no public constructor
- /** Return the default Signature implementation. */
+ /**
+ * Return the {@link Signature} implementation for a given
+ * {@link Configuration}, or {@link MD5Signature} by default.
+ * @param conf a populated {@link Configuration}
+ * @return the {@link Signature} implementation
+ */
public synchronized static Signature getSignature(Configuration conf) {
String clazz = conf.get("db.signature.class", MD5Signature.class.getName());
ObjectCache objectCache = ObjectCache.get(conf);
diff --git a/src/java/org/apache/nutch/fetcher/FetchItem.java b/src/java/org/apache/nutch/fetcher/FetchItem.java
index f56ed25..b3b2bc2 100644
--- a/src/java/org/apache/nutch/fetcher/FetchItem.java
+++ b/src/java/org/apache/nutch/fetcher/FetchItem.java
@@ -58,12 +58,32 @@ public class FetchItem {
/**
* Create an item. Queue id will be created based on <code>queueMode</code>
* argument, either as a protocol + hostname pair, protocol + IP address
- * pair or protocol+domain pair.
+ * pair or protocol+domain pair. Sets outlink depth to 0.
+ * @param url URL of fetch item
+ * @param datum webpage information associated with the URL
+ * @param queueMode either byHost, byDomain or byIP.
+ * @see FetchItemQueues#QUEUE_MODE_DOMAIN
+ * @see FetchItemQueues#QUEUE_MODE_HOST
+ * @see FetchItemQueues#QUEUE_MODE_IP
+ * @return a {@link FetchItem} with outlinks depth of 0
*/
public static FetchItem create(Text url, CrawlDatum datum, String queueMode) {
return create(url, datum, queueMode, 0);
}
+ /**
+ * Create an item. Queue id will be created based on <code>queueMode</code>
+ * argument, either as a protocol + hostname pair, protocol + IP address
+ * pair or protocol+domain pair. Configurable outlink depth.
+ * @param url URL of fetch item
+ * @param datum webpage information associated with the URL
+ * @param queueMode either byHost, byDomain or byIP
+ * @param outlinkDepth the desired depth of outlink for this given FetchItem
+ * @see FetchItemQueues#QUEUE_MODE_DOMAIN
+ * @see FetchItemQueues#QUEUE_MODE_HOST
+ * @see FetchItemQueues#QUEUE_MODE_IP
+ * @return a {@link FetchItem}
+ */
public static FetchItem create(Text url, CrawlDatum datum,
String queueMode, int outlinkDepth) {
URL u = null;
diff --git a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
index ce7b2b6..00a0784 100644
--- a/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
+++ b/src/java/org/apache/nutch/fetcher/FetchItemQueues.java
@@ -235,7 +235,7 @@ public class FetchItemQueues {
* Increment the exception counter of a queue in case of an exception e.g.
* timeout; when higher than a given threshold simply empty the queue.
*
- * @param queueid
+ * @param queueid a queue identifier to locate and check
* @return number of purged items
*/
public synchronized int checkExceptionThreshold(String queueid) {
diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java
index 568bf8e..18d0fd5 100644
--- a/src/java/org/apache/nutch/fetcher/Fetcher.java
+++ b/src/java/org/apache/nutch/fetcher/Fetcher.java
@@ -513,7 +513,11 @@ public class Fetcher extends NutchTool implements Tool {
TimingUtil.logDateMillis(end), TimingUtil.elapsedTime(start, end));
}
- /** Run the fetcher. */
+ /**
+ * Run the fetcher.
+ * @param args input parameters for the job
+ * @throws Exception if a fatal error arises whilst running the job
+ */
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new Fetcher(), args);
System.exit(res);
diff --git a/src/java/org/apache/nutch/fetcher/FetcherThreadEvent.java b/src/java/org/apache/nutch/fetcher/FetcherThreadEvent.java
index 6c175c8..c5eeafc 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherThreadEvent.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherThreadEvent.java
@@ -26,10 +26,10 @@ import org.apache.nutch.parse.Outlink;
/**
* This class is used to capture the various events occurring
- * at fetch time. These events are sent to a queue implementing the publisher
- *
+ * at fetch time. These events are sent to a
+ * {@link org.apache.nutch.publisher.NutchPublisher} implementation.
*/
-public class FetcherThreadEvent implements Serializable{
+public class FetcherThreadEvent implements Serializable {
/** Type of event to specify start, end or reporting of a fetch item. **/
public static enum PublishEventType {START, END, REPORT}
@@ -41,8 +41,8 @@ public class FetcherThreadEvent implements Serializable{
/**
* Constructor to create an event to be published
- * @param eventType Type of {@link #eventType event} being created
- * @param url URL of the fetched page to which this event belongs to
+ * @param eventType Type of {@link #eventType event} being created
+ * @param url URL of the fetched page to which this event belongs to
*/
public FetcherThreadEvent(PublishEventType eventType, String url) {
this.eventType = eventType;
@@ -60,7 +60,7 @@ public class FetcherThreadEvent implements Serializable{
/**
* Set event type of this object
- * @param eventType Set {@link #eventType event} type
+ * @param eventType Set {@link #eventType event} type
*/
public void setEventType(PublishEventType eventType) {
this.eventType = eventType;
@@ -68,16 +68,16 @@ public class FetcherThreadEvent implements Serializable{
/**
* Get event data
- * @return
+ * @return a Map of event data
*/
public Map<String, Object> getEventData() {
return eventData;
}
/**
* Set metadata for this event
- * @param eventData A map containing important information relevant
- * to this event (fetched page).
- * Ex - score, title, outlinks, content-type, etc
+ * @param eventData A map containing important information relevant
+ * to this event (fetched page).
+ * Example - score, title, outlinks, content-type, etc
*/
public void setEventData(Map<String, Object> eventData) {
this.eventData = eventData;
diff --git a/src/java/org/apache/nutch/hostdb/HostDatum.java b/src/java/org/apache/nutch/hostdb/HostDatum.java
index 2bc9244..3698be0 100644
--- a/src/java/org/apache/nutch/hostdb/HostDatum.java
+++ b/src/java/org/apache/nutch/hostdb/HostDatum.java
@@ -213,8 +213,9 @@ public class HostDatum implements Writable, Cloneable {
}
/**
- * returns a MapWritable if it was set or read in @see readFields(DataInput),
- * returns empty map in case CrawlDatum was freshly created (lazily instantiated).
+ * Get Host metadata.
+ * @return a {@link MapWritable} if it was set or read in {@link #readFields(DataInput)},
+ * OR returns empty map in case {@link HostDatum} was freshly created (lazily instantiated).
*/
public org.apache.hadoop.io.MapWritable getMetaData() {
if (this.metaData == null) this.metaData = new org.apache.hadoop.io.MapWritable();
diff --git a/src/java/org/apache/nutch/hostdb/ResolverThread.java b/src/java/org/apache/nutch/hostdb/ResolverThread.java
index 564e5da..cf749f4 100644
--- a/src/java/org/apache/nutch/hostdb/ResolverThread.java
+++ b/src/java/org/apache/nutch/hostdb/ResolverThread.java
@@ -42,7 +42,13 @@ public class ResolverThread implements Runnable {
protected int purgeFailedHostsThreshold;
/**
- * Constructor.
+ * Overloaded constructor.
+ * @param host name of the host to lookup
+ * @param datum accompanying host information
+ * @param context {@link org.apache.hadoop.mapreduce.Reducer.Context} for
+ * writing custom counters and output.
+ * @param purgeFailedHostsThreshold int value which marks the maximum failed
+ * DNS lookups before a given host is purged from the hostdb
*/
public ResolverThread(String host, HostDatum datum,
Context context, int purgeFailedHostsThreshold) {
diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
index 9657621..f642850 100644
--- a/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
+++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbMapper.java
@@ -72,15 +72,12 @@ public class UpdateHostDbMapper
}
/**
- * Filters and or normalizes the input URL
+ * Filters and/or normalizes the input hostname
*
- * @param url
- * @return String
+ * @param url the input hostname
+ * @return the processed hostname, or null if there was a fatal error
*/
protected String filterNormalize(String url) {
- // We actually receive a hostname here so let's make a URL
- // TODO: we force shop.fcgroningen to be https, how do we know that here?
- // http://issues.openindex.io/browse/SPIDER-40
url = "http://" + url + "/";
try {
@@ -102,9 +99,10 @@ public class UpdateHostDbMapper
* Mapper ingesting records from the HostDB, CrawlDB and plaintext host
* scores file. Statistics and scores are passed on.
*
- * @param key
- * @param value
- * @param context
+ * @param key record {@link org.apache.hadoop.io.Text} key
+ * @param value associated {@link org.apache.hadoop.io.Writable} object
+ * @param context {@link org.apache.hadoop.mapreduce.Reducer.Context} for
+ * writing custom counters and output.
*/
@Override
public void map(Text key, Writable value,
diff --git a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
index f473848..a5dd11e 100644
--- a/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
+++ b/src/java/org/apache/nutch/hostdb/UpdateHostDbReducer.java
@@ -359,8 +359,8 @@ public class UpdateHostDbReducer
/**
* Determines whether a record should be checked.
*
- * @param datum
- * @return boolean
+ * @param datum a {@link HostDatum} to check for eligibility
+ * @return true if it should be checked, false otherwise
*/
protected boolean shouldCheck(HostDatum datum) {
// Whether a new record is to be checked
@@ -385,8 +385,8 @@ public class UpdateHostDbReducer
/**
* Determines whether a record is eligible for recheck.
*
- * @param datum
- * @return boolean
+ * @param datum a {@link HostDatum} to check for eligibility
+ * @return true if eligible for recheck, false otherwise
*/
protected boolean isEligibleForCheck(HostDatum datum) {
// Whether an existing host, known or unknown, if forced to be rechecked
diff --git a/src/java/org/apache/nutch/indexer/IndexWriter.java b/src/java/org/apache/nutch/indexer/IndexWriter.java
index 7866159..43d4a48 100644
--- a/src/java/org/apache/nutch/indexer/IndexWriter.java
+++ b/src/java/org/apache/nutch/indexer/IndexWriter.java
@@ -31,6 +31,9 @@ public interface IndexWriter extends Pluggable, Configurable {
final static String X_POINT_ID = IndexWriter.class.getName();
/**
+ * @param conf Nutch configuration
+ * @param name target name of the {@link IndexWriter} to be opened
+ * @throws IOException Some exception thrown by some writer.
* @deprecated use {@link #open(IndexWriterParams)}} instead.
*/
@Deprecated
@@ -57,7 +60,7 @@ public interface IndexWriter extends Pluggable, Configurable {
/**
* Returns {@link Map} with the specific parameters the IndexWriter instance can take.
*
- * @return The values of each row. It must have the form <KEY,<DESCRIPTION,VALUE>>.
+ * @return The values of each row. It must have the form <KEY,<DESCRIPTION,VALUE>>.
*/
Map<String, Map.Entry<String, Object>> describe();
}
diff --git a/src/java/org/apache/nutch/indexer/IndexWriters.java b/src/java/org/apache/nutch/indexer/IndexWriters.java
index 04cc70a..a8ab0ec 100644
--- a/src/java/org/apache/nutch/indexer/IndexWriters.java
+++ b/src/java/org/apache/nutch/indexer/IndexWriters.java
@@ -204,8 +204,8 @@ public class IndexWriters {
/**
* Initializes the internal variables of index writers.
*
- * @param conf Nutch configuration.
- * @param name
+ * @param conf Nutch configuration
+ * @param name target name of the {@link IndexWriter} to be opened
* @throws IOException Some exception thrown by some writer.
*/
public void open(Configuration conf, String name) throws IOException {
diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
index 42093b7..1b8ff52 100644
--- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
+++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
@@ -64,7 +64,7 @@ import org.apache.nutch.scoring.ScoringFilters;
* </p>
* <p>
* See
- * {@link org.apache.nutch.indexer.IndexerMapReduce#initMRJob(Path, Path, Collection, JobConf, boolean)}
+ * {@link #initMRJob(Path, Path, Collection, Job, boolean)}
* for details on the specific data structures and parameters required for
* indexing.
* </p>
diff --git a/src/java/org/apache/nutch/indexer/IndexingFilter.java b/src/java/org/apache/nutch/indexer/IndexingFilter.java
index 2494167..4774aab 100644
--- a/src/java/org/apache/nutch/indexer/IndexingFilter.java
+++ b/src/java/org/apache/nutch/indexer/IndexingFilter.java
@@ -51,7 +51,7 @@ public interface IndexingFilter extends Pluggable, Configurable {
* page inlinks
* @return modified (or a new) document instance, or null (meaning the
* document should be discarded)
- * @throws IndexingException
+ * @throws IndexingException if an error occurs during filtering
*/
NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException;
diff --git a/src/java/org/apache/nutch/indexer/IndexingFilters.java b/src/java/org/apache/nutch/indexer/IndexingFilters.java
index 8985297..87a2929 100644
--- a/src/java/org/apache/nutch/indexer/IndexingFilters.java
+++ b/src/java/org/apache/nutch/indexer/IndexingFilters.java
@@ -36,7 +36,18 @@ public class IndexingFilters {
INDEXINGFILTER_ORDER);
}
- /** Run all defined filters. */
+ /**
+ * Run all defined filters. Note, may return null if the document
+ * was filtered
+ * @see IndexingFilter#filter(NutchDocument, Parse, Text, CrawlDatum, Inlinks)
+ * @param doc the {@link NutchDocument} to process with filters
+ * @param parse corresponding {@link Parse} object for the document
+ * @param url corresponding {@link org.apache.hadoop.io.Text} url for the document
+ * @param datum corresponding {@link CrawlDatum} for the document
+ * @param inlinks corresponding {@link Inlinks} for the document
+ * @return the {@link NutchDocument}, or null if the document was filtered
+ * @throws IndexingException if an error occurs within a filter
+ */
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
for (int i = 0; i < this.indexingFilters.length; i++) {
@@ -45,7 +56,6 @@ public class IndexingFilters {
if (doc == null)
return null;
}
-
return doc;
}
diff --git a/src/java/org/apache/nutch/metadata/MetaWrapper.java b/src/java/org/apache/nutch/metadata/MetaWrapper.java
index 0fe72c9..a58253c 100644
--- a/src/java/org/apache/nutch/metadata/MetaWrapper.java
+++ b/src/java/org/apache/nutch/metadata/MetaWrapper.java
@@ -56,39 +56,36 @@ public class MetaWrapper extends NutchWritable {
/**
* Get all metadata.
+ * @return a populated {@link Metadata} object
*/
public Metadata getMetadata() {
return metadata;
}
/**
- * Add metadata. See {@link Metadata#add(String, String)} for more
- * information.
- *
- * @param name
- * metadata name
- * @param value
- * metadata value
+ * Add metadata.
+ * @see Metadata#add(String, String)
+ * @param name metadata name to add
+ * @param value metadata value to add
*/
public void addMeta(String name, String value) {
metadata.add(name, value);
}
/**
- * Set metadata. See {@link Metadata#set(String, String)} for more
- * information.
- *
- * @param name
- * @param value
+ * Set metadata.
+ * @see Metadata#set(String, String)
+ * @param name metadata key to set
+ * @param value metadata value to set
*/
public void setMeta(String name, String value) {
metadata.set(name, value);
}
/**
- * Get metadata. See {@link Metadata#get(String)} for more information.
- *
- * @param name
+ * Get metadata value for a given key.
+ * @see Metadata#getValues(String)
+ * @param name key to retrieve a value for
* @return metadata value
*/
public String getMeta(String name) {
@@ -96,11 +93,10 @@ public class MetaWrapper extends NutchWritable {
}
/**
- * Get multiple metadata. See {@link Metadata#getValues(String)} for more
- * information.
- *
- * @param name
- * @return multiple values
+ * Get multiple metadata values for a given key.
+ * @see Metadata#getValues(String)
+ * @param name key to retrieve values for
+ * @return a string array containing metadata values
*/
public String[] getMetaValues(String name) {
return metadata.getValues(name);
diff --git a/src/java/org/apache/nutch/net/URLExemptionFilters.java b/src/java/org/apache/nutch/net/URLExemptionFilters.java
index c61f43f..c730228 100644
--- a/src/java/org/apache/nutch/net/URLExemptionFilters.java
+++ b/src/java/org/apache/nutch/net/URLExemptionFilters.java
@@ -48,8 +48,13 @@ public class URLExemptionFilters {
URLExemptionFilter.X_POINT_ID);
}
-
- /** Run all defined filters. Assume logical AND. */
+ /**
+ * Run all defined filters. Assume logical AND.
+ * A URL is exempted when all the filters accept it to pass through.
+ * @param fromUrl the source url which generated the outlink
+ * @param toUrl the destination url which needs to be checked for exemption
+ * @return true if exempted, false otherwise
+ */
public boolean isExempted(String fromUrl, String toUrl) {
if (filters.length < 1) {
//at least one filter should be on
diff --git a/src/java/org/apache/nutch/net/URLFilters.java b/src/java/org/apache/nutch/net/URLFilters.java
index f8f8186..ed58650 100644
--- a/src/java/org/apache/nutch/net/URLFilters.java
+++ b/src/java/org/apache/nutch/net/URLFilters.java
@@ -34,7 +34,13 @@ public class URLFilters {
return this.filters;
}
- /** Run all defined filters. Assume logical AND. */
+ /**
+ * Run all defined filters. Assume logical AND.
+ * @param urlString to execute filters on
+ * @return filtered result
+ * @throws URLFilterException if there is an issue executing
+ * any URLFilter implementations.
+ */
public String filter(String urlString) throws URLFilterException {
for (int i = 0; i < this.filters.length; i++) {
if (urlString == null)
diff --git a/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java b/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java
index 14447b5..b000690 100644
--- a/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java
+++ b/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java
@@ -59,6 +59,8 @@ public class HttpDateFormat {
/**
* Get the HTTP format of the specified date.
+ * @param date a {@link java.util.Date} for conversion
+ * @return the String HTTP representation of the date
*/
public static String toString(Date date) {
return FORMAT.format(date.toInstant());
diff --git a/src/java/org/apache/nutch/net/protocols/ProtocolLogUtil.java b/src/java/org/apache/nutch/net/protocols/ProtocolLogUtil.java
index 28d8894..967a2ac 100644
--- a/src/java/org/apache/nutch/net/protocols/ProtocolLogUtil.java
+++ b/src/java/org/apache/nutch/net/protocols/ProtocolLogUtil.java
@@ -72,6 +72,9 @@ public class ProtocolLogUtil implements Configurable {
* without stack trace, usually done for frequent exceptions with obvious
* reasons (e.g., UnknownHostException), configurable by
* <code>http.log.exceptions.suppress.stack</code>
+ * @param t a {@link java.lang.Throwable} implementation associated with protocol activity
+ * @return true if exception is configured to be logged as short message
+ * without stack trace
*/
public boolean logShort(Throwable t) {
if (exceptionsLogShort.contains(t.getClass())) {
diff --git a/src/java/org/apache/nutch/net/protocols/Response.java b/src/java/org/apache/nutch/net/protocols/Response.java
index 16dd698..0159358 100644
--- a/src/java/org/apache/nutch/net/protocols/Response.java
+++ b/src/java/org/apache/nutch/net/protocols/Response.java
@@ -72,19 +72,35 @@ public interface Response extends HttpHeaders {
UNSPECIFIED
};
- /** Returns the URL used to retrieve this response. */
+ /**
+ * Get the URL used to retrieve this response.
+ * @return {@link java.net.URL}
+ */
public URL getUrl();
- /** Returns the response code. */
+ /**
+ * Get the response code.
+ * @return protocol response code (int)
+ * */
public int getCode();
- /** Returns the value of a named header. */
+ /**
+ * Get the value of a named header.
+ * @param name key of the header you wish to retrieve
+ * @return header value
+ */
public String getHeader(String name);
- /** Returns all the headers. */
+ /**
+ * Get all the headers.
+ * @return populated headers {@link org.apache.nutch.metadata.Metadata}
+ */
public Metadata getHeaders();
- /** Returns the full content of the response. */
+ /**
+ * Get the full content of the response.
+ * @return a byte array representing the response content
+ */
public byte[] getContent();
}
diff --git a/src/java/org/apache/nutch/parse/HTMLMetaTags.java b/src/java/org/apache/nutch/parse/HTMLMetaTags.java
index 23a9339..4e20a5f 100644
--- a/src/java/org/apache/nutch/parse/HTMLMetaTags.java
+++ b/src/java/org/apache/nutch/parse/HTMLMetaTags.java
@@ -104,6 +104,7 @@ public class HTMLMetaTags {
/**
* Sets <code>refresh</code> to the supplied value.
+ * @param refresh value to set
*/
public void setRefresh(boolean refresh) {
this.refresh = refresh;
@@ -111,6 +112,7 @@ public class HTMLMetaTags {
/**
* Sets the <code>baseHref</code>.
+ * @param baseHref value to set
*/
public void setBaseHref(URL baseHref) {
this.baseHref = baseHref;
@@ -118,6 +120,7 @@ public class HTMLMetaTags {
/**
* Sets the <code>refreshHref</code>.
+ * @param refreshHref value to set
*/
public void setRefreshHref(URL refreshHref) {
this.refreshHref = refreshHref;
@@ -125,41 +128,46 @@ public class HTMLMetaTags {
/**
* Sets the <code>refreshTime</code>.
+ * @param refreshTime value to set
*/
public void setRefreshTime(int refreshTime) {
this.refreshTime = refreshTime;
}
/**
- * A convenience method. Returns the current value of <code>noIndex</code>.
+ * Get the current value of <code>noIndex</code>.
+ * @return true if no index is desired, false otherwise
*/
public boolean getNoIndex() {
return noIndex;
}
/**
- * A convenience method. Returns the current value of <code>noFollow</code>.
+ * Get the current value of <code>noFollow</code>.
+ * @return true if no follow is desired, false otherwise
*/
public boolean getNoFollow() {
return noFollow;
}
/**
- * A convenience method. Returns the current value of <code>noCache</code>.
+ * Get the current value of <code>noCache</code>.
+ * @return true if no cache is desired, false otherwise
*/
public boolean getNoCache() {
return noCache;
}
/**
- * A convenience method. Returns the current value of <code>refresh</code>.
+ * Get the current value of <code>refresh</code>.
+ * @return true if refresh is desired, false otherwise
*/
public boolean getRefresh() {
return refresh;
}
/**
- * A convenience method. Returns the <code>baseHref</code>, if set, or
+ * @return the <code>baseHref</code>, if set, or
* <code>null</code> otherwise.
*/
public URL getBaseHref() {
@@ -167,17 +175,17 @@ public class HTMLMetaTags {
}
/**
- * A convenience method. Returns the <code>refreshHref</code>, if set, or
+ * @return the <code>refreshHref</code>, if set, or
* <code>null</code> otherwise. The value may be invalid if
- * {@link #getRefresh()}returns <code>false</code>.
+ * {@link #getRefresh()} returns <code>false</code>.
*/
public URL getRefreshHref() {
return refreshHref;
}
/**
- * A convenience method. Returns the current value of <code>refreshTime</code>
- * . The value may be invalid if {@link #getRefresh()}returns
+ * @return the current value of <code>refreshTime</code>
+ * . The value may be invalid if {@link #getRefresh()} returns
* <code>false</code>.
*/
public int getRefreshTime() {
@@ -185,7 +193,7 @@ public class HTMLMetaTags {
}
/**
- * Returns all collected values of the general meta tags. Property names are
+ * @return all collected values of the general meta tags. Property names are
* tag names, property values are "content" values.
*/
public Metadata getGeneralTags() {
@@ -193,7 +201,7 @@ public class HTMLMetaTags {
}
/**
- * Returns all collected values of the "http-equiv" meta tags. Property names
+ * @return all collected values of the "http-equiv" meta tags. Property names
* are tag names, property values are "content" values.
*/
public Properties getHttpEquivTags() {
diff --git a/src/java/org/apache/nutch/parse/HtmlParseFilter.java b/src/java/org/apache/nutch/parse/HtmlParseFilter.java
index cc64c8e..cf03aa3 100644
--- a/src/java/org/apache/nutch/parse/HtmlParseFilter.java
+++ b/src/java/org/apache/nutch/parse/HtmlParseFilter.java
@@ -35,6 +35,14 @@ public interface HtmlParseFilter extends Pluggable, Configurable {
/**
* Adds metadata or otherwise modifies a parse of HTML content, given the DOM
* tree of a page.
+ * @param content the {@link Content} for a given response
+ * @param parseResult the result of running one or more
+ * {@link Parser}'s on the content.
+ * @see Parser#getParse(Content)
+ * @param metaTags a populated {@link HTMLMetaTags} object
+ * @param doc a {@link DocumentFragment} (DOM) which can be processed in
+ * the filtering process.
+ * @return a filtered {@link ParseResult}
*/
ParseResult filter(Content content, ParseResult parseResult,
HTMLMetaTags metaTags, DocumentFragment doc);
diff --git a/src/java/org/apache/nutch/parse/HtmlParseFilters.java b/src/java/org/apache/nutch/parse/HtmlParseFilters.java
index 95e23fd..30ddd21 100644
--- a/src/java/org/apache/nutch/parse/HtmlParseFilters.java
+++ b/src/java/org/apache/nutch/parse/HtmlParseFilters.java
@@ -35,7 +35,17 @@ public class HtmlParseFilters {
HTMLPARSEFILTER_ORDER);
}
- /** Run all defined filters. */
+ /**
+ * Run all defined filters.
+ * @param content the {@link Content} for a given response
+ * @param parseResult the result of running one or more
+ * {@link Parser}'s on the content.
+ * @see Parser#getParse(Content)
+ * @param metaTags a populated {@link HTMLMetaTags} object
+ * @param doc a {@link DocumentFragment} (DOM) which can be processed in
+ * the filtering process.
+ * @return a filtered {@link ParseResult}
+ */
public ParseResult filter(Content content, ParseResult parseResult,
HTMLMetaTags metaTags, DocumentFragment doc) {
diff --git a/src/java/org/apache/nutch/parse/Outlink.java b/src/java/org/apache/nutch/parse/Outlink.java
index 71e53ab..6261346 100644
--- a/src/java/org/apache/nutch/parse/Outlink.java
+++ b/src/java/org/apache/nutch/parse/Outlink.java
@@ -26,7 +26,7 @@ import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
-/* An outgoing link from a page. */
+/** An outgoing link from a page. */
public class Outlink implements Writable {
private String toUrl;
@@ -55,7 +55,12 @@ public class Outlink implements Writable {
md = null;
}
- /** Skips over one Outlink in the input. */
+ /**
+ * Skips over one Outlink in the input.
+ * @param in the {@link DataInput} tuple stream holding the
+ * toUrl and anchor pair.
+ * @throws IOException if there is an error processing the {@link DataInput}
+ */
public static void skip(DataInput in) throws IOException {
Text.skip(in); // skip toUrl
Text.skip(in); // skip anchor
diff --git a/src/java/org/apache/nutch/parse/OutlinkExtractor.java b/src/java/org/apache/nutch/parse/OutlinkExtractor.java
index a9b2bb1..3e85025 100644
--- a/src/java/org/apache/nutch/parse/OutlinkExtractor.java
+++ b/src/java/org/apache/nutch/parse/OutlinkExtractor.java
@@ -49,10 +49,8 @@ public class OutlinkExtractor {
/**
* Regex pattern to get URLs within a plain text.
*
- * @see <a
- * href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html
-
- * </a>
+ * @see <a href="http://web.archive.org/web/20190220011657/http://www.truerwords.net/articles/ut/urlactivation.html">
+ * urlactivation</a>
*/
private static final Pattern URL_PATTERN = Pattern.compile(
"([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)");
@@ -64,7 +62,7 @@ public class OutlinkExtractor {
*
* @param plainText
* the plain text from which URLs should be extracted.
- *
+ * @param conf a populated {@link Configuration}
* @return Array of <code>Outlink</code>s within found in plainText
*/
public static Outlink[] getOutlinks(final String plainText,
@@ -80,7 +78,7 @@ public class OutlinkExtractor {
* the plain text from which URLs should be extracted.
* @param anchor
* the anchor of the url
- *
+ * @param conf a populated {@link Configuration}
* @return Array of <code>Outlink</code>s within found in plainText
*/
public static Outlink[] getOutlinks(final String plainText, String anchor,
diff --git a/src/java/org/apache/nutch/parse/Parse.java b/src/java/org/apache/nutch/parse/Parse.java
index 118178f..3fffc77 100644
--- a/src/java/org/apache/nutch/parse/Parse.java
+++ b/src/java/org/apache/nutch/parse/Parse.java
@@ -26,12 +26,19 @@ public interface Parse {
/**
* The textual content of the page. This is indexed, searched, and used when
* generating snippets.
+ * @return the entire text String
*/
String getText();
- /** Other data extracted from the page. */
+ /**
+ * Other data extracted from the page.
+ * @return a populated {@link ParseData} object
+ */
ParseData getData();
- /** Indicates if the parse is coming from a url or a sub-url */
+ /**
+ * Indicates if the parse is coming from a url or a sub-url
+ * @return true if canonical, false otherwise
+ */
boolean isCanonical();
}
diff --git a/src/java/org/apache/nutch/parse/ParseData.java b/src/java/org/apache/nutch/parse/ParseData.java
index 36d0daa..a34d8de 100644
--- a/src/java/org/apache/nutch/parse/ParseData.java
+++ b/src/java/org/apache/nutch/parse/ParseData.java
@@ -69,26 +69,34 @@ public final class ParseData extends VersionedWritable {
this.parseMeta = parseMeta;
}
- //
- // Accessor methods
- //
-
- /** The status of parsing the page. */
+ /**
+ * Get the status of parsing the page.
+ * @return the {@link ParseStatus}
+ */
public ParseStatus getStatus() {
return status;
}
- /** The title of the page. */
+ /**
+ * Get the title of the page.
+ * @return the title as a {@link String}
+ */
public String getTitle() {
return title;
}
- /** The outlinks of the page. */
+ /**
+ * Get the outlinks of the page.
+ * @return an array of {@link org.apache.nutch.parse.Outlink}'s
+ */
public Outlink[] getOutlinks() {
return outlinks;
}
- /** The original Metadata retrieved from content */
+ /**
+ * The original {@link Metadata} retrieved from content
+ * @return the original content {@link Metadata}
+ */
public Metadata getContentMeta() {
return contentMeta;
}
@@ -97,6 +105,7 @@ public final class ParseData extends VersionedWritable {
* Other content properties. This is the place to find format-specific
* properties. Different parser implementations for different content types
* will populate this differently.
+ * @return a {@link Metadata}
*/
public Metadata getParseMeta() {
return parseMeta;
@@ -114,6 +123,8 @@ public final class ParseData extends VersionedWritable {
* Get a metadata single value. This method first looks for the metadata value
* in the parse metadata. If no value is found it then looks for the metadata
* in the content metadata.
+ * @param name the metadata key for which to retrieve a value
+ * @return the (string) metadata value
*
* @see #getContentMeta()
* @see #getParseMeta()
@@ -126,10 +137,6 @@ public final class ParseData extends VersionedWritable {
return value;
}
- //
- // Writable methods
- //
-
public byte getVersion() {
return version;
}
@@ -174,10 +181,6 @@ public final class ParseData extends VersionedWritable {
return parseText;
}
- //
- // other methods
- //
-
public boolean equals(Object o) {
if (!(o instanceof ParseData))
return false;
diff --git a/src/java/org/apache/nutch/parse/ParseResult.java b/src/java/org/apache/nutch/parse/ParseResult.java
index ef42692..1ea0abb 100644
--- a/src/java/org/apache/nutch/parse/ParseResult.java
+++ b/src/java/org/apache/nutch/parse/ParseResult.java
@@ -76,7 +76,7 @@ public class ParseResult implements Iterable<Map.Entry<Text, Parse>> {
/**
* Checks whether the result is empty.
*
- * @return
+ * @return true if empty, false otherwise
*/
public boolean isEmpty() {
return parseMap.isEmpty();
@@ -84,6 +84,7 @@ public class ParseResult implements Iterable<Map.Entry<Text, Parse>> {
/**
* Return the number of parse outputs (both successful and failed)
+ * @return an int representing the parse map size
*/
public int size() {
return parseMap.size();
@@ -166,6 +167,7 @@ public class ParseResult implements Iterable<Map.Entry<Text, Parse>> {
/**
* A convenience method which returns true only if all parses are successful.
* Parse success is determined by {@link ParseStatus#isSuccess()}.
+ * @return true if overall result is a success, false otherwise
*/
public boolean isSuccess() {
for (Iterator<Entry<Text, Parse>> i = iterator(); i.hasNext();) {
@@ -180,6 +182,7 @@ public class ParseResult implements Iterable<Map.Entry<Text, Parse>> {
/**
* A convenience method which returns true if at least one of the parses is
* successful. Parse success is determined by {@link ParseStatus#isSuccess()}.
+ * @return true if at least one result is a success, false otherwise
*/
public boolean isAnySuccess() {
for (Iterator<Entry<Text, Parse>> i = iterator(); i.hasNext();) {
diff --git a/src/java/org/apache/nutch/parse/ParseSegment.java b/src/java/org/apache/nutch/parse/ParseSegment.java
index 62551b2..f7c5797 100644
--- a/src/java/org/apache/nutch/parse/ParseSegment.java
+++ b/src/java/org/apache/nutch/parse/ParseSegment.java
@@ -169,9 +169,9 @@ public class ParseSegment extends NutchTool implements Tool {
/**
* Checks if the page's content is truncated.
*
- * @param content
+ * @param content the response {@link org.apache.nutch.protocol.Content}
* @return If the page is truncated <code>true</code>. When it is not, or when
- * it could be determined, <code>false</code>.
+ * it couldn't be determined, <code>false</code>.
*/
public static boolean isTruncated(Content content) {
byte[] contentBytes = content.getContent();
diff --git a/src/java/org/apache/nutch/parse/ParseStatus.java b/src/java/org/apache/nutch/parse/ParseStatus.java
index f5fb487..feaef47 100644
--- a/src/java/org/apache/nutch/parse/ParseStatus.java
+++ b/src/java/org/apache/nutch/parse/ParseStatus.java
@@ -117,12 +117,25 @@ public class ParseStatus implements Writable {
this(majorCode, minorCode, (String[]) null);
}
- /** Simplified constructor for passing just a text message. */
+ /**
+ * Simplified constructor for passing just a text message.
+ * @param majorCode one of {@link #NOTPARSED}, {@link #SUCCESS}
+ * or {@link #FAILED}
+ * @param minorCode one of {@link #SUCCESS_REDIRECT}, {@link #FAILED_EXCEPTION},
+ * {@link #FAILED_TRUNCATED}, {@link #FAILED_INVALID_FORMAT},
+ * {@link #FAILED_MISSING_PARTS}, or {@link #FAILED_MISSING_CONTENT}
+ * @param message a message string to accompany the parse codes
+ */
public ParseStatus(int majorCode, int minorCode, String message) {
this(majorCode, minorCode, new String[] { message });
}
- /** Simplified constructor for passing just a text message. */
+ /**
+ * Simplified constructor for passing just a text message.
+ * @param majorCode one of {@link #NOTPARSED}, {@link #SUCCESS}
+ * or {@link #FAILED}
+ * @param message a message string to accompany the parse codes
+ */
public ParseStatus(int majorCode, String message) {
this(majorCode, 0, new String[] { message });
}
@@ -167,16 +180,15 @@ public class ParseStatus implements Writable {
}
/**
- * A convenience method. Returns true if majorCode is SUCCESS, false
+ * @return true if majorCode is SUCCESS, false
* otherwise.
*/
-
public boolean isSuccess() {
return majorCode == SUCCESS;
}
/**
- * A convenience method. Return a String representation of the first argument,
+ * @return a String representation of the first argument,
* or null.
*/
public String getMessage() {
@@ -198,16 +210,19 @@ public class ParseStatus implements Writable {
}
/**
- * A convenience method. Creates an empty Parse instance, which returns this
- * status.
+ * Creates an empty {@link Parse} instance containing the status
+ * @param conf a {@link Configuration}
+ * @return the empty {@link Parse} object
*/
public Parse getEmptyParse(Configuration conf) {
return new EmptyParseImpl(this, conf);
}
/**
- * A convenience method. Creates an empty ParseResult, which contains this
- * status.
+ * Creates an empty {@link ParseResult} for a given URL
+ * @param url canonical url
+ * @param conf a {@link Configuration}
+ * @return the empty {@link ParseResult} object
*/
public ParseResult getEmptyParseResult(String url, Configuration conf) {
return ParseResult.createParseResult(url, getEmptyParse(conf));
diff --git a/src/java/org/apache/nutch/parse/ParseUtil.java b/src/java/org/apache/nutch/parse/ParseUtil.java
index bc6d752..933ef83 100644
--- a/src/java/org/apache/nutch/parse/ParseUtil.java
+++ b/src/java/org/apache/nutch/parse/ParseUtil.java
@@ -46,8 +46,8 @@ public class ParseUtil {
private ExecutorService executorService;
/**
- *
- * @param conf
+ * Overloaded constructor
+ * @param conf a populated {@link org.apache.hadoop.conf.Configuration}
*/
public ParseUtil(Configuration conf) {
this.parserFactory = new ParserFactory(conf);
diff --git a/src/java/org/apache/nutch/parse/ParserFactory.java b/src/java/org/apache/nutch/parse/ParserFactory.java
index d02fed4..6c0b569 100644
--- a/src/java/org/apache/nutch/parse/ParserFactory.java
+++ b/src/java/org/apache/nutch/parse/ParserFactory.java
@@ -100,6 +100,8 @@ public final class ParserFactory {
* <code>plugin.includes</code>, then this ordered Array would consist
* of two {@link Parser} interfaces,
* <code>[parse-html, parse-rtf]</code>.
+ * @throws ParserNotFound if there is a runtime error locating a parser for the
+ * given content type and url
*/
public Parser[] getParsers(String contentType, String url)
throws ParserNotFound {
diff --git a/src/java/org/apache/nutch/plugin/Extension.java b/src/java/org/apache/nutch/plugin/Extension.java
index be737cb..ab65bb8 100644
--- a/src/java/org/apache/nutch/plugin/Extension.java
+++ b/src/java/org/apache/nutch/plugin/Extension.java
@@ -39,9 +39,13 @@ public class Extension {
* @param pDescriptor
* a plugin descriptor
* @param pExtensionPoint
- * an extension porin
+ * an extension point
* @param pId
* an unique id of the plugin
+ * @param pExtensionClass the class extending the extension point
+ * @param conf a populated {@link org.apache.hadoop.conf.Configuration}
+ * @param pluginRepository a {@link PluginRepository} containing
+ * all plugin artifacts
*/
public Extension(PluginDescriptor pDescriptor, String pExtensionPoint,
String pId, String pExtensionClass, Configuration conf,
@@ -127,7 +131,8 @@ public class Extension {
}
/**
- * Returns the Id of the extension point, that is implemented by this
+ * Get target point
+ * @return the Id of the extension point, that is implemented by this
* extension.
*/
public String getTargetPoint() {
@@ -144,6 +149,7 @@ public class Extension {
* plugins.
*
* @return Object An instance of the extension implementation
+ * @throws PluginRuntimeException if there is a fatal runtime error
*/
public Object getExtensionInstance() throws PluginRuntimeException {
// Must synchronize here to make sure creation and initialization
@@ -180,7 +186,7 @@ public class Extension {
}
/**
- * return the plugin descriptor.
+ * Get the plugin descriptor.
*
* @return PluginDescriptor
*/
@@ -192,7 +198,7 @@ public class Extension {
* Sets the plugin descriptor and is only used until model creation at system
* start up.
*
- * @param pDescriptor
+ * @param pDescriptor an instantiated {@link org.apache.nutch.plugin.PluginDescriptor}
*/
public void setDescriptor(PluginDescriptor pDescriptor) {
fDescriptor = pDescriptor;
diff --git a/src/java/org/apache/nutch/plugin/ExtensionPoint.java b/src/java/org/apache/nutch/plugin/ExtensionPoint.java
index 61768ed..5842204 100644
--- a/src/java/org/apache/nutch/plugin/ExtensionPoint.java
+++ b/src/java/org/apache/nutch/plugin/ExtensionPoint.java
@@ -105,7 +105,8 @@ public class ExtensionPoint {
/**
* Install a coresponding extension to this extension point.
*
- * @param extension
+ * @param extension the new {@link org.apache.nutch.plugin.Extension}
+ * to install
*/
public void addExtension(Extension extension) {
fExtensions.add(extension);
diff --git a/src/java/org/apache/nutch/plugin/Plugin.java b/src/java/org/apache/nutch/plugin/Plugin.java
index 8325a56..314a866 100644
--- a/src/java/org/apache/nutch/plugin/Plugin.java
+++ b/src/java/org/apache/nutch/plugin/Plugin.java
@@ -43,8 +43,9 @@ public class Plugin {
protected Configuration conf;
/**
- * Constructor
- *
+ * Overloaded constructor
+ * @param pDescriptor a plugin descriptor
+ * @param conf a populated {@link org.apache.hadoop.conf.Configuration}
*/
public Plugin(PluginDescriptor pDescriptor, Configuration conf) {
setDescriptor(pDescriptor);
diff --git a/src/java/org/apache/nutch/plugin/PluginClassLoader.java b/src/java/org/apache/nutch/plugin/PluginClassLoader.java
index a0741b8..02422f7 100644
--- a/src/java/org/apache/nutch/plugin/PluginClassLoader.java
+++ b/src/java/org/apache/nutch/plugin/PluginClassLoader.java
@@ -43,12 +43,13 @@ public class PluginClassLoader extends URLClassLoader {
private ClassLoader system = getSystemClassLoader();
/**
- * Construtor
+ * Overloaded constructor
*
* @param urls
* Array of urls with own libraries and all exported libraries of
- * plugins that are required to this plugin
- * @param parent
+ * plugins that are required for this plugin
+ * @param parent the parent {@link java.lang.ClassLoader} responsible for
+ * loading classes defined in urls.
*/
public PluginClassLoader(URL[] urls, ClassLoader parent) {
super(urls, parent);
diff --git a/src/java/org/apache/nutch/plugin/PluginDescriptor.java b/src/java/org/apache/nutch/plugin/PluginDescriptor.java
index de6c0ba..b22409a 100644
--- a/src/java/org/apache/nutch/plugin/PluginDescriptor.java
+++ b/src/java/org/apache/nutch/plugin/PluginDescriptor.java
@@ -57,14 +57,15 @@ public class PluginDescriptor {
private Configuration fConf;
/**
- * Constructor
+ * Overloaded constructor
*
- * @param pId
- * @param pVersion
- * @param pName
- * @param pProviderName
- * @param pPluginclazz
- * @param pPath
+ * @param pId set plugin ID
+ * @param pVersion set plugin version
+ * @param pName set plugin name
+ * @param pProviderName set plugin provider name
+ * @param pPluginclazz set plugin Class
+ * @param pPath set plugin path
+ * @param conf a populated {@link org.apache.hadoop.conf.Configuration}
*/
public PluginDescriptor(String pId, String pVersion, String pName,
String pProviderName, String pPluginclazz, String pPath,
@@ -82,7 +83,8 @@ public class PluginDescriptor {
}
/**
- * @param pPath
+ * Set the plugin path
+ * @param pPath plugin path
*/
private void setPath(String pPath) {
fPluginPath = pPath;
@@ -149,7 +151,7 @@ public class PluginDescriptor {
/**
* Adds a extension.
*
- * @param pExtension
+ * @param pExtension a {@link org.apache.nutch.plugin.Extension}
*/
public void addExtension(Extension pExtension) {
fExtensions.add(pExtension);
@@ -178,7 +180,7 @@ public class PluginDescriptor {
/**
* Adds a extension point.
*
- * @param extensionPoint
+ * @param extensionPoint a {@link org.apache.nutch.plugin.ExtensionPoint}
*/
public void addExtensionPoint(ExtensionPoint extensionPoint) {
fExtensionPoints.add(extensionPoint);
@@ -216,11 +218,15 @@ public class PluginDescriptor {
/**
* Adds a exported library with a relative path to the plugin directory. We
* automatically escape characters that are illegal in URLs. It is recommended
- * that code converts an abstract pathname into a URL by first converting it
- * into a URI, via the toURI method, and then converting the URI into a URL
- * via the URI.toURL method.
+ * that code converts an abstract pathname into a {@link java.net.URL} by
+ * first converting it into a {@link java.net.URI}, via the
+ * {@link java.io.File#toURI()} method, and then converting the
+ * {@link java.net.URI} into a {@link java.net.URL} via the
+ * {@link java.net.URI#toURL()} method.
*
- * @param pLibPath
+ * @param pLibPath path to an exported library relative to the plugin directory
+ * @throws MalformedURLException if the pLibPath is not a relative path
+ * (to the plugin directory)
*/
public void addExportedLibRelative(String pLibPath)
throws MalformedURLException {
@@ -239,7 +245,7 @@ public class PluginDescriptor {
}
/**
- * Returns a array exported librareis as URLs
+ * Returns an array of exported libs as URLs
*
* @return URL[]
*/
@@ -248,13 +254,17 @@ public class PluginDescriptor {
}
/**
- * Adds a exported library with a relative path to the plugin directory. We
+ * Adds a non-exported library with a relative path to the plugin directory. We
* automatically escape characters that are illegal in URLs. It is recommended
- * that code converts an abstract pathname into a URL by first converting it
- * into a URI, via the toURI method, and then converting the URI into a URL
- * via the URI.toURL method.
+ * that code converts an abstract pathname into a {@link java.net.URL} by
+ * first converting it into a {@link java.net.URI}, via the
+ * {@link java.io.File#toURI()} method, and then converting the
+ * {@link java.net.URI} into a {@link java.net.URL} via the
+ * {@link java.net.URI#toURL()} method.
*
- * @param pLibPath
+ * @param pLibPath path to a non-exported library relative to the plugin directory
+ * @throws MalformedURLException if the pLibPath is not a relative path
+ * (to the plugin directory)
*/
public void addNotExportedLibRelative(String pLibPath)
throws MalformedURLException {
@@ -328,12 +338,13 @@ public class PluginDescriptor {
/**
* Returns a I18N'd resource string. The resource bundles could be stored in
- * root directory of a plugin in the well know i18n file name conventions.
+ * root directory of a plugin in the well known i18n file name conventions.
*
- * @param pKey
- * @param pLocale
- * @return String
- * @throws IOException
+ * @param pKey a plugin key
+ * @param pLocale the required {@link java.util.Locale}
+ * @return a string for the given key from the
+ * {@link java.util.ResourceBundle} bundle or one of its parents
+ * @throws IOException if there is an error obtaining the key
*/
public String getResourceString(String pKey, Locale pLocale)
throws IOException {
diff --git a/src/java/org/apache/nutch/plugin/PluginManifestParser.java b/src/java/org/apache/nutch/plugin/PluginManifestParser.java
index 309c2a4..d7280ad 100644
--- a/src/java/org/apache/nutch/plugin/PluginManifestParser.java
+++ b/src/java/org/apache/nutch/plugin/PluginManifestParser.java
@@ -105,6 +105,8 @@ public class PluginManifestParser {
/**
* Return the named plugin folder. If the name is absolute then it is
* returned. Otherwise, for relative names, the classpath is scanned.
+ * @param name the name of a plugin folder
+ * @return the plugin directory as a {@link File}
*/
public File getPluginFolder(String name) {
File directory = new File(name);
diff --git a/src/java/org/apache/nutch/plugin/PluginRepository.java b/src/java/org/apache/nutch/plugin/PluginRepository.java
index 50daa57..44df3a2 100644
--- a/src/java/org/apache/nutch/plugin/PluginRepository.java
+++ b/src/java/org/apache/nutch/plugin/PluginRepository.java
@@ -37,8 +37,8 @@ import org.apache.nutch.util.ObjectCache;
/**
* The plugin repositority is a registry of all plugins.
*
- * At system boot up a repositority is builded by parsing the mainifest files of
- * all plugins. Plugins that require not existing other plugins are not
+ * At system boot up a repository is built by parsing the manifest files of
+ * all plugins. Plugins that require other plugins which do not exist are not
* registed. For each plugin a plugin descriptor instance will be created. The
* descriptor represents all meta information about a plugin. So a plugin
* instance will be created later when it is required, this allow lazy plugin
@@ -63,8 +63,8 @@ public class PluginRepository {
.getLogger(MethodHandles.lookup().lookupClass());
/**
- * @throws RuntimeException
- * @see java.lang.Object#Object()
+ * @param conf a populated {@link Configuration}
+ * @throws RuntimeException if a fatal runtime error is encountered
*/
public PluginRepository(Configuration conf) throws RuntimeException {
fActivatedPlugins = new HashMap<>();
@@ -97,6 +97,8 @@ public class PluginRepository {
}
/**
+ * Get a cached instance of the {@link org.apache.nutch.plugin.PluginRepository}
+ * @param conf a populated {@link Configuration}
* @return a cached instance of the plugin repository
*/
public static synchronized PluginRepository get(Configuration conf) {
@@ -230,7 +232,7 @@ public class PluginRepository {
/**
* Returns the descriptor of one plugin identified by a plugin id.
*
- * @param pPluginId
+ * @param pPluginId a pluginId for which the descriptor will be retrieved
* @return PluginDescriptor
*/
public PluginDescriptor getPluginDescriptor(String pPluginId) {
@@ -243,9 +245,9 @@ public class PluginRepository {
}
/**
- * Returns a extension point indentified by a extension point id.
+ * Returns an extension point identified by an extension point id.
*
- * @param pXpId
+ * @param pXpId an extension point id
* @return a extentsion point
*/
public ExtensionPoint getExtensionPoint(String pXpId) {
@@ -262,9 +264,10 @@ public class PluginRepository {
* extensions of the same plugin. This class loader use all exported libraries
* from the dependend plugins and all plugin libraries.
*
- * @param pDescriptor
- * @return Plugin
- * @throws PluginRuntimeException
+ * @param pDescriptor a {@link PluginDescriptor} for which to retrieve a
+ * {@link Plugin} instance
+ * @return a {@link Plugin} instance
+ * @throws PluginRuntimeException if there is a fatal runtime plugin error
*/
public Plugin getPluginInstance(PluginDescriptor pDescriptor)
throws PluginRuntimeException {
@@ -483,7 +486,7 @@ public class PluginRepository {
* plugin ID (needs to be activated in the configuration), and the
* class name. The rest of arguments is passed to the main method of
* the selected class.
- * @throws Exception
+ * @throws Exception if there is an error running this Class
*/
public static void main(String[] args) throws Exception {
if (args.length < 2) {
diff --git a/src/java/org/apache/nutch/protocol/Content.java b/src/java/org/apache/nutch/protocol/Content.java
index e7016f0..883e0fb 100644
--- a/src/java/org/apache/nutch/protocol/Content.java
+++ b/src/java/org/apache/nutch/protocol/Content.java
@@ -196,11 +196,10 @@ public final class Content implements Writable {
return content;
}
- //
- // Accessor methods
- //
-
- /** The url fetched. */
+ /**
+ * The url fetched.
+ * @return the fetched url
+ */
public String getUrl() {
return url;
}
@@ -208,12 +207,16 @@ public final class Content implements Writable {
/**
* The base url for relative links contained in the content. Maybe be
* different from url if the request redirected.
+ * @return the base url
*/
public String getBaseUrl() {
return base;
}
- /** The binary content retrieved. */
+ /**
+ * The binary content retrieved.
+ * @return content as a byte[]
+ */
public byte[] getContent() {
return content;
}
@@ -227,6 +230,7 @@ public final class Content implements Writable {
*
* @see <a href="http://www.iana.org/assignments/media-types/">
* http://www.iana.org/assignments/media-types/</a>
+ * @return content type
*/
public String getContentType() {
return contentType;
@@ -236,12 +240,18 @@ public final class Content implements Writable {
this.contentType = contentType;
}
- /** Other protocol-specific data. */
+ /**
+ * Other protocol-specific data.
+ * @return additional {@link org.apache.nutch.metadata.Metadata}
+ */
public Metadata getMetadata() {
return metadata;
}
- /** Other protocol-specific data. */
+ /**
+ * Other protocol-specific data.
+ * @param metadata a populated {@link Metadata} object to set
+ */
public void setMetadata(Metadata metadata) {
this.metadata = metadata;
}
diff --git a/src/java/org/apache/nutch/protocol/Protocol.java b/src/java/org/apache/nutch/protocol/Protocol.java
index 2287487..ab4162c 100644
--- a/src/java/org/apache/nutch/protocol/Protocol.java
+++ b/src/java/org/apache/nutch/protocol/Protocol.java
@@ -32,7 +32,10 @@ public interface Protocol extends Pluggable, Configurable {
public final static String X_POINT_ID = Protocol.class.getName();
/**
- * Returns the {@link Content} for a fetchlist entry.
+ * Get the {@link ProtocolOutput} for a given url and crawldatum
+ * @param url canonical url
+ * @param datum associated {@link org.apache.nutch.crawl.CrawlDatum}
+ * @return the {@link ProtocolOutput}
*/
ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum);
diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
index 2cb52a6..eb3ba46 100644
--- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java
+++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java
@@ -160,6 +160,8 @@ public abstract class RobotRulesParser implements Tool {
/**
* Check whether a URL belongs to a whitelisted host.
+ * @param url a {@link java.net.URL} to check against rules
+ * @return true if allowed, false otherwise
*/
public boolean isWhiteListed(URL url) {
boolean match = false;
diff --git a/src/java/org/apache/nutch/publisher/NutchPublisher.java b/src/java/org/apache/nutch/publisher/NutchPublisher.java
index 75281ad..15fdacb 100644
--- a/src/java/org/apache/nutch/publisher/NutchPublisher.java
+++ b/src/java/org/apache/nutch/publisher/NutchPublisher.java
@@ -31,14 +31,16 @@ public interface NutchPublisher extends Configurable, Pluggable {
/**
* Use implementation specific configurations
- * @param conf {@link org.apache.hadoop.conf.Configuration Configuration} to be used
+ * @param conf {@link org.apache.hadoop.conf.Configuration} to be used
+ * @return true if set, false otherwise
*/
public boolean setConfig(Configuration conf);
/**
* This method publishes the event. Make sure that the event is a Java POJO to avoid
* Jackson JSON conversion errors. Currently we use the FetcherThreadEvent
- * @param event
+ * @param event the {@link java.lang.Object} (event) to publish
+ * @param conf {@link org.apache.hadoop.conf.Configuration} to be used
*/
public void publish(Object event, Configuration conf);
diff --git a/src/java/org/apache/nutch/scoring/ScoringFilter.java b/src/java/org/apache/nutch/scoring/ScoringFilter.java
index bc74fcb..aeda94c 100644
--- a/src/java/org/apache/nutch/scoring/ScoringFilter.java
+++ b/src/java/org/apache/nutch/scoring/ScoringFilter.java
@@ -52,7 +52,8 @@ public interface ScoringFilter extends Configurable, Pluggable {
* url of the page
* @param datum
* new datum. Filters will modify it in-place.
- * @throws ScoringFilterException
+ * @throws ScoringFilterException if there is a fatal error
+ * setting an initial score for newly injected pages
*/
public void injectedScore(Text url, CrawlDatum datum)
throws ScoringFilterException;
@@ -68,7 +69,8 @@ public interface ScoringFilter extends Configurable, Pluggable {
* url of the page
* @param datum
* new datum. Filters will modify it in-place.
- * @throws ScoringFilterException
+ * @throws ScoringFilterException if there is a fatal error
+ * setting an initial score for newly discovered pages
*/
public void initialScore(Text url, CrawlDatum datum)
throws ScoringFilterException;
@@ -83,6 +85,10 @@ public interface ScoringFilter extends Configurable, Pluggable {
* page's datum, should not be modified
* @param initSort
* initial sort value, or a value from previous filters in chain
+ * @return a sort value for use in sorting and selecting the
+ * top N scoring pages during fetchlist generation
+ * @throws ScoringFilterException if there is a fatal error
+ * preparing the sort value
*/
public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
throws ScoringFilterException;
@@ -101,6 +107,9 @@ public interface ScoringFilter extends Configurable, Pluggable {
* @param content
* instance of content. Implementations may modify this in-place,
* primarily by setting some metadata properties.
+ * @throws ScoringFilterException if there is a fatal error
+ * injecting score information from the current datum into
+ * {@link org.apache.nutch.protocol.Content} metadata
*/
public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
throws ScoringFilterException;
@@ -119,6 +128,8 @@ public interface ScoringFilter extends Configurable, Pluggable {
* target instance to copy the score information to. Implementations
* may modify this in-place, primarily by setting some metadata
* properties.
+ * @throws ScoringFilterException if there is a fatal error
+ * processing score data in subsequent steps after parsing
*/
public void passScoreAfterParsing(Text url, Content content, Parse parse)
throws ScoringFilterException;
@@ -146,7 +157,8 @@ public interface ScoringFilter extends Configurable, Pluggable {
* with status {@link CrawlDatum#STATUS_LINKED}, which contains
* adjustments to be applied to the original CrawlDatum score(s) and
* metadata. This can be null if not needed.
- * @throws ScoringFilterException
+ * @throws ScoringFilterException if there is a fatal error distributing
+ * score data from the current page to all of its outlinks
*/
public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
@@ -173,7 +185,8 @@ public interface ScoringFilter extends Configurable, Pluggable {
* @param inlinked
* (partial) list of CrawlDatum-s (with their scores) from links
* pointing to this page, found in the current update batch.
- * @throws ScoringFilterException
+ * @throws ScoringFilterException if there is a fatal error calculating
+ * a new score of {@link CrawlDatum} during CrawlDb update
*/
public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
List<CrawlDatum> inlinked) throws ScoringFilterException;
@@ -186,7 +199,10 @@ public interface ScoringFilter extends Configurable, Pluggable {
* URL of the page
* @param datum
* CrawlDatum for page
- * @throws ScoringFilterException
+ * @throws ScoringFilterException if there is a fatal error whilst
+ * changing the score or status of {@link CrawlDatum} during
+ * {@link org.apache.nutch.crawl.CrawlDb} update, when the URL is
+ * neither fetched nor has any inlinks
*/
public default void orphanedScore(Text url, CrawlDatum datum)
throws ScoringFilterException {
@@ -224,7 +240,8 @@ public interface ScoringFilter extends Configurable, Pluggable {
* argument to the next scoring filter in chain. NOTE: implementations
* may also express other scoring strategies by modifying the indexed
* document directly.
- * @throws ScoringFilterException
+ * @throws ScoringFilterException if there is a fatal error whilst calculating
+ * the indexed document score/boost
*/
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
diff --git a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
index 3c74b38..733edbc 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
@@ -313,6 +313,12 @@ public class LinkDumper extends Configured implements Tool {
/**
* Runs the inverter and merger jobs of the LinkDumper tool to create the url
* to inlink node database.
+ * @param webGraphDb the {@link org.apache.hadoop.fs.Path} to the output
+ * of {@link org.apache.nutch.scoring.webgraph.WebGraph#createWebGraph(Path, Path[], boolean, boolean)}
+ * @throws IOException if there is a fatal I/O issue at runtime
+ * @throws InterruptedException if the Job is interrupted during execution
+ * @throws ClassNotFoundException if classes required to run
+ * the Job cannot be located
*/
public void dumpLinks(Path webGraphDb) throws IOException,
InterruptedException, ClassNotFoundException {
diff --git a/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java b/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
index 9720754..39a9c63 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java
@@ -633,6 +633,7 @@ public class LinkRank extends Configured implements Tool {
/**
* Configurable constructor.
+ * @param conf a populated {@link org.apache.hadoop.conf.Configuration}
*/
public LinkRank(Configuration conf) {
super(conf);
@@ -648,7 +649,10 @@ public class LinkRank extends Configured implements Tool {
* The WebGraph to run link analysis on.
*
* @throws IOException
- * If an error occurs during link analysis.
+ * If a fatal I/O runtime error occurs during link analysis.
+ * @throws InterruptedException if the Job is interrupted during execution
+ * @throws ClassNotFoundException if classes required to run
+ * the Job cannot be located
*/
public void analyze(Path webGraphDb) throws IOException,
ClassNotFoundException, InterruptedException {
diff --git a/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java b/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
index 70c4270..fc2875e 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java
@@ -258,8 +258,8 @@ public class NodeDumper extends Configured implements Tool {
@Override
public void setup(Reducer<Text, FloatWritable, Text, FloatWritable>.Context context) {
conf = context.getConfiguration();
- topn = conf.getLong("topn", Long.MAX_VALUE);
- sum = conf.getBoolean("sum", false);
+ topn = conf.getLong("topn", Long.MAX_VALUE);
+ sum = conf.getBoolean("sum", false);
}
}
@@ -269,12 +269,24 @@ public class NodeDumper extends Configured implements Tool {
* Runs the process to dump the top urls out to a text file.
*
* @param webGraphDb
- * The WebGraph from which to pull values.
- *
- * @param topN
- * @param output
- *
- * @throws IOException
+ * The {@link org.apache.nutch.scoring.webgraph.WebGraph}
+ * from which to pull values.
+ * @param type the node property type to dump, one of
+ * {@link NodeDumper.DumpType#INLINKS}, {@link NodeDumper.DumpType#OUTLINKS}
+ * or {@link NodeDumper.DumpType#SCORES}
+ * @param topN maximum value of top links to dump
+ * @param output a {@link org.apache.hadoop.fs.Path} to write output to
+ * @param asEff if true set equals-sign as separator for
+ * <a href="https://lucene.apache.org/solr/api/org/apache/solr/schema/ExternalFileField.html">
+ * Solr's ExternalFileField</a>, false otherwise
+ * @param nameType either {@link NodeDumper.NameType#HOST} or
+ * {@link NodeDumper.NameType#DOMAIN}
+ * @param aggrType the aggregation type, either
+ * {@link NodeDumper.AggrType#MAX} or {@link NodeDumper.AggrType#SUM}
+ * @param asSequenceFile if true output will be written as
+ * {@link SequenceFileOutputFormat}, otherwise default
+ * {@link TextOutputFormat}
+ * @throws Exception
* If an error occurs while dumping the top values.
*/
public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output,
diff --git a/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java b/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java
index b90cfe5..d6fd9d0 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java
@@ -84,6 +84,9 @@ public class NodeReader extends Configured {
* Runs the NodeReader tool. The command line arguments must contain a
* webgraphdb path and a url. The url must match the normalized url that is
* contained in the NodeDb of the WebGraph.
+ * @param args input parameters for running this tool, run with
+ * "help" for information
+ * @throws Exception if there is a fatal error running this tool
*/
public static void main(String[] args) throws Exception {
diff --git a/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java b/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
index 3674fa8..6cc604f 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java
@@ -147,9 +147,11 @@ public class ScoreUpdater extends Configured implements Tool{
* The crawl database to update
* @param webGraphDb
* The webgraph database to use.
- *
* @throws IOException
* If an error occurs while updating the scores.
+ * @throws InterruptedException if the Job is interrupted during execution
+ * @throws ClassNotFoundException if classes required to run
+ * the Job cannot be located
*/
public void update(Path crawlDb, Path webGraphDb) throws IOException,
ClassNotFoundException, InterruptedException {
diff --git a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
index 0b53a39..5b7a3fd 100644
--- a/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
+++ b/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java
@@ -141,6 +141,7 @@ public class WebGraph extends Configured implements Tool {
/**
* Configurable constructor.
+ * @param conf a populated {@link Configuration}
*/
public OutlinkDb(Configuration conf) {
setConf(conf);
@@ -509,6 +510,9 @@ public class WebGraph extends Configured implements Tool {
*
* @throws IOException
* If an error occurs while processing the WebGraph.
+ * @throws InterruptedException if the Job is interrupted during execution
+ * @throws ClassNotFoundException if classes required to run
+ * the Job cannot be located
*/
public void createWebGraph(Path webGraphDb, Path[] segments,
boolean normalize, boolean filter) throws IOException,
diff --git a/src/java/org/apache/nutch/segment/SegmentChecker.java b/src/java/org/apache/nutch/segment/SegmentChecker.java
index 991fe72..cf230c2 100644
--- a/src/java/org/apache/nutch/segment/SegmentChecker.java
+++ b/src/java/org/apache/nutch/segment/SegmentChecker.java
@@ -41,6 +41,14 @@ public class SegmentChecker {
/**
* Check if the segment is indexable. May add new check methods here.
+ * @param segmentPath path to an individual segment on disk
+ * @param fs the {@link org.apache.hadoop.fs.FileSystem} that the
+ * segment resides on
+ * @return true if the checks pass and the segment can be indexed,
+ * false otherwise
+ * @throws IOException if there is an I/O error locating or
+ * checking either the segment contents or locating it on
+ * the filesystem
*/
public static boolean isIndexable(Path segmentPath, FileSystem fs)
throws IOException {
@@ -62,6 +70,13 @@ public class SegmentChecker {
/**
* Check the segment to see if it is valid based on the sub directories.
+ * @param segmentPath path to an individual segment on disk
+ * @param fs the {@link org.apache.hadoop.fs.FileSystem} that the
+ * segment resides on
+ * @return true if the checks pass false otherwise
+ * @throws IOException if there is an I/O error locating or
+ * checking either the segment contents or locating it on
+ * the filesystem
*/
public static boolean checkSegmentDir(Path segmentPath, FileSystem fs)
throws IOException {
@@ -124,6 +139,14 @@ public class SegmentChecker {
/**
* Check the segment to see if it is has been parsed before.
+ * @param segment path to an individual segment on disk
+ * @param fs the {@link org.apache.hadoop.fs.FileSystem} that the
+ * segment resides on
+ * @return true if the checks pass and the segment has been parsed,
+ * false otherwise
+ * @throws IOException if there is an I/O error locating or
+ * checking either the segment contents or locating it on
+ * the filesystem
*/
public static boolean isParsed(Path segment, FileSystem fs)
throws IOException {
diff --git a/src/java/org/apache/nutch/segment/SegmentMergeFilter.java b/src/java/org/apache/nutch/segment/SegmentMergeFilter.java
index 6d53809..0e1d579 100644
--- a/src/java/org/apache/nutch/segment/SegmentMergeFilter.java
+++ b/src/java/org/apache/nutch/segment/SegmentMergeFilter.java
@@ -37,7 +37,14 @@ public interface SegmentMergeFilter {
/**
* The filtering method which gets all information being merged for a given
* key (URL).
- *
+ * @param key the segment record key
+ * @param generateData directory and data produced by the generation phase
+ * @param fetchData directory and data produced by the fetch phase
+ * @param sigData directory and data produced by the parse phase
+ * @param content directory and data produced by the parse phase
+ * @param parseData directory and data produced by the parse phase
+ * @param parseText directory and data produced by the parse phase
+ * @param linked all LINKED values from the latest segment
* @return <tt>true</tt> values for this <tt>key</tt> (URL) should be merged
* into the new segment.
*/
diff --git a/src/java/org/apache/nutch/segment/SegmentMergeFilters.java b/src/java/org/apache/nutch/segment/SegmentMergeFilters.java
index 78b9f76..81cf323 100644
--- a/src/java/org/apache/nutch/segment/SegmentMergeFilters.java
+++ b/src/java/org/apache/nutch/segment/SegmentMergeFilters.java
@@ -63,7 +63,14 @@ public class SegmentMergeFilters {
/**
* Iterates over all {@link SegmentMergeFilter} extensions and if any of them
* returns false, it will return false as well.
- *
+ * @param key the segment record key
+ * @param generateData directory and data produced by the generation phase
+ * @param fetchData directory and data produced by the fetch phase
+ * @param sigData directory and data produced by the parse phase
+ * @param content directory and data produced by the parse phase
+ * @param parseData directory and data produced by the parse phase
+ * @param parseText directory and data produced by the parse phase
+ * @param linked all LINKED values from the latest segment
* @return <tt>true</tt> values for this <tt>key</tt> (URL) should be merged
* into the new segment.
*/
diff --git a/src/java/org/apache/nutch/segment/SegmentMerger.java b/src/java/org/apache/nutch/segment/SegmentMerger.java
index 7dbfd11..2270647 100644
--- a/src/java/org/apache/nutch/segment/SegmentMerger.java
+++ b/src/java/org/apache/nutch/segment/SegmentMerger.java
@@ -94,7 +94,6 @@ import org.apache.nutch.util.NutchJob;
* {@link org.apache.nutch.crawl.Generator} doesn't ensure that fetchlist parts
* for each map task are disjoint.
* </p>
- * <p>
* <h4>Duplicate content</h4>
* Merging segments removes older content whenever possible (see below).
* However, this is NOT the same as de-duplication, which in addition removes
@@ -109,8 +108,8 @@ import org.apache.nutch.util.NutchJob;
* segments be named in an increasing lexicographic order as their creation time
* increases.
* </p>
- * <p>
* <h4>Merging and indexes</h4>
+ * <p>
* Merged segment gets a different name. Since Indexer embeds segment names in
* indexes, any indexes originally created for the input segments will NOT work
* with the merged segment. Newly created merged segment(s) need to be indexed
@@ -746,7 +745,8 @@ public class SegmentMerger extends Configured implements Tool{
}
/**
- * @param args
+ * Run this tool
+ * @param args input arguments for the tool, running with no argument provides args information
*/
@Override
public int run(String[] args) throws Exception {
diff --git a/src/java/org/apache/nutch/segment/SegmentPart.java b/src/java/org/apache/nutch/segment/SegmentPart.java
index 9433066..6769149 100644
--- a/src/java/org/apache/nutch/segment/SegmentPart.java
+++ b/src/java/org/apache/nutch/segment/SegmentPart.java
@@ -51,9 +51,9 @@ public class SegmentPart {
/**
* Create SegmentPart from a FileSplit.
*
- * @param split
- * @return A {@link SegmentPart} resultant from a {@link FileSplit}.
- * @throws IOException
+ * @param split a {@link FileSplit} segment part
+ * @return a {@link SegmentPart} resultant from a {@link FileSplit}.
+ * @throws IOException if there is a fatal error locating the split
*/
public static SegmentPart get(FileSplit split) throws IOException {
return get(split.getPath().toString());
diff --git a/src/java/org/apache/nutch/segment/SegmentReader.java b/src/java/org/apache/nutch/segment/SegmentReader.java
index f47a76d..284daed 100644
--- a/src/java/org/apache/nutch/segment/SegmentReader.java
+++ b/src/java/org/apache/nutch/segment/SegmentReader.java
@@ -461,7 +461,14 @@ public class SegmentReader extends Configured implements Tool {
return res;
}
- /** Try to get HTML encoding from parse metadata */
+ /**
+ * Try to get HTML encoding from parse metadata. Try
+ * {@link Metadata#CHAR_ENCODING_FOR_CONVERSION}, then
+ * {@link Metadata#CONTENT_ENCODING}, then fall back to
+ * {@link java.nio.charset.StandardCharsets#UTF_8}
+ * @param parseMeta a populated {@link Metadata}
+ * @return {@link Charset}
+ */
public static Charset getCharset(Metadata parseMeta) {
Charset cs = StandardCharsets.UTF_8;
String charset = parseMeta.get(Metadata.CHAR_ENCODING_FOR_CONVERSION);
diff --git a/src/java/org/apache/nutch/service/JobManager.java b/src/java/org/apache/nutch/service/JobManager.java
index ad734cd..35a56e1 100644
--- a/src/java/org/apache/nutch/service/JobManager.java
+++ b/src/java/org/apache/nutch/service/JobManager.java
@@ -32,7 +32,7 @@ public interface JobManager {
/**
* Creates specified job
- * @param jobConfig
+ * @param jobConfig a job-specific {@link JobConfig}
* @return JobInfo
*/
public JobInfo create(JobConfig jobConfig);
diff --git a/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java b/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java
index 34c07d3..784d098 100644
--- a/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java
+++ b/src/java/org/apache/nutch/service/impl/ConfManagerImpl.java
@@ -86,7 +86,7 @@ public class ConfManagerImpl implements ConfManager {
/**
* Created a new configuration based on the values provided.
- * @param nutchConfig
+ * @param nutchConfig crawler configuration
* @return String - confId
*/
public String create(NutchConfig nutchConfig) {
diff --git a/src/java/org/apache/nutch/service/impl/JobWorker.java b/src/java/org/apache/nutch/service/impl/JobWorker.java
index 8ee9344..d3343ae 100644
--- a/src/java/org/apache/nutch/service/impl/JobWorker.java
+++ b/src/java/org/apache/nutch/service/impl/JobWorker.java
@@ -39,9 +39,10 @@ public class JobWorker implements Runnable{
/**
* To initialize JobWorker thread with the Job Configurations provided by user.
- * @param jobConfig
- * @param conf
- * @param tool - NutchTool to run
+ * @param jobConfig job-specific {@link JobConfig}
+ * @param conf a populated {@link Configuration}
+ * @param tool the {@link NutchTool} to run,
+ * used to construct the JobWorker
*/
public JobWorker(JobConfig jobConfig, Configuration conf, NutchTool tool) {
this.jobConfig = jobConfig;
diff --git a/src/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java b/src/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java
index f533cd1..529e190 100644
--- a/src/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java
+++ b/src/java/org/apache/nutch/service/impl/NutchServerPoolExecutor.java
@@ -48,7 +48,7 @@ public class NutchServerPoolExecutor extends ThreadPoolExecutor{
runningWorkers.offer(((JobWorker) runnable));
}
}
- @SuppressWarnings("unlikely-arg-type")
+
@Override
protected void afterExecute(Runnable runnable, Throwable throwable) {
super.afterExecute(runnable, throwable);
@@ -69,9 +69,9 @@ public class NutchServerPoolExecutor extends ThreadPoolExecutor{
}
/**
- * Find the Job Worker Thread
- * @param jobId
- * @return
+ * Find the Job Worker Thread.
+ * @param jobId a jobId allows locating a specific worker thread
+ * @return a {@link JobWorker} or else null
*/
public JobWorker findWorker(String jobId) {
synchronized (runningWorkers) {
@@ -85,24 +85,24 @@ public class NutchServerPoolExecutor extends ThreadPoolExecutor{
}
/**
- * Gives the Job history
- * @return
+ * Get the Job history
+ * @return a {@link Collection} of {@link JobInfo}'s
*/
public Collection<JobInfo> getJobHistory() {
return getJobsInfo(workersHistory);
}
/**
- * Gives the list of currently running jobs
- * @return
+ * Get the list of currently running jobs
+ * @return a {@link Collection} of {@link JobInfo}'s
*/
public Collection<JobInfo> getJobRunning() {
return getJobsInfo(runningWorkers);
}
/**
- * Gives all jobs(currently running and completed)
- * @return
+ * get all jobs (currently running and completed)
+ * @return a {@link Collection} of {@link JobInfo}'s
*/
@SuppressWarnings("unchecked")
public Collection<JobInfo> getAllJobs() {
diff --git a/src/java/org/apache/nutch/service/model/request/JobConfig.java b/src/java/org/apache/nutch/service/model/request/JobConfig.java
index 76a43e0..ab80517 100644
--- a/src/java/org/apache/nutch/service/model/request/JobConfig.java
+++ b/src/java/org/apache/nutch/service/model/request/JobConfig.java
@@ -20,6 +20,10 @@ import java.util.Map;
import org.apache.nutch.service.JobManager.JobType;
+/**
+ * Job-specific configuration.
+ *
+ */
public class JobConfig {
private String crawlId;
private JobType type;
diff --git a/src/java/org/apache/nutch/service/model/response/JobInfo.java b/src/java/org/apache/nutch/service/model/response/JobInfo.java
index e952126..807c2d5 100644
--- a/src/java/org/apache/nutch/service/model/response/JobInfo.java
+++ b/src/java/org/apache/nutch/service/model/response/JobInfo.java
@@ -23,8 +23,6 @@ import org.apache.nutch.service.model.request.JobConfig;
/**
* This is the response object containing Job information
- *
- *
*/
public class JobInfo {
diff --git a/src/java/org/apache/nutch/service/resources/AdminResource.java b/src/java/org/apache/nutch/service/resources/AdminResource.java
index 8e1b4af..cc03d33 100644
--- a/src/java/org/apache/nutch/service/resources/AdminResource.java
+++ b/src/java/org/apache/nutch/service/resources/AdminResource.java
@@ -36,8 +36,8 @@ public class AdminResource extends AbstractResource{
.getLogger(MethodHandles.lookup().lookupClass());
/**
- * To get the status of the Nutch Server
- * @return
+ * Get the status of the Nutch Server
+ * @return {@link NutchServerInfo} for the running service
*/
@GET
@Path(value="/")
@@ -53,7 +53,7 @@ public class AdminResource extends AbstractResource{
/**
* Stop the Nutch server
* @param force If set to true, it will kill any running jobs
- * @return
+ * @return a message indicating shutdown status
*/
@GET
@Path(value="/stop")
diff --git a/src/java/org/apache/nutch/service/resources/ConfigResource.java b/src/java/org/apache/nutch/service/resources/ConfigResource.java
index c6372ee..38e14dc 100644
--- a/src/java/org/apache/nutch/service/resources/ConfigResource.java
+++ b/src/java/org/apache/nutch/service/resources/ConfigResource.java
@@ -88,7 +88,7 @@ public class ConfigResource extends AbstractResource{
/**
* Create new configuration.
- * @param newConfig
+ * @param newConfig a new populated {@link NutchConfig}
* @return The name of the new configuration created
*/
@POST
diff --git a/src/java/org/apache/nutch/service/resources/JobResource.java b/src/java/org/apache/nutch/service/resources/JobResource.java
index 3111b85..0641d21 100644
--- a/src/java/org/apache/nutch/service/resources/JobResource.java
+++ b/src/java/org/apache/nutch/service/resources/JobResource.java
@@ -36,9 +36,9 @@ import org.apache.nutch.service.model.response.JobInfo.State;
public class JobResource extends AbstractResource {
/**
- * Get job history
- * @param crawlId
- * @return A nested JSON object of all the jobs created
+ * Get job history for a given job regardless of the job's state
+ * @param crawlId a crawlId
+ * @return A nested JSON object of all the jobs created for that crawlId
*/
@GET
@Path(value = "/")
@@ -64,8 +64,8 @@ public class JobResource extends AbstractResource {
/**
* Stop Job
* @param id Job ID
- * @param crawlId
- * @return
+ * @param crawlId Crawl ID
+ * @return true if stopped, false otherwise
*/
@GET
@Path(value = "/{id}/stop")
diff --git a/src/java/org/apache/nutch/service/resources/ReaderResouce.java b/src/java/org/apache/nutch/service/resources/ReaderResouce.java
index b1f9775..f2f52e9 100644
--- a/src/java/org/apache/nutch/service/resources/ReaderResouce.java
+++ b/src/java/org/apache/nutch/service/resources/ReaderResouce.java
@@ -46,7 +46,7 @@ public class ReaderResouce {
/**
* Read a sequence file
- * @param readerConf
+ * @param readerConf An initialized {@link ReaderConfig}
* @param nrows Number of rows to read. If not specified all rows will be read
* @param start Specify a starting line number to read the file from
* @param end The line number to read the file till
@@ -87,12 +87,12 @@ public class ReaderResouce {
/**
* Read link object
- * @param readerConf
- * @param nrows
- * @param start
- * @param end
- * @param count
- * @return
+ * @param readerConf An initialized {@link ReaderConfig}
+ * @param nrows Number of rows to read. If not specified all rows will be read
+ * @param start Specify a starting line number to read the file from
+ * @param end The line number to read the file till
+ * @param count Boolean value. If true, this endpoint will return the number of lines in the file
+ * @return Appropriate HTTP response based on the query
*/
@Path("/link/read")
@POST
@@ -110,7 +110,7 @@ public class ReaderResouce {
/**
* Get schema of the Node object
- * @return
+ * @return JSON object specifying the schema of the responses returned by the Node Reader
*/
@Path("/node")
@GET
@@ -129,12 +129,12 @@ public class ReaderResouce {
/**
* Read Node object as stored in the Nutch Webgraph
- * @param readerConf
- * @param nrows
- * @param start
- * @param end
- * @param count
- * @return
+ * @param readerConf An initialized {@link ReaderConfig}
+ * @param nrows Number of rows to read. If not specified all rows will be read
+ * @param start Specify a starting line number to read the file from
+ * @param end The line number to read the file till
+ * @param count Boolean value. If true, this endpoint will return the number of lines in the file
+ * @return Appropriate HTTP response based on the query
*/
@Path("/node/read")
@POST
diff --git a/src/java/org/apache/nutch/service/resources/SeedResource.java b/src/java/org/apache/nutch/service/resources/SeedResource.java
index 875968d..5c0815b 100644
--- a/src/java/org/apache/nutch/service/resources/SeedResource.java
+++ b/src/java/org/apache/nutch/service/resources/SeedResource.java
@@ -46,7 +46,7 @@ public class SeedResource extends AbstractResource {
/**
* Gets the list of seedFiles already created
- * @return
+ * @return {@link Map} of {@code <String, SeedList>}
*/
@GET
@Path("/")
@@ -63,8 +63,8 @@ public class SeedResource extends AbstractResource {
/**
* Method creates seed list file and returns temporary directory path
- * @param seedList
- * @return
+ * @param seedList a populated {@link SeedList}
+ * @return path to seedfile
*/
@POST
@Path("/create")
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
index 0fe6606..9a88f76 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlDataDumper.java
@@ -197,7 +197,7 @@ public class CommonCrawlDataDumper extends NutchTool implements Tool {
* containing one or more segments from which we wish to generate
* CBOR data from. Optionally, 3) a list of mimetypes and the 4)
* the gzip option may be provided.
- * @throws Exception
+ * @throws Exception if there is an error running this {@link NutchTool}
*/
public static void main(String[] args) throws Exception {
Configuration conf = NutchConfiguration.create();
@@ -206,12 +206,16 @@ public class CommonCrawlDataDumper extends NutchTool implements Tool {
}
/**
- * Constructor
+ * Configurable constructor
+ * @param config A populated {@link CommonCrawlConfig}
*/
public CommonCrawlDataDumper(CommonCrawlConfig config) {
this.config = config;
}
+ /**
+ * Constructor
+ */
public CommonCrawlDataDumper() {
}
@@ -227,8 +231,10 @@ public class CommonCrawlDataDumper extends NutchTool implements Tool {
* @param linkdb Path to linkdb.
* @param gzip a boolean flag indicating whether the CBOR content should also
* be gzipped.
+ * @param mimeTypes a string array of mimeTypes to filter on, everything else is excluded
* @param epochFilename if {@code true}, output files will be names using the epoch time (in milliseconds).
* @param extension a file extension to use with output documents.
+ * @param warc if true write as warc format
* @throws Exception if any exception occurs.
*/
public void dump(File outputDir, File segmentRootDir, File linkdb, boolean gzip,
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlFormat.java b/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
index aa2f351..6521966 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlFormat.java
@@ -34,30 +34,36 @@ import java.util.List;
public interface CommonCrawlFormat extends Closeable {
/**
- * @throws IOException
+ * Get a string representation of the JSON structure of the URL content.
+ * @return the JSON URL content string
+ * @throws IOException if there is a fatal I/O error obtaining JSON data
*/
- //public String getJsonData(boolean mapAll) throws IOException;
public String getJsonData() throws IOException;
/**
- * Returns a string representation of the JSON structure of the URL content
+ * Returns a string representation of the JSON structure of the URL content.
+ * Takes into consideration both the {@link Content} and {@link Metadata}
*
- * @param url
- * @param content
- * @param metadata
- * @return
+ * @param url the canonical url
+ * @param content url {@link Content}
+ * @param metadata url {@link Metadata}
+ * @return the JSON URL content string
+ * @throws IOException if there is a fatal I/O error obtaining JSON data
*/
public String getJsonData(String url, Content content, Metadata metadata)
throws IOException;
/**
- * Returns a string representation of the JSON structure of the URL content
- * takes into account the parsed metadata about the URL
- *
- * @param url
- * @param content
- * @param metadata
- * @return
+ * Returns a string representation of the JSON structure of the URL content.
+ * Takes into consideration the {@link Content}, {@link Metadata} and
+ * {@link ParseData}.
+ *
+ * @param url the canonical url
+ * @param content url {@link Content}
+ * @param metadata url {@link Metadata}
+ * @param parseData url {@link ParseData}
+ * @return the JSON URL content string
+ * @throws IOException if there is a fatal I/O error obtaining JSON data
*/
public String getJsonData(String url, Content content, Metadata metadata,
ParseData parseData) throws IOException;
diff --git a/src/java/org/apache/nutch/tools/DmozParser.java b/src/java/org/apache/nutch/tools/DmozParser.java
index a447646..b68facb 100644
--- a/src/java/org/apache/nutch/tools/DmozParser.java
+++ b/src/java/org/apache/nutch/tools/DmozParser.java
@@ -270,6 +270,18 @@ public class DmozParser {
/**
* Iterate through all the items in this structured DMOZ file. Add each URL to
* the web db.
+ * @param dmozFile the input DMOZ {@link File}
+ * @param subsetDenom Subset denominator filter
+ * @param includeAdult To include adult content or not.
+ * @param skew skew factor for the subset denominator filter.
+ * Only emit with a chance of 1/denominator
+ * @param topicPattern a {@link java.util.regex.Pattern} which
+ * will match against the "r:id" element
+ * @throws IOException if there is a fatal error reading the input DMOZ file
+ * @throws SAXException can be thrown if there is an error configuring the
+ * internal {@link SAXParser} or {@link XMLReader}
+ * @throws ParserConfigurationException can be thrown if there is an
+ * error configuring the internal {@link SAXParserFactory}
*/
public void parseDmozFile(File dmozFile, int subsetDenom,
boolean includeAdult, int skew, Pattern topicPattern)
@@ -327,6 +339,9 @@ public class DmozParser {
* Command-line access. User may add URLs via a flat text file or the
* structured DMOZ file. By default, we ignore Adult material (as categorized
* by DMOZ).
+ * @param argv input arguments for this tool. If less than one
+ * argument is provided the tool will print help.
+ * @throws Exception if there is a fatal error
*/
public static void main(String[] argv) throws Exception {
if (argv.length < 1) {
diff --git a/src/java/org/apache/nutch/tools/FileDumper.java b/src/java/org/apache/nutch/tools/FileDumper.java
index 316b977..4e7338e 100644
--- a/src/java/org/apache/nutch/tools/FileDumper.java
+++ b/src/java/org/apache/nutch/tools/FileDumper.java
@@ -127,10 +127,11 @@ public class FileDumper {
* @param mimeTypeStats
* a flag indicating whether mimetype stats should be displayed
* instead of dumping files.
- * @throws Exception
+ * @param reverseURLDump whether to reverse the URLs when they are written to disk
+ * @throws Exception if there is a fatal error dumping files to disk
*/
- public void dump(File outputDir, File segmentRootDir, String[] mimeTypes, boolean flatDir, boolean mimeTypeStats, boolean reverseURLDump)
- throws Exception {
+ public void dump(File outputDir, File segmentRootDir, String[] mimeTypes, boolean
+ flatDir, boolean mimeTypeStats, boolean reverseURLDump) throws Exception {
if (mimeTypes == null)
LOG.info("Accepting all mimetypes.");
// total file counts
@@ -300,7 +301,7 @@ public class FileDumper {
* @param args
* 1) output directory (which will be created) to host the raw data
* and 2) a directory containing one or more segments.
- * @throws Exception
+ * @throws Exception if there is a fatal error running this tool
*/
public static void main(String[] args) throws Exception {
// boolean options
diff --git a/src/java/org/apache/nutch/tools/ResolveUrls.java b/src/java/org/apache/nutch/tools/ResolveUrls.java
index a7b2930..92077a2 100644
--- a/src/java/org/apache/nutch/tools/ResolveUrls.java
+++ b/src/java/org/apache/nutch/tools/ResolveUrls.java
@@ -159,6 +159,8 @@ public class ResolveUrls {
/**
* Runs the resolve urls tool.
+ * @param args the input arguments for this tool. Running
+ * with 'help' will print parameter options.
*/
public static void main(String[] args) {
diff --git a/src/java/org/apache/nutch/tools/WARCUtils.java b/src/java/org/apache/nutch/tools/WARCUtils.java
index 1af6533..6b6f1cf 100644
--- a/src/java/org/apache/nutch/tools/WARCUtils.java
+++ b/src/java/org/apache/nutch/tools/WARCUtils.java
@@ -183,6 +183,7 @@ public class WARCUtils {
* @param headers
* HTTP 1.1 or 1.0 response header string, CR-LF-separated lines,
* first line is status line
+ * @param contentLength Effective uncompressed and unchunked length of content
* @return safe HTTP response header
*/
public static final String fixHttpHeaders(String headers, int contentLength) {
diff --git a/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java b/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java
index 3b1593b..7a3ba83 100644
--- a/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java
+++ b/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java
@@ -40,7 +40,7 @@ public class ArcInputFormat extends FileInputFormat<Text, BytesWritable> {
return new SequenceFileRecordReader<Text, BytesWritable>();
}
/**
- * Returns the <code>RecordReader</code> for reading the arc file.
+ * Get the <code>RecordReader</code> for reading the arc file.
*
* @param split
* The InputSplit of the arc file to process.
@@ -48,6 +48,9 @@ public class ArcInputFormat extends FileInputFormat<Text, BytesWritable> {
* The job configuration.
* @param context
* The task context.
+ * @return A configured {@link ArcRecordReader}
+ * @throws IOException if there is a fatal I/O error reading
+ * the {@link InputSplit}
*/
public RecordReader<Text, BytesWritable> getRecordReader(InputSplit split,
Job job, Context context) throws IOException {
diff --git a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
index b5f7a44..0a93947 100644
--- a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
+++ b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
@@ -34,7 +34,6 @@ import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.util.ReflectionUtils;
-import org.apache.hadoop.util.StringUtils;
/**
* The <code>ArchRecordReader</code> class provides a record reader which reads
@@ -131,6 +130,7 @@ public class ArcRecordReader extends RecordReader<Text, BytesWritable> {
/**
* Creates a new instance of the <code>Text</code> object for the key.
+ * @return {@link Text}
*/
public Text createKey() {
return ReflectionUtils.newInstance(Text.class, conf);
@@ -138,6 +138,7 @@ public class ArcRecordReader extends RecordReader<Text, BytesWritable> {
/**
* Creates a new instance of the <code>BytesWritable</code> object for the key
+ * @return {@link BytesWritable}
*/
public BytesWritable createValue() {
return ReflectionUtils.newInstance(BytesWritable.class, conf);
@@ -147,6 +148,8 @@ public class ArcRecordReader extends RecordReader<Text, BytesWritable> {
* Returns the current position in the file.
*
* @return The long of the current position in the file.
+ * @throws IOException if there is a fatal I/O error reading
+ * the position within the {@link FSDataInputStream}
*/
public long getPos() throws IOException {
return in.getPos();
diff --git a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
index 7a26748..c0ebb2d 100644
--- a/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
+++ b/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
@@ -83,11 +83,9 @@ public class ArcSegmentCreator extends Configured implements Tool {
}
/**
- * <p>
* Constructor that sets the job configuration.
- * </p>
*
- * @param conf
+ * @param conf a populated {@link Configuration}
*/
public ArcSegmentCreator(Configuration conf) {
setConf(conf);
@@ -110,9 +108,7 @@ public class ArcSegmentCreator extends Configured implements Tool {
}
/**
- * <p>
* Logs any error that occurs during conversion.
- * </p>
*
* @param url
* The url we are parsing.
@@ -138,7 +134,6 @@ public class ArcSegmentCreator extends Configured implements Tool {
private int interval;
/**
- * <p>
* Parses the raw content of a single record to create output. This method is
* almost the same as the {@link org.apache.nutch.Fetcher#output} method in
* terms of processing and output.
@@ -264,10 +259,8 @@ public class ArcSegmentCreator extends Configured implements Tool {
}
/**
- * <p>
* Configures the job mapper. Sets the url filters, scoring filters, url normalizers
* and other relevant data.
- * </p>
*
* @param context
* The task context.
@@ -284,9 +277,7 @@ public class ArcSegmentCreator extends Configured implements Tool {
}
/**
- * <p>
* Runs the Map job to translate an arc record into output for Nutch segments.
- * </p>
*
* @param key
* The arc record header.
@@ -359,9 +350,7 @@ public class ArcSegmentCreator extends Configured implements Tool {
}
/**
- * <p>
* Creates the arc files to segments job.
- * </p>
*
* @param arcFiles
* The path to the directory holding the arc files
@@ -370,6 +359,9 @@ public class ArcSegmentCreator extends Configured implements Tool {
*
* @throws IOException
* If an IO error occurs while running the job.
+ * @throws InterruptedException if this {@link Job} is interrupted
+ * @throws ClassNotFoundException if there is an error locating a
+ * class during runtime
*/
public void createSegments(Path arcFiles, Path segmentsOutDir)
throws IOException, InterruptedException, ClassNotFoundException {
diff --git a/src/java/org/apache/nutch/util/CommandRunner.java b/src/java/org/apache/nutch/util/CommandRunner.java
index ae0a224..f9dcb61 100644
--- a/src/java/org/apache/nutch/util/CommandRunner.java
+++ b/src/java/org/apache/nutch/util/CommandRunner.java
@@ -15,19 +15,6 @@
* limitations under the License.
*/
-/*
- * Adopted by John Xing for Nutch Project from
- * http://blog.fivesight.com/prb/space/Call+an+External+Command+from+Java/,
- * which explains the code in detail.
- * [Original author is moving his site to http://mult.ifario.us/ -peb]
- *
- * Comments by John Xing on 20040621:
- * (1) EDU.oswego.cs.dl.util.concurrent.* is in j2sdk 1.5 now.
- * Modifications are needed if we move to j2sdk 1.5.
- * (2) The original looks good, not much to change.
- *
- * This code is in the public domain and comes with no warranty.
- */
package org.apache.nutch.util;
import java.io.IOException;
@@ -86,9 +73,10 @@ public class CommandRunner {
}
/**
- *
+ * Execute the command
* @return process exit value (return code) or -1 if timed out.
- * @throws IOException
+ * @throws IOException if there is a fatal error interfacing
+ * with the environment in which the application is running.
*/
public int exec() throws IOException {
Process proc = Runtime.getRuntime().exec(_command);
diff --git a/src/java/org/apache/nutch/util/DeflateUtils.java b/src/java/org/apache/nutch/util/DeflateUtils.java
index 11bb29f..5086025 100644
--- a/src/java/org/apache/nutch/util/DeflateUtils.java
+++ b/src/java/org/apache/nutch/util/DeflateUtils.java
@@ -41,6 +41,8 @@ public class DeflateUtils {
* Returns an inflated copy of the input array. If the deflated input has been
* truncated or corrupted, a best-effort attempt is made to inflate as much as
* possible. If no data can be extracted <code>null</code> is returned.
+ * @param in Deflated byte array
+ * @return An inflated copy of the input array, otherwise null
*/
public static final byte[] inflateBestEffort(byte[] in) {
return inflateBestEffort(in, Integer.MAX_VALUE);
@@ -51,6 +53,9 @@ public class DeflateUtils {
* <code>sizeLimit</code> bytes, if necessary. If the deflated input has been
* truncated or corrupted, a best-effort attempt is made to inflate as much as
* possible. If no data can be extracted <code>null</code> is returned.
+ * @param in Deflated byte array
+ * @param sizeLimit Maximum size (bytes) allowed for inflated copy
+ * @return An inflated copy of the input array, otherwise null
*/
public static final byte[] inflateBestEffort(byte[] in, int sizeLimit) {
// decompress using InflaterInputStream
@@ -90,7 +95,8 @@ public class DeflateUtils {
/**
* Returns an inflated copy of the input array.
- *
+ * @param in Deflated byte array
+ * @return An inflated copy of the input array, otherwise null
* @throws IOException
* if the input cannot be properly decompressed
*/
@@ -116,6 +122,8 @@ public class DeflateUtils {
/**
* Returns a deflated copy of the input array.
+ * @param in Inflated byte array
+ * @return A deflated copy of the input array, otherwise null
*/
public static final byte[] deflate(byte[] in) {
// compress using DeflaterOutputStream
diff --git a/src/java/org/apache/nutch/util/DomUtil.java b/src/java/org/apache/nutch/util/DomUtil.java
index d0bfafd..50cc43c 100644
--- a/src/java/org/apache/nutch/util/DomUtil.java
+++ b/src/java/org/apache/nutch/util/DomUtil.java
@@ -20,13 +20,11 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
-import java.io.UnsupportedEncodingException;
import java.lang.invoke.MethodHandles;
import java.nio.charset.StandardCharsets;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
-import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
@@ -51,7 +49,7 @@ public class DomUtil {
/**
* Returns parsed dom tree or null if any error
*
- * @param is
+ * @param is XML {@link InputStream}
* @return A parsed DOM tree from the given {@link InputStream}.
*/
public static Element getDom(InputStream is) {
@@ -81,10 +79,10 @@ public class DomUtil {
}
/**
- * save dom into ouputstream
+ * Save dom into {@link OutputStream}
*
- * @param os
- * @param e
+ * @param os Output DOM XML stream to save to
+ * @param e A specific DOM {@link org.w3c.dom.Element} to save
*/
public static void saveDom(OutputStream os, Element e) {
@@ -104,6 +102,12 @@ public class DomUtil {
}
}
+ /**
+ * Save dom into {@link OutputStream}
+ *
+ * @param os Output DOM XML stream to save to
+ * @param doc A specific DOM {@link org.w3c.dom.DocumentFragment} to save
+ */
public static void saveDom(OutputStream os, DocumentFragment doc) {
NodeList docChildren = doc.getChildNodes();
for (int i = 0; i < docChildren.getLength(); i++) {
diff --git a/src/java/org/apache/nutch/util/EncodingDetector.java b/src/java/org/apache/nutch/util/EncodingDetector.java
index 2b28447..4e921f4 100644
--- a/src/java/org/apache/nutch/util/EncodingDetector.java
+++ b/src/java/org/apache/nutch/util/EncodingDetector.java
@@ -324,6 +324,8 @@ public class EncodingDetector {
*
* @param contentType
* a content type header
+ * @return a trimmed string representation of the 'charset=' value,
+ * null if this is not available
*/
public static String parseCharacterEncoding(String contentType) {
if (contentType == null)
diff --git a/src/java/org/apache/nutch/util/GZIPUtils.java b/src/java/org/apache/nutch/util/GZIPUtils.java
index dc40a7f..7daa6a7 100644
--- a/src/java/org/apache/nutch/util/GZIPUtils.java
+++ b/src/java/org/apache/nutch/util/GZIPUtils.java
@@ -40,6 +40,8 @@ public class GZIPUtils {
* Returns an gunzipped copy of the input array. If the gzipped input has been
* truncated or corrupted, a best-effort attempt is made to unzip as much as
* possible. If no data can be extracted <code>null</code> is returned.
+ * @param in byte array to gunzip
+ * @return the gunzipped byte array, or null
*/
public static final byte[] unzipBestEffort(byte[] in) {
return unzipBestEffort(in, Integer.MAX_VALUE);
@@ -50,6 +52,9 @@ public class GZIPUtils {
* <code>sizeLimit</code> bytes, if necessary. If the gzipped input has been
* truncated or corrupted, a best-effort attempt is made to unzip as much as
* possible. If no data can be extracted <code>null</code> is returned.
+ * @param in Byte array to gunzip
+ * @param sizeLimit Maximum size allowed for gunzipped byte array
+ * @return the gunzipped byte array, or null
*/
public static final byte[] unzipBestEffort(byte[] in, int sizeLimit) {
try {
@@ -91,7 +96,8 @@ public class GZIPUtils {
/**
* Returns an gunzipped copy of the input array.
- *
+ * @param in Byte array to gunzip
+ * @return the gunzipped byte array
* @throws IOException
* if the input cannot be properly decompressed
*/
@@ -116,6 +122,8 @@ public class GZIPUtils {
/**
* Returns an gzipped copy of the input array.
+ * @param in Byte array to zip
+ * @return the zipped byte array
*/
public static final byte[] zip(byte[] in) {
try {
diff --git a/src/java/org/apache/nutch/util/HadoopFSUtil.java b/src/java/org/apache/nutch/util/HadoopFSUtil.java
index 53c506d..08984ce 100644
--- a/src/java/org/apache/nutch/util/HadoopFSUtil.java
+++ b/src/java/org/apache/nutch/util/HadoopFSUtil.java
@@ -26,14 +26,18 @@ import org.apache.hadoop.fs.PathFilter;
public class HadoopFSUtil {
/**
- * Returns PathFilter that passes all paths through.
+ * Get a path filter which allows all paths.
+ * @return {@link PathFilter}
*/
public static PathFilter getPassAllFilter() {
return arg0 -> true;
}
/**
- * Returns PathFilter that passes directories through.
+ * Get a path filter which allows all directories.
+ * @param fs A {@link org.apache.hadoop.fs.FileSystem} used
+ * to determine directories.
+ * @return {@link PathFilter}
*/
public static PathFilter getPassDirectoriesFilter(final FileSystem fs) {
return path -> {
@@ -47,6 +51,9 @@ public class HadoopFSUtil {
/**
* Turns an array of FileStatus into an array of Paths.
+ * May return null if input is null.
+ * @param stats A {@link org.apache.hadoop.fs.FileStatus} array
+ * @return {@link org.apache.hadoop.fs.Path} array
*/
public static Path[] getPaths(FileStatus[] stats) {
if (stats == null) {
diff --git a/src/java/org/apache/nutch/util/NodeWalker.java b/src/java/org/apache/nutch/util/NodeWalker.java
index ff61ced..9296d47 100644
--- a/src/java/org/apache/nutch/util/NodeWalker.java
+++ b/src/java/org/apache/nutch/util/NodeWalker.java
@@ -36,15 +36,15 @@ import org.w3c.dom.NodeList;
*/
public class NodeWalker {
- // the root node the the stack holding the nodes
+ // the root node of the stack holding the nodes
private Node currentNode;
private NodeList currentChildren;
private Stack<Node> nodes;
/**
- * Starts the <code>Node</code> tree from the root node.
+ * Starts the {@link org.w3c.dom.Node} tree from the root node.
*
- * @param rootNode
+ * @param rootNode A root node which will contain all nodes
*/
public NodeWalker(Node rootNode) {
diff --git a/src/java/org/apache/nutch/util/NutchConfiguration.java b/src/java/org/apache/nutch/util/NutchConfiguration.java
index 4089aec..6277846 100644
--- a/src/java/org/apache/nutch/util/NutchConfiguration.java
+++ b/src/java/org/apache/nutch/util/NutchConfiguration.java
@@ -58,6 +58,7 @@ public class NutchConfiguration {
* Create a {@link Configuration} for Nutch. This will load the standard Nutch
* resources, <code>nutch-default.xml</code> and <code>nutch-site.xml</code>
* overrides.
+ * @return A populated {@link org.apache.hadoop.conf.Configuration}
*/
public static Configuration create() {
Configuration conf = new Configuration();
@@ -75,6 +76,7 @@ public class NutchConfiguration {
* properties. Otherwise these resources won't be used.
* @param nutchProperties
* a set of properties to define (or override)
+ * @return A populated {@link org.apache.hadoop.conf.Configuration}
*/
public static Configuration create(boolean addNutchResources,
Properties nutchProperties) {
diff --git a/src/java/org/apache/nutch/util/NutchJob.java b/src/java/org/apache/nutch/util/NutchJob.java
index 13257d2..04b38df 100644
--- a/src/java/org/apache/nutch/util/NutchJob.java
+++ b/src/java/org/apache/nutch/util/NutchJob.java
@@ -43,6 +43,12 @@ public class NutchJob extends Job {
/**
* Clean up the file system in case of a job failure.
+ * @param tempDir The temporary directory which needs to be
+ * deleted/cleaned-up.
+ * @param fs The {@link org.apache.hadoop.fs.FileSystem} on which
+ * the tempDir resides.
+ * @throws IOException Occurs if there is fatal I/O error whilst performing
+ * the cleanup.
*/
public static void cleanupAfterFailure(Path tempDir, FileSystem fs)
throws IOException {
@@ -51,6 +57,14 @@ public class NutchJob extends Job {
/**
* Clean up the file system in case of a job failure.
+ * @param tempDir The temporary directory which needs to be
+ * deleted/cleaned-up.
+ * @param lock A lockfile if one exists.
+ * @see LockUtil#removeLockFile(FileSystem, Path)
+ * @param fs The {@link org.apache.hadoop.fs.FileSystem} on which
+ * the tempDir resides.
+ * @throws IOException Occurs if there is fatal I/O error whilst performing
+ * the cleanup.
*/
public static void cleanupAfterFailure(Path tempDir, Path lock, FileSystem fs)
throws IOException {
diff --git a/src/java/org/apache/nutch/util/NutchTool.java b/src/java/org/apache/nutch/util/NutchTool.java
index f7b0b76..244ae99 100644
--- a/src/java/org/apache/nutch/util/NutchTool.java
+++ b/src/java/org/apache/nutch/util/NutchTool.java
@@ -37,6 +37,11 @@ public abstract class NutchTool extends Configured {
/**
* Runs the tool, using a map of arguments. May return results, or null.
+ * @param args a {@link Map} of arguments to be run with the tool
+ * @param crawlId a crawl identifier to associate with the tool invocation
+ * @return Map results object if tool executes successfully
+ * otherwise null
+ * @throws Exception if there is an error during the tool execution
*/
public abstract Map<String, Object> run(Map<String, Object> args, String crawlId)
throws Exception;
@@ -49,7 +54,11 @@ public abstract class NutchTool extends Configured {
super(null);
}
- /** Returns relative progress of the tool, a float in range [0,1]. */
+ /**
+ * Get relative progress of the tool. Progress is represented as a
+ * float in range [0,1] where 1 is complete.
+ * @return a float in range [0,1].
+ */
public float getProgress() {
float res = 0;
if (currentJob != null) {
@@ -71,7 +80,11 @@ public abstract class NutchTool extends Configured {
return res;
}
- /** Returns current status of the running tool. */
+ /**
+ * Returns current status of the running tool
+ * @return a populated {@link Map}, the fields
+ * of which can be accessed to obtain status.
+ */
public Map<String, Object> getStatus() {
return status;
}
@@ -81,6 +94,8 @@ public abstract class NutchTool extends Configured {
* this, since by default it calls {@link #killJob()}.
*
* @return true if succeeded, false otherwise
+ * @throws Exception if there is an error stopping the current
+ * {@link org.apache.hadoop.mapreduce.Job}
*/
public boolean stopJob() throws Exception {
return killJob();
@@ -88,10 +103,11 @@ public abstract class NutchTool extends Configured {
/**
* Kill the job immediately. Clients should assume that any results that the
- * job produced so far are in inconsistent state or missing.
+ * job produced so far are in an inconsistent state or missing.
*
* @return true if succeeded, false otherwise.
- * @throws Exception
+ * @throws Exception if there is an error stopping the current
+ * {@link org.apache.hadoop.mapreduce.Job}
*/
public boolean killJob() throws Exception {
if (currentJob != null && !currentJob.isComplete()) {
diff --git a/src/java/org/apache/nutch/util/PrefixStringMatcher.java b/src/java/org/apache/nutch/util/PrefixStringMatcher.java
index 3be0fd7..3f4863f 100644
--- a/src/java/org/apache/nutch/util/PrefixStringMatcher.java
+++ b/src/java/org/apache/nutch/util/PrefixStringMatcher.java
@@ -32,6 +32,7 @@ public class PrefixStringMatcher extends TrieStringMatcher {
* Creates a new <code>PrefixStringMatcher</code> which will match
* <code>String</code>s with any prefix in the supplied array. Zero-length
* <code>Strings</code> are ignored.
+ * @param prefixes An array containing string prefixes
*/
public PrefixStringMatcher(String[] prefixes) {
super();
@@ -42,8 +43,8 @@ public class PrefixStringMatcher extends TrieStringMatcher {
/**
* Creates a new <code>PrefixStringMatcher</code> which will match
* <code>String</code>s with any prefix in the supplied
- * <code>Collection</code>.
- *
+ * {@link Collection}.
+ * @param prefixes A collection containing string prefixes
* @throws ClassCastException
* if any <code>Object</code>s in the collection are not
* <code>String</code>s
diff --git a/src/java/org/apache/nutch/util/StringUtil.java b/src/java/org/apache/nutch/util/StringUtil.java
index b63364d..10ff51c 100644
--- a/src/java/org/apache/nutch/util/StringUtil.java
+++ b/src/java/org/apache/nutch/util/StringUtil.java
@@ -22,9 +22,13 @@ package org.apache.nutch.util;
public class StringUtil {
/**
- * Returns a copy of <code>s</code> padded with trailing spaces so that it's
- * length is <code>length</code>. Strings already <code>length</code>
- * characters long or longer are not altered.
+ * Returns a copy of <code>s</code> (right padded) with trailing
+ * spaces so that its length is <code>length</code>.
+ * Strings already <code>length</code> characters long or longer
+ * are not altered.
+ * @param s input string to be copied and processed
+ * @param length desired final length of padded string
+ * @return the resulting padded string
*/
public static String rightPad(String s, int length) {
StringBuffer sb = new StringBuffer(s);
@@ -34,9 +38,12 @@ public class StringUtil {
}
/**
- * Returns a copy of <code>s</code> padded with leading spaces so that it's
- * length is <code>length</code>. Strings already <code>length</code>
- * characters long or longer are not altered.
+ * Returns a copy of <code>s</code> (left padded) with leading
+ * spaces so that its length is <code>length</code>. Strings
+ * already <code>length</code> characters long or longer are not altered.
+ * @param s input string to be copied and processed
+ * @param length desired final length of padded string
+ * @return the resulting padded string
*/
public static String leftPad(String s, int length) {
StringBuffer sb = new StringBuffer();
@@ -53,7 +60,8 @@ public class StringUtil {
* Convenience call for {@link #toHexString(byte[], String, int)}, where
* <code>sep = null; lineLen = Integer.MAX_VALUE</code>.
*
- * @param buf
+ * @param buf input data for which to generate a hex string
+ * @return the hex string
*/
public static String toHexString(byte[] buf) {
return toHexString(buf, null, Integer.MAX_VALUE);
@@ -63,14 +71,14 @@ public class StringUtil {
* Get a text representation of a byte[] as hexadecimal String, where each
* pair of hexadecimal digits corresponds to consecutive bytes in the array.
*
- * @param buf
- * input data
+ * @param buf input data for which to generate a hex string
* @param sep
* separate every pair of hexadecimal digits with this separator, or
* null if no separation is needed.
* @param lineLen
* break the output String into lines containing output for lineLen
* bytes.
+ * @return the hex string
*/
public static String toHexString(byte[] buf, String sep, int lineLen) {
if (buf == null)
@@ -132,13 +140,17 @@ public class StringUtil {
/**
* Checks if a string is empty (ie is null or empty).
+ * @param str the String to check for being empty or null
+ * @return true if empty or null, false otherwise
*/
public static boolean isEmpty(String str) {
return (str == null) || (str.equals(""));
}
/**
- * Simple character substitution which cleans all � chars from a given String.
+ * Simple character substitution which cleans/removes all � chars from a given String.
+ * @param value the String to clean
+ * @return substituted cleaned string
*/
public static String cleanField(String value) {
return value.replaceAll("�", "");
diff --git a/src/java/org/apache/nutch/util/SuffixStringMatcher.java b/src/java/org/apache/nutch/util/SuffixStringMatcher.java
index 46df52a..1bf0774 100644
--- a/src/java/org/apache/nutch/util/SuffixStringMatcher.java
+++ b/src/java/org/apache/nutch/util/SuffixStringMatcher.java
@@ -28,6 +28,7 @@ public class SuffixStringMatcher extends TrieStringMatcher {
/**
* Creates a new <code>PrefixStringMatcher</code> which will match
* <code>String</code>s with any suffix in the supplied array.
+ * @param suffixes An array containing string suffixes
*/
public SuffixStringMatcher(String[] suffixes) {
super();
@@ -39,6 +40,7 @@ public class SuffixStringMatcher extends TrieStringMatcher {
* Creates a new <code>PrefixStringMatcher</code> which will match
* <code>String</code>s with any suffix in the supplied
* <code>Collection</code>
+ * @param suffixes A {@link Collection} containing string suffixes
*/
public SuffixStringMatcher(Collection<String> suffixes) {
super();
diff --git a/src/java/org/apache/nutch/util/TableUtil.java b/src/java/org/apache/nutch/util/TableUtil.java
index 1414d15..7b0b5ce 100644
--- a/src/java/org/apache/nutch/util/TableUtil.java
+++ b/src/java/org/apache/nutch/util/TableUtil.java
@@ -36,7 +36,7 @@ public class TableUtil {
* @param urlString
* url to be reversed
* @return Reversed url
- * @throws MalformedURLException
+ * @throws MalformedURLException if the input urlString is malformed
*/
public static String reverseUrl(String urlString)
throws MalformedURLException {
diff --git a/src/java/org/apache/nutch/util/TimingUtil.java b/src/java/org/apache/nutch/util/TimingUtil.java
index 3f3e74e..6ae9441 100644
--- a/src/java/org/apache/nutch/util/TimingUtil.java
+++ b/src/java/org/apache/nutch/util/TimingUtil.java
@@ -30,6 +30,8 @@ public class TimingUtil {
/**
* Convert epoch milliseconds ({@link System#currentTimeMillis()}) into date
* string (local time zone) used for logging
+ * @param millis An epoch milliseconds representation
+ * @return a local time zone date string
*/
public static String logDateMillis(long millis) {
return logDateFormat.format(
diff --git a/src/java/org/apache/nutch/util/TrieStringMatcher.java b/src/java/org/apache/nutch/util/TrieStringMatcher.java
index d974ecb..20cd848 100644
--- a/src/java/org/apache/nutch/util/TrieStringMatcher.java
+++ b/src/java/org/apache/nutch/util/TrieStringMatcher.java
@@ -152,9 +152,14 @@ public abstract class TrieStringMatcher {
}
/**
- * Returns the next {@link TrieNode} visited, given that you are at
- * <code>node</code>, and the the next character in the input is the
- * <code>idx</code>'th character of <code>s</code>.
+ * Get the next {@link TrieNode} visited, given that you are at
+ * <code>node</code>, and that the next character in the input is the
+ * <code>idx</code>'th character of <code>s</code>. Can return null.
+ * @see TrieNode#getChild(char)
+ * @param node Input {@link TrieNode} containing child nodes
+ * @param s String to match character at indexed position
+ * @param idx Indexed position in input string
+ * @return child {@link TrieNode}
*/
protected final TrieNode matchChar(TrieNode node, String s, int idx) {
return node.getChild(s.charAt(idx));
@@ -164,6 +169,7 @@ public abstract class TrieStringMatcher {
* Adds any necessary nodes to the trie so that the given <code>String</code>
* can be decoded and the last character is represented by a terminal node.
* Zero-length <code>Strings</code> are ignored.
+ * @param s String to be decoded.
*/
protected final void addPatternForward(String s) {
TrieNode node = root;
@@ -180,6 +186,7 @@ public abstract class TrieStringMatcher {
* Adds any necessary nodes to the trie so that the given <code>String</code>
* can be decoded <em>in reverse</em> and the first character is represented
* by a terminal node. Zero-length <code>Strings</code> are ignored.
+ * @param s String to be decoded.
*/
protected final void addPatternBackward(String s) {
TrieNode node = root;
@@ -193,6 +200,8 @@ public abstract class TrieStringMatcher {
/**
* Returns true if the given <code>String</code> is matched by a pattern in
* the trie
+ * @param input A String to be matched by a pattern
+ * @return true if there is a match, false otherwise
*/
public abstract boolean matches(String input);
@@ -200,6 +209,8 @@ public abstract class TrieStringMatcher {
* Returns the shortest substring of <code>input</code> that is
* matched by a pattern in the trie, or <code>null</code> if no match
* exists.
+ * @param input A String to be matched by a pattern
+ * @return shortest string match or null if no match is made
*/
public abstract String shortestMatch(String input);
@@ -207,6 +218,8 @@ public abstract class TrieStringMatcher {
* Returns the longest substring of <code>input</code> that is
* matched by a pattern in the trie, or <code>null</code> if no match
* exists.
+ * @param input A String to be matched by a pattern
+ * @return longest string match or null if no match is made
*/
public abstract String longestMatch(String input);
diff --git a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java
index e500f5a..24a4cf0 100644
--- a/src/java/org/apache/nutch/util/URLUtil.java
+++ b/src/java/org/apache/nutch/util/URLUtil.java
@@ -37,7 +37,7 @@ public class URLUtil {
* @param target
* target url (may be relative)
* @return resolved absolute url.
- * @throws MalformedURLException
+ * @throws MalformedURLException if the input base URL is malformed
*/
public static URL resolveURL(URL base, String target)
throws MalformedURLException {
@@ -84,13 +84,15 @@ public class URLUtil {
.compile("(\\d{1,3}\\.){3}(\\d{1,3})");
/**
- * Returns the domain name of the url. The domain name of a url is the
+ * Get the domain name of the url. The domain name of a url is the
* substring of the url's hostname, w/o subdomain names. As an example <br>
* <code>
- * getDomainName(conf, new URL(http://lucene.apache.org/))
+ * getDomainName(new URL(http://lucene.apache.org/))
* </code><br>
* will return <br>
- * <code> apache.org</code>
+ * <code>apache.org</code>
+ * @param url An input {@link URL} to extract the domain from
+ * @return the domain name string
* */
public static String getDomainName(URL url) {
DomainSuffixes tlds = DomainSuffixes.getInstance();
@@ -122,8 +124,9 @@ public class URLUtil {
* </code><br>
* will return <br>
* <code> apache.org</code>
- *
- * @throws MalformedURLException
+ * @param url An input url string to extract the domain from
+ * @return the domain name
+ * @throws MalformedURLException if the input url is malformed
*/
public static String getDomainName(String url) throws MalformedURLException {
return getDomainName(new URL(url));
@@ -139,7 +142,10 @@ public class URLUtil {
* will return <br>
* <code> org</code>
*
- * @throws MalformedURLException
+ * @param url An input {@link URL} to extract the top
+ * level domain name from
+ * @return the top level domain name
+ * @throws MalformedURLException if the input url is malformed
*/
public static String getTopLevelDomainName(URL url)
throws MalformedURLException {
@@ -162,7 +168,10 @@ public class URLUtil {
* will return <br>
* <code> org</code>
*
- * @throws MalformedURLException
+ * @param url An input url string to extract the top
+ * level domain name from
+ * @return the top level domain name
+ * @throws MalformedURLException if the input url is malformed
*/
public static String getTopLevelDomainName(String url)
throws MalformedURLException {
@@ -174,6 +183,8 @@ public class URLUtil {
* <code> isSameDomain(new URL("http://lucene.apache.org")
* , new URL("http://people.apache.org/"))
* <br> will return true. </code>
+ * @param url1 first {@link URL} to compare domain name
+ * @param url2 second {@link URL} to compare domain name
*
* @return true if the domain names are equal
*/
@@ -186,9 +197,10 @@ public class URLUtil {
* <code> isSameDomain("http://lucene.apache.org"
* ,"http://people.apache.org/")
* <br> will return true. </code>
- *
+ * @param url1 first url string to compare domain name
+ * @param url2 second url string to compare domain name
* @return true if the domain names are equal
- * @throws MalformedURLException
+ * @throws MalformedURLException if either of the input urls are malformed
*/
public static boolean isSameDomainName(String url1, String url2)
throws MalformedURLException {
@@ -198,6 +210,8 @@ public class URLUtil {
/**
* Returns the {@link DomainSuffix} corresponding to the last public part of
* the hostname
+ * @param url a {@link URL} to extract the domain suffix from
+ * @return a {@link org.apache.nutch.util.domain.DomainSuffix}
*/
public static DomainSuffix getDomainSuffix(URL url) {
DomainSuffixes tlds = DomainSuffixes.getInstance();
@@ -222,13 +236,20 @@ public class URLUtil {
/**
* Returns the {@link DomainSuffix} corresponding to the last public part of
* the hostname
+ * @param url a {@link URL} to extract the domain suffix from
+ * @return a {@link org.apache.nutch.util.domain.DomainSuffix}
+ * @throws MalformedURLException if the input url string is malformed
*/
public static DomainSuffix getDomainSuffix(String url)
throws MalformedURLException {
return getDomainSuffix(new URL(url));
}
- /** Partitions of the hostname of the url by "." */
+ /**
+ * Partitions of the hostname of the url by "."
+ * @param url a {@link URL} to extract host segments from
+ * @return a string array of host segments
+ */
public static String[] getHostSegments(URL url) {
String host = url.getHost();
// return whole hostname, if it is an ipv4
@@ -240,8 +261,9 @@ public class URLUtil {
/**
* Partitions of the hostname of the url by "."
- *
- * @throws MalformedURLException
+ * @param url a url string to extract host segments from
+ * @return a string array of host segments
+ * @throws MalformedURLException if the input url string is malformed
*/
public static String[] getHostSegments(String url)
throws MalformedURLException {
@@ -497,7 +519,10 @@ public class URLUtil {
}
}
- /** For testing */
+ /**
+ * For testing
+ * @param args print with no args to get help
+ */
public static void main(String[] args) {
if (args.length != 1) {
diff --git a/src/java/org/apache/nutch/util/domain/DomainSuffixes.java b/src/java/org/apache/nutch/util/domain/DomainSuffixes.java
index 9047ecf..ae0d31b 100644
--- a/src/java/org/apache/nutch/util/domain/DomainSuffixes.java
+++ b/src/java/org/apache/nutch/util/domain/DomainSuffixes.java
@@ -66,7 +66,11 @@ public class DomainSuffixes {
domains.put(tld.getDomain(), tld);
}
- /** return whether the extension is a registered domain entry */
+ /**
+ * Return whether the extension is a registered domain entry
+ * @param extension a String extension
+ * @return true if input is a registered domain entry, false otherwise
+ */
public boolean isDomainSuffix(String extension) {
return domains.containsKey(extension);
}
@@ -78,6 +82,7 @@ public class DomainSuffixes {
*
* @param extension
* of the domain
+ * @return {@link DomainSuffix}
*/
public DomainSuffix get(String extension) {
return domains.get(extension);
diff --git a/src/java/org/apache/nutch/webui/client/NutchClient.java b/src/java/org/apache/nutch/webui/client/NutchClient.java
index bd8072e..5b2dab3 100644
--- a/src/java/org/apache/nutch/webui/client/NutchClient.java
+++ b/src/java/org/apache/nutch/webui/client/NutchClient.java
@@ -42,8 +42,8 @@ public interface NutchClient {
/**
* Create seed list and return seed directory location
*
- * @param seedList
- * @return
+ * @param seedList a populated {@link org.apache.nutch.webui.model.SeedList}
+ * @return a JSON HTTP response indicating the seed creation result
*/
public String createSeed(SeedList seedList);
}
diff --git a/src/java/org/apache/nutch/webui/pages/components/CpmIteratorAdapter.java b/src/java/org/apache/nutch/webui/pages/components/CpmIteratorAdapter.java
index 9ffa77b..ae31003 100644
--- a/src/java/org/apache/nutch/webui/pages/components/CpmIteratorAdapter.java
+++ b/src/java/org/apache/nutch/webui/pages/components/CpmIteratorAdapter.java
@@ -26,7 +26,6 @@ import org.apache.wicket.model.IModel;
*
* @author feodor
*
- * @param <T>
*/
public class CpmIteratorAdapter<T> extends ModelIteratorAdapter<T> {
public CpmIteratorAdapter(Iterable<T> iterable) {
diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
index 27c38ef..e56aaa6 100644
--- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
+++ b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23IndexingFilter.java
@@ -36,7 +36,7 @@ import org.slf4j.LoggerFactory;
* <p>This implementation of {@link org.apache.nutch.indexer.IndexingFilter}
* adds a <i>triple(s)</i> field to the {@link org.apache.nutch.indexer.NutchDocument}.</p>
* <p>Triples are extracted via <a href="https://any23.apache.org/">Apache Any23</a>.</p>
- * @see {@link org.apache.nutch.any23.Any23ParseFilter}.
+ * @see org.apache.nutch.any23.Any23ParseFilter
*/
public class Any23IndexingFilter implements IndexingFilter {
@@ -81,7 +81,7 @@ public class Any23IndexingFilter implements IndexingFilter {
* @return filtered NutchDocument
* @see org.apache.nutch.indexer.IndexingFilter#filter(NutchDocument, Parse, Text, CrawlDatum, Inlinks)
*
- * @throws IndexingException
+ * @throws IndexingException if there is a fatal error whilst indexing
*/
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
diff --git a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
index 7fc32a8..d9f0896 100644
--- a/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
+++ b/src/plugin/any23/src/java/org/apache/nutch/any23/Any23ParseFilter.java
@@ -47,15 +47,13 @@ import org.w3c.dom.DocumentFragment;
* uses the <a href="https://any23.apache.org/">Apache Any23</a> library
* for parsing and extracting structured data in RDF format from a
* variety of Web documents. The supported formats can be found at <a href="https://any23.apache.org/">Apache Any23</a>.
- * <p>In this implementation triples are written as Notation3 e.g.
- * <code><http://www.bbc.co.uk/news/scotland/> <http://iptc.org/std/rNews/2011-10-07#datePublished> "2014/03/31 13:53:03"@en-gb .</code>
+ * <p>In this implementation triples are written as <a href="https://www.w3.org/TeamSubmission/n3/">Notation3</a>
* and triples are identified within output triple streams by the presence of '\n'.
* The presence of the '\n' is a characteristic specific to N3 serialization in Any23.
* In order to use another/other writers implementing the
* <a href="https://any23.apache.org/apidocs/index.html?org/apache/any23/writer/TripleHandler.html">TripleHandler</a>
* interface, we will most likely need to identify an alternative data characteristic
* which we can use to split triples streams.</p>
- * <p>
*/
public class Any23ParseFilter implements HtmlParseFilter {
diff --git a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
index 8636580..e0a4253 100644
--- a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
+++ b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
@@ -86,6 +86,8 @@ public class CCIndexingFilter implements IndexingFilter {
* Add the features represented by a license URL. Urls are of the form
* "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a
* license feature.
+ * @param doc a {@link org.apache.nutch.indexer.NutchDocument} to augment
+ * @param urlString the url to extract features from
*/
public void addUrlFeatures(NutchDocument doc, String urlString) {
try {
diff --git a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
index 3c9a8b2..ba10432 100644
--- a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
+++ b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
@@ -65,7 +65,15 @@ public class CCParseFilter implements HtmlParseFilter {
this.base = base;
}
- /** Scan the document adding attributes to metadata. */
+ /**
+ * Scan the document adding attributes to metadata.
+ * @param doc the {@link org.w3c.dom.Node} to walk and process
+ * @param base canonical url
+ * @param metadata url {@link org.apache.nutch.metadata.Metadata}
+ * @param conf a populated {@link org.apache.hadoop.conf.Configuration}
+ * @throws ParseException if there is a fatal error or if
+ * <code>creativecommons.exclude.unlicensed</code> is set to true
+ */
public static void walk(Node doc, URL base, Metadata metadata,
Configuration conf) throws ParseException {
@@ -86,7 +94,7 @@ public class CCParseFilter implements HtmlParseFilter {
licenseLocation = "a";
licenseUrl = walker.anchorLicense.toString();
} else if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) {
- throw new ParseException("No CC license. Excluding.");
+ throw new ParseException("No CC license. Excluding.");
}
// add license to metadata
diff --git a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java b/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
index d955001..4b446bb 100644
--- a/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
+++ b/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
@@ -81,6 +81,9 @@ public class HeadingsParseFilter implements HtmlParseFilter {
/**
* Finds the specified element and returns its value
+ * @param doc the input {@link org.w3c.dom.DocumentFragment} to process
+ * @param element the element to find in the DocumentFragment
+ * @return a {@link java.util.List} containing headings
*/
protected List<String> getElement(DocumentFragment doc, String element) {
List<String> headings = new ArrayList<>();
@@ -107,6 +110,8 @@ public class HeadingsParseFilter implements HtmlParseFilter {
/**
* Returns the text value of the specified Node and child nodes
+ * @param node the input {@link Node} to extract a value(s) for
+ * @return the whitespace-stripped String node value(s)
*/
protected static String getNodeValue(Node node) {
StringBuilder buffer = new StringBuilder();
diff --git a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
index d42ccdd..1c697a2 100644
--- a/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
+++ b/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
@@ -54,7 +54,12 @@ import com.maxmind.geoip2.record.Traits;
*/
public class GeoIPDocumentCreator {
- /** Add field to document but only if value isn't null */
+ /**
+ * Add field to document but only if value isn't null
+ * @param doc the {@link NutchDocument} to augment
+ * @param name the name of the target field
+ * @param value the String value to associate with the target field
+ */
public static void addIfNotNull(NutchDocument doc, String name,
String value) {
if (value != null) {
@@ -62,7 +67,13 @@ public class GeoIPDocumentCreator {
}
}
- /** Add field to document but only if value isn't null */
+ /**
+ * Add field to document but only if value isn't null
+ * @param doc the {@link NutchDocument} to augment
+ * @param name the name of the target field
+ * @param value the {@link java.lang.Integer} value to
+ * associate with the target field
+ */
public static void addIfNotNull(NutchDocument doc, String name,
Integer value) {
if (value != null) {
diff --git a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
index 3d4f9c5..e2f722c 100644
--- a/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
+++ b/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
@@ -17,10 +17,7 @@
package org.apache.nutch.indexer.metadata;
import java.util.Arrays;
-import java.util.HashMap;
import java.util.HashSet;
-import java.util.Locale;
-import java.util.Map;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
@@ -37,7 +34,7 @@ import org.apache.nutch.parse.Parse;
* Indexer which can be configured to extract metadata from the crawldb, parse
* metadata or content metadata. You can specify the properties "index.db.md",
* "index.parse.md" or "index.content.md" who's values are comma-delimited
- * <value>key1,key2,key3</value>.
+ * <code>key1,key2,key3</code>.
*/
public class MetadataIndexer implements IndexingFilter {
private Configuration conf;
diff --git a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/FieldReplacer.java b/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/FieldReplacer.java
index 04c5765..ecea343 100644
--- a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/FieldReplacer.java
+++ b/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/FieldReplacer.java
@@ -108,9 +108,13 @@ public class FieldReplacer {
* Field replacer with the input and output field the same.
*
* @param fieldName
+ * the name of the source field to operate on. Required.
* @param pattern
+ * the pattern the field must match. Required.
* @param replacement
+ * the replacement string
* @param flags
+ * the Integer flags value, or null if no flags are needed
*/
public FieldReplacer(String fieldName, String pattern, String replacement,
Integer flags) {
@@ -136,7 +140,7 @@ public class FieldReplacer {
/**
* Does this FieldReplacer have a valid fieldname and pattern?
*
- * @return
+ * @return true if fieldname and pattern are valid, false otherwise
*/
public boolean isValid() {
return this.isValid;
@@ -152,8 +156,8 @@ public class FieldReplacer {
* not different then eiher the pattern didn't match or the replacement was a
* no-op.
*
- * @param value
- * @return
+ * @param value the value to replace
+ * @return the replaced value
*/
public String replace(String value) {
if (this.isValid) {
diff --git a/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java b/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
index 52d5c11..bd68dd1 100644
--- a/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
+++ b/src/plugin/index-static/src/java/org/apache/nutch/indexer/staticfield/StaticFieldIndexer.java
@@ -123,6 +123,8 @@ public class StaticFieldIndexer implements IndexingFilter {
/**
* Escapes any character that needs escaping so it can be used in a regexp.
+ * @param in input string to escape-process
+ * @return the escaped string which can be used in regex operations
*/
protected String regexEscape(String in) {
String result = in;
diff --git a/src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchIndexWriter.java b/src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchIndexWriter.java
index 0d874e8..1c025e0 100644
--- a/src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchIndexWriter.java
+++ b/src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchIndexWriter.java
@@ -347,7 +347,7 @@ public class CloudSearchIndexWriter implements IndexWriter {
/**
* Returns {@link Map} with the specific parameters the IndexWriter instance can take.
*
- * @return The values of each row. It must have the form <KEY,<DESCRIPTION,VALUE>>.
+ * @return The values of each row. It must have the form <KEY,<DESCRIPTION,VALUE>>.
*/
@Override
public Map<String, Entry<String, Object>> describe() {
diff --git a/src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchUtils.java b/src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchUtils.java
index 3ccf840..252f0e8 100644
--- a/src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchUtils.java
+++ b/src/plugin/indexer-cloudsearch/src/java/org/apache/nutch/indexwriter/cloudsearch/CloudSearchUtils.java
@@ -34,16 +34,17 @@ public class CloudSearchUtils {
}
}
- /** Returns a normalised doc ID based on the URL of a document **/
+ /**
+ * Returns a normalised doc ID based on the URL of a document
+ * @param url the document url to obtain an ID for
+ * @return A unique ID for the document. A document ID can contain any
+ * letter or number and the following characters: _ - = # ; : / ? @
+ * &. Document IDs must be at least 1 and no more than 128
+ * characters long.
+ * @see <a href="https://docs.aws.amazon.com/cloudsearch/latest/developerguide/preparing-data.html#creating-document-batches">
+ * creating-document-batches</a>
+ */
public static String getID(String url) {
-
- // the document needs an ID
- // @see
- // http://docs.aws.amazon.com/cloudsearch/latest/developerguide/preparing-data.html#creating-document-batches
- // A unique ID for the document. A document ID can contain any
- // letter or number and the following characters: _ - = # ; : / ? @
- // &. Document IDs must be at least 1 and no more than 128
- // characters long.
byte[] dig = digester.digest(url.getBytes(StandardCharsets.UTF_8));
String ID = Hex.encodeHexString(dig);
// is that even possible?
diff --git a/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java b/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
index 99c0702..6989feb 100644
--- a/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
+++ b/src/plugin/indexer-csv/src/java/org/apache/nutch/indexwriter/csv/CSVIndexWriter.java
@@ -325,7 +325,7 @@ public class CSVIndexWriter implements IndexWriter {
/**
* Returns {@link Map} with the specific parameters the IndexWriter instance can take.
*
- * @return The values of each row. It must have the form <KEY,<DESCRIPTION,VALUE>>.
+ * @return The values of each row. It must have the form <KEY,<DESCRIPTION,VALUE>>.
*/
@Override
public Map<String, Map.Entry<String, Object>> describe() {
diff --git a/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java b/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java
index 1dfc653..b24aa63 100644
--- a/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java
+++ b/src/plugin/indexer-dummy/src/java/org/apache/nutch/indexwriter/dummy/DummyIndexWriter.java
@@ -128,7 +128,7 @@ public class DummyIndexWriter implements IndexWriter {
/**
* Returns {@link Map} with the specific parameters the IndexWriter instance can take.
*
- * @return The values of each row. It must have the form <KEY,<DESCRIPTION,VALUE>>.
+ * @return The values of each row. It must have the form <KEY,<DESCRIPTION,VALUE>>.
*/
@Override
public Map<String, Map.Entry<String, Object>> describe() {
diff --git a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
index d46dd6a..e81e968 100644
--- a/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
+++ b/src/plugin/indexer-elastic/src/java/org/apache/nutch/indexwriter/elastic/ElasticIndexWriter.java
@@ -17,11 +17,6 @@
package org.apache.nutch.indexwriter.elastic;
import java.lang.invoke.MethodHandles;
-import java.security.KeyManagementException;
-import java.security.KeyStoreException;
-import java.security.NoSuchAlgorithmException;
-import java.security.cert.CertificateException;
-import java.security.cert.X509Certificate;
import java.time.format.DateTimeFormatter;
import java.io.IOException;
import java.util.AbstractMap;
@@ -30,21 +25,14 @@ import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
-import javax.net.ssl.SSLContext;
-
import org.apache.commons.lang.StringUtils;
-import org.apache.commons.lang3.exception.ExceptionUtils;
import org.apache.hadoop.conf.Configuration;
-import org.apache.http.Header;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.nio.client.HttpAsyncClientBuilder;
-import org.apache.http.message.BasicHeader;
-import org.apache.http.ssl.SSLContextBuilder;
-import org.apache.http.ssl.TrustStrategy;
import org.apache.nutch.indexer.IndexWriter;
import org.apache.nutch.indexer.IndexWriterParams;
import org.apache.nutch.indexer.NutchDocument;
@@ -163,6 +151,10 @@ public class ElasticIndexWriter implements IndexWriter {
/**
* Generates a RestHighLevelClient with the hosts given
+ * @param parameters implementation specific {@link org.apache.nutch.indexer.IndexWriterParams}
+ * @return an initialized {@link org.elasticsearch.client.RestHighLevelClient}
+ * @throws IOException if there is an error reading the
+ * {@link org.apache.nutch.indexer.IndexWriterParams}
*/
protected RestHighLevelClient makeClient(IndexWriterParams parameters)
throws IOException {
@@ -207,6 +199,7 @@ public class ElasticIndexWriter implements IndexWriter {
/**
* Generates a default BulkProcessor.Listener
+ * @return {@link BulkProcessor.Listener}
*/
protected BulkProcessor.Listener bulkProcessorListener() {
return new BulkProcessor.Listener() {
@@ -297,7 +290,7 @@ public class ElasticIndexWriter implements IndexWriter {
* can take.
*
* @return The values of each row. It must have the form
- * <KEY,<DESCRIPTION,VALUE>>.
+ * <KEY,<DESCRIPTION,VALUE>>.
*/
@Override
public Map<String, Map.Entry<String, Object>> describe() {
diff --git a/src/plugin/indexer-rabbit/src/java/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriter.java b/src/plugin/indexer-rabbit/src/java/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriter.java
index 616ee8d..0ce5cbc 100644
--- a/src/plugin/indexer-rabbit/src/java/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriter.java
+++ b/src/plugin/indexer-rabbit/src/java/org/apache/nutch/indexwriter/rabbit/RabbitIndexWriter.java
@@ -214,7 +214,7 @@ public class RabbitIndexWriter implements IndexWriter {
/**
* Returns {@link Map} with the specific parameters the IndexWriter instance can take.
*
- * @return The values of each row. It must have the form <KEY,<DESCRIPTION,VALUE>>.
+ * @return The values of each row. It must have the form <KEY,<DESCRIPTION,VALUE>>.
*/
@Override
public Map<String, Map.Entry<String, Object>> describe() {
diff --git a/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java b/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
index 3b03e7d..04c08b3 100644
--- a/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
+++ b/src/plugin/indexer-solr/src/java/org/apache/nutch/indexwriter/solr/SolrIndexWriter.java
@@ -297,8 +297,7 @@ public class SolrIndexWriter implements IndexWriter {
* Returns {@link Map} with the specific parameters the IndexWriter instance
* can take.
*
- * @return The values of each row. It must have the form
- * <KEY,<DESCRIPTION,VALUE>>.
+ * @return The values of each row. It must have the form <code><KEY,<DESCRIPTION,VALUE>></code>.
*/
@Override
public Map<String, Entry<String, Object>> describe() {
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 8e96a26..58dfbfe 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -47,7 +47,6 @@ import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.util.GZIPUtils;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.util.DeflateUtils;
-import org.apache.nutch.util.URLUtil;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.conf.Configuration;
@@ -192,7 +191,11 @@ public abstract class HttpBase implements Protocol {
this(null);
}
- /** Creates a new instance of HttpBase */
+ /**
+ * Creates a new instance of HttpBase
+ * @param logger the {@link org.slf4j.Logger} to use
+ * in this HttpBase
+ */
public HttpBase(Logger logger) {
if (logger != null) {
this.logger = logger;
@@ -200,7 +203,6 @@ public abstract class HttpBase implements Protocol {
robots = new HttpRobotRulesParser();
}
- // Inherited Javadoc
public void setConf(Configuration conf) {
this.conf = conf;
this.proxyHost = conf.get("http.proxy.host");
@@ -371,7 +373,6 @@ public abstract class HttpBase implements Protocol {
logConf();
}
- // Inherited Javadoc
public Configuration getConf() {
return this.conf;
}
@@ -526,6 +527,7 @@ public abstract class HttpBase implements Protocol {
/**
* The time limit to download the entire content, in seconds. See the property
* <code>http.time.limit</code>.
+ * @return the maximum duration
*/
public int getMaxDuration() {
return maxDuration;
@@ -534,6 +536,7 @@ public abstract class HttpBase implements Protocol {
/**
* Whether to save partial fetches as truncated content, cf. the property
* <code>http.partial.truncated</code>.
+ * @return true if partially fetched truncated content is stored
*/
public boolean isStorePartialAsTruncated() {
return partialAsTruncated;
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
index f761bd0..c3371ad 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
@@ -55,7 +55,11 @@ public class HttpRobotRulesParser extends RobotRulesParser {
allowForbidden = conf.getBoolean("http.robots.403.allow", true);
}
- /** Compose unique key to store and access robot rules in cache for given URL */
+ /**
+ * Compose unique key to store and access robot rules in cache for given URL
+ * @param url to generate a unique key for
+ * @return the cached unique key
+ */
protected static String getCacheKey(URL url) {
String protocol = url.getProtocol().toLowerCase(); // normalize to lower
// case
diff --git a/src/plugin/lib-rabbitmq/src/java/org/apache/nutch/rabbitmq/RabbitMQClient.java b/src/plugin/lib-rabbitmq/src/java/org/apache/nutch/rabbitmq/RabbitMQClient.java
index 9096158..1d4f9be 100644
--- a/src/plugin/lib-rabbitmq/src/java/org/apache/nutch/rabbitmq/RabbitMQClient.java
+++ b/src/plugin/lib-rabbitmq/src/java/org/apache/nutch/rabbitmq/RabbitMQClient.java
@@ -109,7 +109,7 @@ public class RabbitMQClient {
*
* @param exchangeName The exchange's name.
* @param exchangeOptions Options used when the exchange is created.
- * <br />
+ * <p>
* It must have the form type={type},durable={durable} where:
* <ul>
* <li>{type} is fanout, direct, headers or topic</li>
@@ -117,7 +117,7 @@ public class RabbitMQClient {
* </ul>
* @param queueName The queue's name.
* @param queueOptions Options used when the queue is created.
- * <br />
+ * <p>
* It must have the form durable={type},exclusive={durable},auto-delete={durable},arguments={durable} where:
* <ul>
* <li>durable is true or false</li>
@@ -127,7 +127,7 @@ public class RabbitMQClient {
* </ul>
* @param bindingKey The routine key to use for the binding.
* @param bindingArguments This parameter is only used when the exchange's type is headers. In other cases is ignored.
- * <br />
+ * <p>
* It must have the form key1=value1,key2=value2
* @throws IOException If there is some issue creating the relationship.
*/
diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
index 2cf6dc1..af54c00 100644
--- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
+++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java
@@ -30,13 +30,12 @@ import java.util.ArrayList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLFilter;
import org.apache.nutch.util.URLUtil;
/**
- * Generic {@link org.apache.nutch.net.URLFilter URL filter} based on regular
+ * Generic {@link org.apache.nutch.net.URLFilter} based on regular
* expressions.
*
* <p>
@@ -87,6 +86,10 @@ public abstract class RegexURLFilterBase implements URLFilter {
*
* @param filename
* is the name of rules file.
+ * @throws IOException if there is a fatal I/O error interpreting the input
+ * {@link File}
+ * @throws IllegalArgumentException if there is a fatal error processing the regex
+ * rules within the {@link org.apache.nutch.net.URLFilter}
*/
public RegexURLFilterBase(File filename) throws IOException,
IllegalArgumentException {
@@ -98,8 +101,10 @@ public abstract class RegexURLFilterBase implements URLFilter {
*
* @param rules
* string with a list of rules, one rule per line
- * @throws IOException
- * @throws IllegalArgumentException
+ * @throws IOException if there is a fatal I/O error interpreting the input
+ * rules
+ * @throws IllegalArgumentException if there is a fatal error processing the regex
+ * rules within the {@link org.apache.nutch.net.URLFilter}
*/
public RegexURLFilterBase(String rules) throws IOException,
IllegalArgumentException {
@@ -111,6 +116,10 @@ public abstract class RegexURLFilterBase implements URLFilter {
*
* @param reader
* is a reader of rules.
+ * @throws IOException if there is a fatal I/O error interpreting the input
+ * {@link Reader}
+ * @throws IllegalArgumentException if there is a fatal error processing the regex
+ * rules within the {@link org.apache.nutch.net.URLFilter}
*/
protected RegexURLFilterBase(Reader reader) throws IOException,
IllegalArgumentException {
@@ -127,6 +136,7 @@ public abstract class RegexURLFilterBase implements URLFilter {
* must be excluded.
* @param regex
* is the regular expression associated to this rule.
+ * @return {@link RegexRule}
*/
protected abstract RegexRule createRule(boolean sign, String regex);
@@ -141,6 +151,7 @@ public abstract class RegexURLFilterBase implements URLFilter {
* is the regular expression associated to this rule.
* @param hostOrDomain
* the host or domain to which this regex belongs
+ * @return {@link RegexRule}
*/
protected abstract RegexRule createRule(boolean sign, String regex, String hostOrDomain);
@@ -151,16 +162,12 @@ public abstract class RegexURLFilterBase implements URLFilter {
* @param conf
* is the current configuration.
* @return the name of the resource containing the rules to use.
+ * @throws IOException if there is a fatal error obtaining the
+ * {@link Reader}
*/
protected abstract Reader getRulesReader(Configuration conf)
throws IOException;
- /*
- * -------------------------- * <implementation:URLFilter> *
- * --------------------------
- */
-
- // Inherited Javadoc
public String filter(String url) {
String host = null;
String domain = null;
@@ -198,16 +205,6 @@ public abstract class RegexURLFilterBase implements URLFilter {
return null;
}
- /*
- * --------------------------- * </implementation:URLFilter> *
- * ---------------------------
- */
-
- /*
- * ----------------------------- * <implementation:Configurable> *
- * -----------------------------
- */
-
public void setConf(Configuration conf) {
this.conf = conf;
Reader reader = null;
@@ -233,11 +230,6 @@ public abstract class RegexURLFilterBase implements URLFilter {
return this.conf;
}
- /*
- * ------------------------------ * </implementation:Configurable> *
- * ------------------------------
- */
-
/**
* Read the specified file of rules.
*
@@ -298,6 +290,10 @@ public abstract class RegexURLFilterBase implements URLFilter {
* is the RegexURLFilterBase to use for filtering the standard input.
* @param args
* some optional parameters (not used).
+ * @throws IOException if there is a fatal I/O error interpreting the input
+ * arguments
+ * @throws IllegalArgumentException if there is a fatal error processing the
+ * input arguments
*/
public static void main(RegexURLFilterBase filter, String args[])
throws IOException, IllegalArgumentException {
diff --git a/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java b/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
index aa370ea..e98f1b5 100644
--- a/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
+++ b/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java
@@ -85,7 +85,6 @@ public class MimeTypeIndexingFilter implements IndexingFilter {
private boolean acceptMode = true;
- // Inherited JavaDoc
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
@@ -132,11 +131,6 @@ public class MimeTypeIndexingFilter implements IndexingFilter {
return doc;
}
- /*
- * -----------------------------
- * <implementation:Configurable> *
- * -----------------------------
- */
@Override
public void setConf(Configuration conf) {
this.conf = conf;
@@ -204,9 +198,9 @@ public class MimeTypeIndexingFilter implements IndexingFilter {
/**
* Main method for invoking this tool
- *
- * @throws IOException
- * @throws IndexingException
+ * @param args run with no arguments to print help
+ * @throws IOException if there is a fatal I/O error processing the input args
+ * @throws IndexingException if there is a fatal error while indexing
*/
public static void main(String[] args) throws IOException, IndexingException {
Option helpOpt = new Option("h", "help", false, "show this help message");
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
index 62b7b6d..6b98edf 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java
@@ -130,6 +130,8 @@ public class DOMBuilder implements ContentHandler, LexicalHandler {
*
* @param newNode
* New node to append
+ * @throws org.xml.sax.SAXException if text is found before
+ * the document element
*/
protected void append(Node newNode) throws org.xml.sax.SAXException {
@@ -427,6 +429,8 @@ public class DOMBuilder implements ContentHandler, LexicalHandler {
* Index to start of characters in the array
* @param length
* Number of characters in the array
+ * @throws org.xml.sax.SAXException if text is found before
+ * the document element
*/
public void charactersRaw(char ch[], int start, int length)
throws org.xml.sax.SAXException {
@@ -467,6 +471,8 @@ public class DOMBuilder implements ContentHandler, LexicalHandler {
* @param name
* The name of the entity that is ending.
* @see #startEntity
+ * @throws org.xml.sax.SAXException if text is found before
+ * the document element
*/
public void endEntity(String name) throws org.xml.sax.SAXException {
}
@@ -476,6 +482,8 @@ public class DOMBuilder implements ContentHandler, LexicalHandler {
*
* @param name
* name of the entity reference
+ * @throws org.xml.sax.SAXException if text is found before
+ * the document element
*/
public void entityReference(String name) throws org.xml.sax.SAXException {
append(m_doc.createEntityReference(name));
@@ -629,6 +637,8 @@ public class DOMBuilder implements ContentHandler, LexicalHandler {
* The number of characters to read from the array.
* @see #ignorableWhitespace
* @see org.xml.sax.Locator
+ * @throws org.xml.sax.SAXException if text is found before
+ * the document element
*/
public void cdata(char ch[], int start, int length)
throws org.xml.sax.SAXException {
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
index a9aa0e4..2415e85 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java
@@ -117,6 +117,11 @@ public class DOMContentUtils {
*
* <p>
*
+ * @param sb a {@link StringBuffer} used to store content text
+ * found beneath the DOM node... if any exists
+ * @param node a DOM {@link Node} to check for content text
+ * @param abortOnNestedAnchors true to abort if nested anchors
+ * are encountered, false otherwise
* @return true if nested anchors were found
*/
public boolean getText(StringBuffer sb, Node node,
@@ -130,7 +135,9 @@ public class DOMContentUtils {
/**
* This is a convinience method, equivalent to
* {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
- *
+ * @param sb a {@link StringBuffer} used to store content text
+ * found beneath the DOM node... if any exists
+ * @param node a DOM {@link Node} to check for content text
*/
public void getText(StringBuffer sb, Node node) {
getText(sb, node, false);
@@ -235,7 +242,9 @@ public class DOMContentUtils {
* This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
* append the content text found beneath the first <code>title</code> node to
* the <code>StringBuffer</code>.
- *
+ * @param sb a {@link StringBuffer} used to store content text
+ * found beneath the DOM node... if any exists
+ * @param node a DOM {@link Node} to check for content text
* @return true if a title node was found, false otherwise
*/
public boolean getTitle(StringBuffer sb, Node node) {
@@ -263,7 +272,11 @@ public class DOMContentUtils {
return false;
}
- /** If Node contains a BASE tag then it's HREF is returned. */
+ /**
+ * If Node contains a BASE tag then its HREF is returned.
+ * @param node a DOM {@link Node} to check for a BASE tag
+ * @return HREF if one exists
+ */
public String getBase(Node node) {
NodeWalker walker = new NodeWalker(node);
@@ -370,6 +383,11 @@ public class DOMContentUtils {
* Links without inner structure (tags, text, etc) are discarded, as are links
* which contain only single nested links and empty text nodes (this is a
* common DOM-fixup artifact, at least with nekohtml).
+ *
+ * @param base the canonical {@link URL}
+ * @param outlinks the {@link ArrayList} of {@link Outlink}'s associated
+ * with the base URL
+ * @param node a {@link Node} under which to discover anchors
*/
public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
index d655a96..e96c5a5 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
@@ -40,6 +40,10 @@ public class HTMLMetaProcessor {
/**
* Sets the indicators in <code>robotsMeta</code> to appropriate values, based
* on any META tags found under the given <code>node</code>.
+ * @param metaTags a {@link HTMLMetaTags} to populate with tags discovered in the
+ * given Node
+ * @param node a DOM {@link Node} to process and extract metadata from
+ * @param currURL the canonical URL associated with the metatags and Node
*/
public static final void getMetaTags(HTMLMetaTags metaTags, Node node,
URL currURL) {
diff --git a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
index d2bb42e..e6527e2 100644
--- a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
+++ b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
@@ -278,8 +278,8 @@ public class JSParseFilter implements HtmlParseFilter, Parser {
* method takes two arguments e.g. o.a.n.parse.js.JSParseFilter file.js
* baseURL
*
- * @param args
- * @throws Exception
+ * @param args run with no args to get help
+ * @throws Exception if there is a fatal error running the class with the given input
*/
public static void main(String[] args) throws Exception {
if (args.length < 2) {
diff --git a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java b/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java
index bcb8c36..60136d7 100644
--- a/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java
+++ b/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java
@@ -146,7 +146,9 @@ public class SWFParser implements Parser {
}
/**
- * Arguments are: 0. Name of input SWF file.
+ * @param args arguments are: 0. Name of input SWF file.
+ * @throws IOException if there is a fatal error processing the input
+ * file
*/
public static void main(String[] args) throws IOException {
FileInputStream in = new FileInputStream(args[0]);
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
index 9948136..a989082 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java
@@ -134,7 +134,9 @@ public class DOMContentUtils {
/**
* This is a convinience method, equivalent to
* {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
- *
+ * @param sb a {@link StringBuffer} used to store content text
+ * found beneath the DOM node... if any exists
+ * @param node a DOM {@link Node} to check for content text
*/
public void getText(StringBuffer sb, Node node) {
getText(sb, node, false);
@@ -239,7 +241,9 @@ public class DOMContentUtils {
* This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
* append the content text found beneath the first <code>title</code> node to
* the <code>StringBuffer</code>.
- *
+ * @param sb a {@link StringBuffer} used to store content text
+ * found beneath the DOM node... if any exists
+ * @param node a DOM {@link Node} to check for content text
* @return true if a title node was found, false otherwise
*/
public boolean getTitle(StringBuffer sb, Node node) {
@@ -267,7 +271,11 @@ public class DOMContentUtils {
return false;
}
- /** If Node contains a BASE tag then it's HREF is returned. */
+ /**
+ * If Node contains a BASE tag then its HREF is returned.
+ * @param node a DOM {@link Node} to check for a BASE tag
+ * @return HREF if one exists
+ */
public String getBase(Node node) {
NodeWalker walker = new NodeWalker(node);
@@ -374,6 +382,11 @@ public class DOMContentUtils {
* Links without inner structure (tags, text, etc) are discarded, as are links
* which contain only single nested links and empty text nodes (this is a
* common DOM-fixup artifact, at least with nekohtml).
+ *
+ * @param base the canonical {@link URL}
+ * @param outlinks the {@link ArrayList} of {@link Outlink}'s associated
+ * with the base URL
+ * @param node a {@link Node} under which to discover anchors
*/
public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
index 8584df7..c9550d8 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
@@ -42,6 +42,10 @@ public class HTMLMetaProcessor {
/**
* Sets the indicators in <code>robotsMeta</code> to appropriate values, based
* on any META tags found under the given <code>node</code>.
+ * @param metaTags a {@link HTMLMetaTags} to populate with tags discovered in the
+ * given Node
+ * @param node a DOM {@link Node} to process and extract metadata from
+ * @param currURL the canonical URL associated with the metatags and Node
*/
public static final void getMetaTags(HTMLMetaTags metaTags, Node node,
URL currURL) {
diff --git a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
index 019c2e3..1e4e4f3 100644
--- a/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
+++ b/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java
@@ -50,7 +50,10 @@ public class ZipTextExtractor {
private Configuration conf;
- /** Creates a new instance of ZipTextExtractor */
+ /**
+ * Creates a new instance of ZipTextExtractor
+ * @param conf a populated {@link Configuration}
+ */
public ZipTextExtractor(Configuration conf) {
this.conf = conf;
}
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
index 8b613e1..4120cbb 100644
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
+++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
@@ -87,6 +87,7 @@ public class File implements Protocol {
/**
* Set the length after at which content is truncated.
+ * @param maxContentLength max content in bytes
*/
public void setMaxContentLength(int maxContentLength) {
this.maxContentLength = maxContentLength;
@@ -160,6 +161,9 @@ public class File implements Protocol {
/**
* Quick way for running this class. Useful for debugging.
+ * @param args run with no args to print help
+ * @throws Exception if there is a fatal error running this class
+ * with the given input
*/
public static void main(String[] args) throws Exception {
int maxContentLength = Integer.MIN_VALUE;
diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
index 0579d96..803557f 100644
--- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
+++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
@@ -22,7 +22,6 @@ import java.io.UnsupportedEncodingException;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
@@ -31,7 +30,7 @@ import org.apache.tika.Tika;
import org.apache.hadoop.conf.Configuration;
-/************************************
+/**
* FileResponse.java mimics file replies as http response. It tries its best to
* follow http's way for headers, response codes as well as exceptions.
*
@@ -53,7 +52,7 @@ import org.apache.hadoop.conf.Configuration;
* (4) No funcy POSIX file attributes yet. May never need?
*
* @author John Xing
- ***********************************/
+ */
public class FileResponse {
private String orig;
@@ -68,12 +67,19 @@ public class FileResponse {
private Tika tika;
- /** Returns the response code. */
+ /**
+ * Get the response code.
+ * @return the int response code
+ */
public int getCode() {
return code;
}
- /** Returns the value of a named header. */
+ /**
+ * Returns the value of a named header.
+ * @param name header key to retrieve a value for
+ * @return the header value
+ */
public String getHeader(String name) {
return headers.get(name);
}
@@ -90,12 +96,12 @@ public class FileResponse {
/**
* Default public constructor
*
- * @param url
- * @param datum
- * @param file
- * @param conf
- * @throws FileException
- * @throws IOException
+ * @param url the canonical URL associated with the response
+ * @param datum crawl information for the URL
+ * @param file the actual File containing content for the url
+ * @param conf a populated {@link Configuration}
+ * @throws FileException if the input file does not use file protocol
+ * @throws IOException if there is a fatal I/O error obtaining the input file
*/
public FileResponse(URL url, CrawlDatum datum, File file, Configuration conf)
throws FileException, IOException {
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
index 1c48ab3..e23ad99 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java
@@ -37,7 +37,7 @@ import org.apache.commons.net.ftp.FTPReply;
import org.apache.commons.net.ftp.FTPConnectionClosedException;
-/***********************************************
+/**
* Client.java encapsulates functionalities necessary for nutch to get dir list
* and retrieve file from an FTP server. This class takes care of all low level
* details of interacting with an FTP server and provides a convenient higher
@@ -64,7 +64,7 @@ import org.apache.commons.net.ftp.FTPConnectionClosedException;
* IOException.
*
* @author John Xing
- ***********************************************/
+ */
public class Client extends FTP {
private int __dataTimeout;
@@ -136,13 +136,14 @@ public class Client extends FTP {
}
/**
- * open a passive data connection socket
+ * Open a passive data connection socket
*
- * @param command
- * @param arg
- * @return
- * @throws IOException
- * @throws FtpExceptionCanNotHaveDataConnection
+ * @param command the FTP command to be sent to the FTP server
+ * @param arg the argument associated with the command
+ * @return a passive {@link Socket} connections
+ * @throws IOException if there is an error entering passive mode
+ * @throws FtpExceptionCanNotHaveDataConnection can occur if there is a
+ * malformed server reply
*/
protected Socket __openPassiveDataConnection(int command, String arg)
throws IOException, FtpExceptionCanNotHaveDataConnection {
@@ -213,6 +214,7 @@ public class Client extends FTP {
/***
* Sets the timeout in milliseconds to use for data connection. set
* immediately after opening the data connection.
+ * @param timeout maximum timeout in milliseconds
***/
public void setDataTimeout(int timeout) {
__dataTimeout = timeout;
@@ -310,16 +312,23 @@ public class Client extends FTP {
}
/**
- * retrieve list reply for path
+ * Retrieve list reply for path
*
- * @param path
- * @param entries
- * @param limit
- * @param parser
- * @throws IOException
- * @throws FtpExceptionCanNotHaveDataConnection
- * @throws FtpExceptionUnknownForcedDataClose
- * @throws FtpExceptionControlClosedByForcedDataClose
+ * @param path a path on the FTP server
+ * @param entries an initialized {@link List} of
+ * {@link FTPFile}'s to populate with entries found at the path
+ * @param limit optionally impose a download limit if this value
+ * is >= 0, otherwise no limit
+ * @param parser a configured {@link FTPFileEntryParser}
+ * @throws IOException if there is a fatal I/O error, could be related to
+ * opening a passive data connection or retrieving data from the specified path
+ * @throws FtpExceptionCanNotHaveDataConnection if an error occurs whilst
+ * opening a passive data connection
+ * @throws FtpExceptionUnknownForcedDataClose if there is a bad reply from the
+ * FTP server
+ * @throws FtpExceptionControlClosedByForcedDataClose some ftp servers will
+ * close control channel if data channel socket is closed by our end before
+ * all data has been read out
*/
public void retrieveList(String path, List<FTPFile> entries, int limit,
FTPFileEntryParser parser) throws IOException,
@@ -387,13 +396,19 @@ public class Client extends FTP {
/**
* retrieve file for path
*
- * @param path
- * @param os
- * @param limit
- * @throws IOException
- * @throws FtpExceptionCanNotHaveDataConnection
- * @throws FtpExceptionUnknownForcedDataClose
- * @throws FtpExceptionControlClosedByForcedDataClose
+ * @param path a path on the FTP server
+ * @param os an {@link OutputStream} to write data to
+ * @param limit optionally impose a download limit if this value
+ * is >= 0, otherwise no limit
+ * @throws IOException if there is a fatal I/O error, could be related to
+ * opening a passive data connection or retrieving data from the specified path
+ * @throws FtpExceptionCanNotHaveDataConnection if an error occurs whilst
+ * opening a passive data connection
+ * @throws FtpExceptionUnknownForcedDataClose if there is a bad reply from the
+ * FTP server
+ * @throws FtpExceptionControlClosedByForcedDataClose some ftp servers will
+ * close control channel if data channel socket is closed by our end before
+ * all data has been read out
*/
public void retrieveFile(String path, OutputStream os, int limit)
throws IOException, FtpExceptionCanNotHaveDataConnection,
@@ -537,11 +552,7 @@ public class Client extends FTP {
*
* @return The system type name obtained from the server. null if the
* information could not be obtained.
- * @exception FTPConnectionClosedException
- * If the FTP server prematurely closes the connection as a
- * result of the client being idle or some other reason causing
- * the server to send FTP reply code 421. This exception may be
- * caught either as an IOException or independently as itself.
+ * @exception FtpExceptionBadSystResponse indicating bad reply of SYST command
* @exception IOException
* If an I/O error occurs while either sending a command to the
* server or receiving a reply from the server.
@@ -580,13 +591,4 @@ public class Client extends FTP {
return FTPReply.isPositiveCompletion(noop());
}
- // client.stat(path);
- // client.sendCommand("STAT");
- // client.sendCommand("STAT",path);
- // client.sendCommand("MDTM",path);
- // client.sendCommand("SIZE",path);
- // client.sendCommand("HELP","SITE");
- // client.sendCommand("SYST");
- // client.setRestartOffset(120);
-
}
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
index 3da83bd..470e151 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
@@ -89,22 +89,42 @@ public class Ftp implements Protocol {
robots = new FtpRobotRulesParser();
}
- /** Set the timeout. */
+ /**
+ * Set the timeout.
+ * @param to a maximum timeout in milliseconds
+ */
public void setTimeout(int to) {
timeout = to;
}
- /** Set the point at which content is truncated. */
+ /**
+ * Set the length after at which content is truncated.
+ * @param length max content length in bytes
+ */
public void setMaxContentLength(int length) {
maxContentLength = length;
}
- /** Set followTalk */
+ /**
+ * Set followTalk i.e. to log dialogue between our client and remote
+ * server. Useful for debugging.
+ * @param followTalk if true will follow, false by default
+ */
public void setFollowTalk(boolean followTalk) {
this.followTalk = followTalk;
}
- /** Set keepConnection */
+ /**
+ * Whether to keep ftp connection. Useful if crawling same host
+ * again and again. When set to true, it avoids connection, login and dir list
+ * parser setup for subsequent URLs. If it is set to true, however, you must
+ * make sure (roughly):
+ * (1) ftp.timeout is less than ftp.server.timeout
+ * (2) ftp.timeout is larger than (fetcher.threads.fetch * fetcher.server.delay)
+ * Otherwise there will be too many "delete client because idled too long"
+ * messages in thread logs.
+ * @param keepConnection if true we will keep the connection, false by default
+ */
public void setKeepConnection(boolean keepConnection) {
this.keepConnection = keepConnection;
}
@@ -177,7 +197,11 @@ public class Ftp implements Protocol {
}
}
- /** For debugging. */
+ /**
+ * For debugging.
+ * @param args run with no args for help
+ * @throws Exception if there is an error running this program
+ */
public static void main(String[] args) throws Exception {
int timeout = Integer.MIN_VALUE;
int maxContentLength = Integer.MIN_VALUE;
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
index aee44b5..20289e7 100644
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
+++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
@@ -55,12 +55,19 @@ public class FtpResponse {
private final Ftp ftp;
private Configuration conf;
- /** Returns the response code. */
+ /**
+ * Get the response code.
+ * @return the int response code
+ */
public int getCode() {
return code;
}
- /** Returns the value of a named header. */
+ /**
+ * Returns the value of a named header.
+ * @param name header key to retrieve a value for
+ * @return the header value
+ */
public String getHeader(String name) {
return headers.get(name);
}
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/DummyX509TrustManager.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/DummyX509TrustManager.java
index 6092e78..e3521ea 100644
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/DummyX509TrustManager.java
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/DummyX509TrustManager.java
@@ -34,6 +34,12 @@ public class DummyX509TrustManager implements X509TrustManager {
/**
* Constructor for DummyX509TrustManager.
+ * @param keystore an initialized {@link java.security.KeyStore}
+ * @throws NoSuchAlgorithmException if there is a fatal error obtaining a
+ * {@link javax.net.ssl.TrustManagerFactory} for the default algorithm.
+ * @see TrustManagerFactory#getDefaultAlgorithm()
+ * @throws KeyStoreException if there is a fatal error initializing the
+ * {@link javax.net.ssl.TrustManagerFactory} with the provided keystore.
*/
public DummyX509TrustManager(KeyStore keystore)
throws NoSuchAlgorithmException, KeyStoreException {
@@ -51,6 +57,8 @@ public class DummyX509TrustManager implements X509TrustManager {
/**
* @see javax.net.ssl.X509TrustManager#checkClientTrusted(X509Certificate[],
* String)
+ * @param certificates a {@link java.security.cert.X509Certificate} array
+ * @return true if trusted, false otherwise
*/
public boolean isClientTrusted(X509Certificate[] certificates) {
return true;
@@ -59,6 +67,8 @@ public class DummyX509TrustManager implements X509TrustManager {
/**
* @see javax.net.ssl.X509TrustManager#checkServerTrusted(X509Certificate[],
* String)
+ * @param certificates a {@link java.security.cert.X509Certificate} array
+ * @return true if trusted, false otherwise
*/
public boolean isServerTrusted(X509Certificate[] certificates) {
return true;
@@ -66,6 +76,7 @@ public class DummyX509TrustManager implements X509TrustManager {
/**
* @see javax.net.ssl.X509TrustManager#getAcceptedIssuers()
+ * @return a {@link java.security.cert.X509Certificate} array
*/
public X509Certificate[] getAcceptedIssuers() {
return this.standardTrustManager.getAcceptedIssuers();
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
index b82880d..b093e5c 100644
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
@@ -46,7 +46,7 @@ public class Http extends HttpBase {
/**
* Set the {@link org.apache.hadoop.conf.Configuration} object.
*
- * @param conf
+ * @param conf a populated {@link Configuration}
*/
public void setConf(Configuration conf) {
super.setConf(conf);
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
index ced2e0f..58e809a 100644
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -73,11 +73,12 @@ public class HttpResponse implements Response {
/**
* Default public constructor.
*
- * @param http
- * @param url
- * @param datum
- * @throws ProtocolException
- * @throws IOException
+ * @param http an initialized {@link HttpBase} record for the given url
+ * @param url the canonical url
+ * @param datum the {@link org.apache.nutch.crawl.CrawlDatum} for the url
+ * @throws ProtocolException if the protocol scheme is not http/https
+ * @throws IOException if a fatal error occurs in operating the
+ * {@link SSLSocket} or {@link Socket}
*/
public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
throws ProtocolException, IOException {
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/DummyX509TrustManager.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/DummyX509TrustManager.java
index c92f7d2..5896007 100644
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/DummyX509TrustManager.java
+++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/DummyX509TrustManager.java
@@ -41,6 +41,12 @@ public class DummyX509TrustManager implements X509TrustManager {
/**
* Constructor for DummyX509TrustManager.
+ * @param keystore an initialized {@link java.security.KeyStore}
+ * @throws NoSuchAlgorithmException if there is a fatal error obtaining a
+ * {@link javax.net.ssl.TrustManagerFactory} for the default algorithm.
+ * @see TrustManagerFactory#getDefaultAlgorithm()
+ * @throws KeyStoreException if there is a fatal error initializing the
+ * {@link javax.net.ssl.TrustManagerFactory} with the provided keystore.
*/
public DummyX509TrustManager(KeyStore keystore)
throws NoSuchAlgorithmException, KeyStoreException {
@@ -58,6 +64,8 @@ public class DummyX509TrustManager implements X509TrustManager {
/**
* @see javax.net.ssl.X509TrustManager#checkClientTrusted(X509Certificate[],
* String)
+ * @param certificates a {@link java.security.cert.X509Certificate} array
+ * @return true if trusted, false otherwise
*/
public boolean isClientTrusted(X509Certificate[] certificates) {
return true;
@@ -66,6 +74,8 @@ public class DummyX509TrustManager implements X509TrustManager {
/**
* @see javax.net.ssl.X509TrustManager#checkServerTrusted(X509Certificate[],
* String)
+ * @param certificates a {@link java.security.cert.X509Certificate} array
+ * @return true if trusted, false otherwise
*/
public boolean isServerTrusted(X509Certificate[] certificates) {
return true;
@@ -73,6 +83,7 @@ public class DummyX509TrustManager implements X509TrustManager {
/**
* @see javax.net.ssl.X509TrustManager#getAcceptedIssuers()
+ * @return a {@link java.security.cert.X509Certificate} array
*/
public X509Certificate[] getAcceptedIssuers() {
return this.standardTrustManager.getAcceptedIssuers();
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
index 6c7a7be..b85c47a 100644
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
+++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
@@ -46,7 +46,7 @@ public class Http extends HttpBase {
/**
* Set the {@link org.apache.hadoop.conf.Configuration} object.
*
- * @param conf
+ * @param conf a populated {@link Configuration}
*/
public void setConf(Configuration conf) {
super.setConf(conf);
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
index 2d75b1c..5228f33 100644
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
+++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
@@ -66,11 +66,12 @@ public class HttpResponse implements Response {
/**
* Default public constructor.
*
- * @param http
- * @param url
- * @param datum
- * @throws ProtocolException
- * @throws IOException
+ * @param http the {@link HttpBase} for this URL
+ * @param url the canonical URL associated with the response
+ * @param datum crawl information for the URL
+ * @throws ProtocolException if the URL does not use HTTP protocol
+ * @throws IOException if there is a fatal I/O error, typically to do
+ * with Sockets
*/
public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
throws ProtocolException, IOException {
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
index 3188092..1bb7cf1 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/DummyX509TrustManager.java
@@ -34,6 +34,12 @@ public class DummyX509TrustManager implements X509TrustManager {
/**
* Constructor for DummyX509TrustManager.
+ * @param keystore an initialized {@link java.security.KeyStore}
+ * @throws NoSuchAlgorithmException if there is a fatal error obtaining a
+ * {@link javax.net.ssl.TrustManagerFactory} for the default algorithm.
+ * @see TrustManagerFactory#getDefaultAlgorithm()
+ * @throws KeyStoreException if there is a fatal error initializing the
+ * {@link javax.net.ssl.TrustManagerFactory} with the provided keystore.
*/
public DummyX509TrustManager(KeyStore keystore)
throws NoSuchAlgorithmException, KeyStoreException {
@@ -51,6 +57,8 @@ public class DummyX509TrustManager implements X509TrustManager {
/**
* @see javax.net.ssl.X509TrustManager#checkClientTrusted(X509Certificate[],
* String)
+ * @param certificates a {@link java.security.cert.X509Certificate} array
+ * @return true if trusted, false otherwise
*/
public boolean isClientTrusted(X509Certificate[] certificates) {
return true;
@@ -59,6 +67,8 @@ public class DummyX509TrustManager implements X509TrustManager {
/**
* @see javax.net.ssl.X509TrustManager#checkServerTrusted(X509Certificate[],
* String)
+ * @param certificates a {@link java.security.cert.X509Certificate} array
+ * @return true if trusted, false otherwise
*/
public boolean isServerTrusted(X509Certificate[] certificates) {
return true;
@@ -66,6 +76,7 @@ public class DummyX509TrustManager implements X509TrustManager {
/**
* @see javax.net.ssl.X509TrustManager#getAcceptedIssuers()
+ * @return a {@link java.security.cert.X509Certificate} array
*/
public X509Certificate[] getAcceptedIssuers() {
return this.standardTrustManager.getAcceptedIssuers();
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
index cd188fb..2247f5e 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java
@@ -153,6 +153,8 @@ public class Http extends HttpBase {
*
* @param args
* Command line arguments
+ * @throws Exception if a fatal error is encountered whilst running
+ * the program
*/
public static void main(String[] args) throws Exception {
Http http = new Http();
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
index 277313c..a0a255b 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java
@@ -64,6 +64,9 @@ public class HttpBasicAuthentication implements HttpAuthentication,
*
* @param challenge
* WWW-Authenticate header from web server
+ * @param conf a populated {@link Configuration}
+ * @throws HttpAuthenticationException if the authentication fails or if the
+ * password or username is null
*/
protected HttpBasicAuthentication(String challenge, Configuration conf)
throws HttpAuthenticationException {
@@ -155,6 +158,7 @@ public class HttpBasicAuthentication implements HttpAuthentication,
* The challenge string provided by the webserver. This is the text
* which follows the WWW-Authenticate header, including the Basic
* tag.
+ * @param conf a populated {@link Configuration}
* @return An HttpBasicAuthentication object or null if unable to generate
* appropriate credentials.
*/
diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/DummyX509TrustManager.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/DummyX509TrustManager.java
index ec1354f..9e3afdc 100644
--- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/DummyX509TrustManager.java
+++ b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/DummyX509TrustManager.java
@@ -34,6 +34,12 @@ public class DummyX509TrustManager implements X509TrustManager {
/**
* Constructor for DummyX509TrustManager.
+ * @param keystore an initialized {@link java.security.KeyStore}
+ * @throws NoSuchAlgorithmException if there is a fatal error obtaining a
+ * {@link javax.net.ssl.TrustManagerFactory} for the default algorithm.
+ * @see TrustManagerFactory#getDefaultAlgorithm()
+ * @throws KeyStoreException if there is a fatal error initializing the
+ * {@link javax.net.ssl.TrustManagerFactory} with the provided keystore.
*/
public DummyX509TrustManager(KeyStore keystore)
throws NoSuchAlgorithmException, KeyStoreException {
@@ -51,6 +57,8 @@ public class DummyX509TrustManager implements X509TrustManager {
/**
* @see javax.net.ssl.X509TrustManager#checkClientTrusted(X509Certificate[],
* String)
+ * @param certificates a {@link java.security.cert.X509Certificate} array
+ * @return true if trusted, false otherwise
*/
public boolean isClientTrusted(X509Certificate[] certificates) {
return true;
@@ -59,6 +67,8 @@ public class DummyX509TrustManager implements X509TrustManager {
/**
* @see javax.net.ssl.X509TrustManager#checkServerTrusted(X509Certificate[],
* String)
+ * @param certificates a {@link java.security.cert.X509Certificate} array
+ * @return true if trusted, false otherwise
*/
public boolean isServerTrusted(X509Certificate[] certificates) {
return true;
@@ -66,6 +76,7 @@ public class DummyX509TrustManager implements X509TrustManager {
/**
* @see javax.net.ssl.X509TrustManager#getAcceptedIssuers()
+ * @return a {@link java.security.cert.X509Certificate} array
*/
public X509Certificate[] getAcceptedIssuers() {
return this.standardTrustManager.getAcceptedIssuers();
diff --git a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java
index f670d5f..a4b3761 100644
--- a/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java
+++ b/src/plugin/protocol-interactiveselenium/src/java/org/apache/nutch/protocol/interactiveselenium/handlers/DefaultClickAllAjaxLinksHandler.java
@@ -31,7 +31,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
- * This handler clicks all the <a hfer="javascript:void(null);"> tags
+ * This handler clicks all the <code><a href="javascript:void(null);"></a></code> tags
* because it considers them as not usual links but ajax links/interactions. This uses the same logic of
* DefalultMultiInteractionHandler.
*/
diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/DummyX509TrustManager.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/DummyX509TrustManager.java
index 1eea806..03ac204 100644
--- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/DummyX509TrustManager.java
+++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/DummyX509TrustManager.java
@@ -34,6 +34,12 @@ public class DummyX509TrustManager implements X509TrustManager {
/**
* Constructor for DummyX509TrustManager.
+ * @param keystore an initialized {@link java.security.KeyStore}
+ * @throws NoSuchAlgorithmException if there is a fatal error obtaining a
+ * {@link javax.net.ssl.TrustManagerFactory} for the default algorithm.
+ * @see TrustManagerFactory#getDefaultAlgorithm()
+ * @throws KeyStoreException if there is a fatal error initializing the
+ * {@link javax.net.ssl.TrustManagerFactory} with the provided keystore.
*/
public DummyX509TrustManager(KeyStore keystore)
throws NoSuchAlgorithmException, KeyStoreException {
@@ -51,6 +57,8 @@ public class DummyX509TrustManager implements X509TrustManager {
/**
* @see javax.net.ssl.X509TrustManager#checkClientTrusted(X509Certificate[],
* String)
+ * @param certificates a {@link java.security.cert.X509Certificate} array
+ * @return true if trusted, false otherwise
*/
public boolean isClientTrusted(X509Certificate[] certificates) {
return true;
@@ -59,6 +67,8 @@ public class DummyX509TrustManager implements X509TrustManager {
/**
* @see javax.net.ssl.X509TrustManager#checkServerTrusted(X509Certificate[],
* String)
+ * @param certificates a {@link java.security.cert.X509Certificate} array
+ * @return true if trusted, false otherwise
*/
public boolean isServerTrusted(X509Certificate[] certificates) {
return true;
@@ -66,6 +76,7 @@ public class DummyX509TrustManager implements X509TrustManager {
/**
* @see javax.net.ssl.X509TrustManager#getAcceptedIssuers()
+ * @return a {@link java.security.cert.X509Certificate} array
*/
public X509Certificate[] getAcceptedIssuers() {
return this.standardTrustManager.getAcceptedIssuers();
diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
index 646dfed..dde1122 100644
--- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
+++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/Http.java
@@ -26,8 +26,6 @@ import org.apache.nutch.protocol.http.api.HttpBase;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.protocol.selenium.HttpResponse;
-
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
diff --git a/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java b/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java
index 9c7a3f3..3471a95 100644
--- a/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java
+++ b/src/plugin/scoring-orphan/src/java/org/apache/nutch/scoring/orphan/OrphanScoringFilter.java
@@ -68,9 +68,8 @@ public class OrphanScoringFilter extends AbstractScoringFilter {
* CrawlDatum
* @param datum
* new CrawlDatum
- * @param inLinks
+ * @param inlinks
* list of inlinked CrawlDatums
- * @return void
*/
public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
List<CrawlDatum> inlinks) throws ScoringFilterException {
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
index ff253fe..32278ee 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/cosine/Model.java
@@ -100,6 +100,7 @@ public class Model {
* @param content The text to tokenize
* @param mingram Value of mingram for tokenizing
* @param maxgram Value of maxgram for tokenizing
+ * @return The created {@link DocVector}
*/
public static DocVector createDocVector(String content, int mingram, int maxgram) {
LuceneTokenizer tokenizer;
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
index 2677f9e..eae5ba5 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneAnalyzerUtil.java
@@ -42,7 +42,14 @@ public class LuceneAnalyzerUtil extends Analyzer{
/**
- * Creates an analyzer instance based on Lucene default stopword set if @param useStopFilter is set to true
+ * Creates an analyzer instance based on Lucene default stopword
+ * set if the param useStopFilter is set to true
+ * @param stemFilterType a preferred {@link StemFilterType} to use. Can be one
+ * of {@link LuceneAnalyzerUtil.StemFilterType#PORTERSTEM_FILTER},
+ * {@link LuceneAnalyzerUtil.StemFilterType#ENGLISHMINIMALSTEM_FILTER}, or
+ * {@link LuceneAnalyzerUtil.StemFilterType#NONE}
+ * @param useStopFilter if true use the default Lucene stopword set,
+ * false otherwise
*/
public LuceneAnalyzerUtil(StemFilterType stemFilterType, boolean useStopFilter) {
LuceneAnalyzerUtil.stemFilterType = stemFilterType;
@@ -55,8 +62,17 @@ public class LuceneAnalyzerUtil extends Analyzer{
}
/**
- * Creates an analyzer instance based on user provided stop words. If @param addToDefault is set to true, then
+ * Creates an analyzer instance based on user provided stop words. If the
+ * param addToDefault is set to true, then
* user provided stop words will be added to the Lucene default stopset.
+ * @param stemFilterType a preferred {@link StemFilterType} to use. Can be one
+ * of {@link LuceneAnalyzerUtil.StemFilterType#PORTERSTEM_FILTER},
+ * {@link LuceneAnalyzerUtil.StemFilterType#ENGLISHMINIMALSTEM_FILTER}, or
+ * {@link LuceneAnalyzerUtil.StemFilterType#NONE}
+ * @param stopWords a {@link List} of stop word Strings
+ * @param addToDefault if true the provided stop words will be added to the
+ * default Lucene stopword set,
+ * false otherwise
*/
public LuceneAnalyzerUtil(StemFilterType stemFilterType, List<String> stopWords, boolean addToDefault) {
LuceneAnalyzerUtil.stemFilterType = stemFilterType;
diff --git a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
index 918af9b..d09af82 100644
--- a/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
+++ b/src/plugin/scoring-similarity/src/java/org/apache/nutch/scoring/similarity/util/LuceneTokenizer.java
@@ -46,9 +46,13 @@ public class LuceneTokenizer {
* @param content - The text to tokenize
* @param tokenizer - the type of tokenizer to use CLASSIC or DEFAULT
* @param useStopFilter - if set to true the token stream will be filtered using default Lucene stopset
- * @param stemFilterType - Type of stemming to perform
+ * @param stemFilterType a preferred {@link StemFilterType} to use. Can be one
+ * of {@link LuceneAnalyzerUtil.StemFilterType#PORTERSTEM_FILTER},
+ * {@link LuceneAnalyzerUtil.StemFilterType#ENGLISHMINIMALSTEM_FILTER}, or
+ * {@link LuceneAnalyzerUtil.StemFilterType#NONE}
*/
- public LuceneTokenizer(String content, TokenizerType tokenizer, boolean useStopFilter, StemFilterType stemFilterType) {
+ public LuceneTokenizer(String content, TokenizerType tokenizer, boolean useStopFilter,
+ StemFilterType stemFilterType) {
this.tokenizer = tokenizer;
this.stemFilterType = stemFilterType;
if(useStopFilter) {
@@ -64,9 +68,13 @@ public class LuceneTokenizer {
* @param stopWords - Provide a set of user defined stop words
* @param addToDefault - If set to true, the stopSet words will be added to the Lucene default stop set.
* If false, then only the user provided words will be used as the stop set
- * @param stemFilterType
+ * @param stemFilterType a preferred {@link StemFilterType} to use. Can be one
+ * of {@link LuceneAnalyzerUtil.StemFilterType#PORTERSTEM_FILTER},
+ * {@link LuceneAnalyzerUtil.StemFilterType#ENGLISHMINIMALSTEM_FILTER}, or
+ * {@link LuceneAnalyzerUtil.StemFilterType#NONE}
*/
- public LuceneTokenizer(String content, TokenizerType tokenizer, List<String> stopWords, boolean addToDefault, StemFilterType stemFilterType) {
+ public LuceneTokenizer(String content, TokenizerType tokenizer, List<String> stopWords,
+ boolean addToDefault, StemFilterType stemFilterType) {
this.tokenizer = tokenizer;
this.stemFilterType = stemFilterType;
if(addToDefault) {
@@ -83,8 +91,8 @@ public class LuceneTokenizer {
}
/**
- * Returns the tokenStream created by the Tokenizer
- * @return
+ * get the tokenStream created by {@link org.apache.lucene.analysis.Tokenizer}
+ * @return The {@link TokenStream}
*/
public TokenStream getTokenStream() {
return tokenStream;
diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java b/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
index ff475bc..79e03b6 100644
--- a/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
+++ b/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
@@ -127,9 +127,9 @@ public class CollectionManager extends Configured {
}
/**
- * Returns named subcollection
+ * Get the named subcollection
*
- * @param id
+ * @param id the id of a subcollection to retrieve
* @return Named SubCollection (or null if not existing)
*/
public Subcollection getSubColection(final String id) {
@@ -141,6 +141,8 @@ public class CollectionManager extends Configured {
*
* @param id
* Id of SubCollection to delete
+ * @throws IOException If there is an error retrieving and deleting
+ * the subcollection from the collection.
*/
public void deleteSubCollection(final String id) throws IOException {
final Subcollection subCol = getSubColection(id);
@@ -152,6 +154,8 @@ public class CollectionManager extends Configured {
/**
* Create a new subcollection.
*
+ * @param id
+ * Id of SubCollection to create
* @param name
* Name of SubCollection to create
* @return Created SubCollection or null if allready existed
@@ -172,7 +176,7 @@ public class CollectionManager extends Configured {
*
* @param url
* The url to test against Collections
- * @return Subcollections
+ * @return A {@link List} of {@link Subcollection}'s
*/
public List<Subcollection> getSubCollections(final String url) {
List<Subcollection> collections = new ArrayList<Subcollection>();
@@ -203,7 +207,10 @@ public class CollectionManager extends Configured {
/**
* Save collections into file
*
- * @throws IOException
+ * @throws IOException If there is a fatal error flushing or
+ * closing the {@link FileOutputStream} associated with the save
+ * process.
+ *
*/
public void save() throws IOException {
try {
diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java b/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
index 36f33ca..007eeae 100644
--- a/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
+++ b/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
@@ -80,9 +80,10 @@ public class Subcollection extends Configured implements URLFilter {
* public Constructor
*
* @param id
- * id of SubCollection
+ * Id of SubCollection
* @param name
- * name of SubCollection
+ * Name of SubCollection
+ * @param conf A populated {@link Configuration}
*/
public Subcollection(String id, String name, Configuration conf) {
this(id, name, null, conf);
@@ -92,9 +93,11 @@ public class Subcollection extends Configured implements URLFilter {
* public Constructor
*
* @param id
- * id of SubCollection
+ * Id of SubCollection
* @param name
- * name of SubCollection
+ * Name of SubCollection
+ * @param key SubCollection key
+ * @param conf A populated {@link Configuration}
*/
public Subcollection(String id, String name, String key, Configuration conf) {
this(conf);
@@ -199,7 +202,8 @@ public class Subcollection extends Configured implements URLFilter {
/**
* Initialize Subcollection from dom element
*
- * @param collection
+ * @param collection A DOM {@link org.w3c.dom.Element} for use
+ * in creating the {@link Subcollection}
*/
public void initialize(Element collection) {
this.id = DOMUtil.getChildText(
@@ -226,11 +230,11 @@ public class Subcollection extends Configured implements URLFilter {
}
/**
- * Create a list of patterns from chunk of text, patterns are separated with
- * newline
+ * Create a list of patterns from a chunk of text, patterns are separated
+ * with a newline
*
- * @param list
- * @param text
+ * @param list An initialized {@link List} to insert String patterns.
+ * @param text A chunk of text (hopefully) containing patterns.
*/
protected void parseList(List<String> list, String text) {
list.clear();
diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
index 767d54d..c7ba54e 100644
--- a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
+++ b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
@@ -47,7 +47,7 @@ public class SubcollectionIndexingFilter extends Configured implements
}
/**
- * @param conf
+ * @param conf A populated {@link Configuration}
*/
public void setConf(Configuration conf) {
this.conf = conf;
diff --git a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
index a9c2023..ff3826a 100644
--- a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
+++ b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
@@ -83,7 +83,7 @@ import java.net.MalformedURLException;
* The configuration shown below will accept all URLs with '.html' or '.htm'
* suffixes (case-sensitive - '.HTML' or '.HTM' will be rejected), and prohibit
* all other suffixes.
- * <p>
+ * </p>
*
* <pre>
* # this is a comment
@@ -100,7 +100,7 @@ import java.net.MalformedURLException;
* <p>
* The configuration shown below will accept all URLs except common graphical
* formats.
- * <p>
+ * </p>
*
* <pre>
* # this is a comment
diff --git a/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java b/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
index ae860b6..b596400 100644
--- a/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
+++ b/src/plugin/urlnormalizer-ajax/src/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
@@ -55,8 +55,10 @@ public class AjaxURLNormalizer implements URLNormalizer {
/**
* Attempts to normalize the input URL string
*
- * @param String urlString
+ * @param urlString a String to process
+ * @param scope used when indexing URLs
* @return String
+ * @throws MalformedURLException if the urlString is malformed
*/
public String normalize(String urlString, String scope) throws MalformedURLException {
LOG.info(scope + " // " + urlString);
@@ -80,8 +82,9 @@ public class AjaxURLNormalizer implements URLNormalizer {
* Returns a normalized input URL. #! querystrings are transformed
* to a _escaped_fragment_ form.
*
- * @param String urlString
+ * @param urlString a String to process
* @return String
+ * @throws MalformedURLException if the urlString is malformed
*/
protected String normalizeHashedFragment(String urlString) throws MalformedURLException {
URL u = new URL(urlString);
@@ -109,8 +112,9 @@ public class AjaxURLNormalizer implements URLNormalizer {
* Returns a normalized input URL. _escaped_fragment_ querystrings are
* transformed to a #! form.
*
- * @param String urlString
+ * @param urlString a String to process
* @return String
+ * @throws MalformedURLException if the urlString is malformed
*/
protected String normalizeEscapedFragment(String urlString) throws MalformedURLException {
URL u = new URL(urlString);
@@ -167,7 +171,7 @@ public class AjaxURLNormalizer implements URLNormalizer {
/**
* Unescape some exotic characters in the fragment part
*
- * @param String fragmentPart
+ * @param fragmentPart a String to process
* @return String
*/
protected String unescape(String fragmentPart) {
@@ -183,7 +187,7 @@ public class AjaxURLNormalizer implements URLNormalizer {
/**
* Escape some exotic characters in the fragment part
*
- * @param String fragmentPart
+ * @param fragmentPart a String to process
* @return String
*/
protected String escape(String fragmentPart) {
@@ -218,7 +222,7 @@ public class AjaxURLNormalizer implements URLNormalizer {
}
/**
- * @param Configuration conf
+ * @param conf a populated {@link Configuration}
*/
public void setConf(Configuration conf) {
this.conf = conf;
diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java
index 7d765f4..63cee14 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/package-info.java
@@ -25,7 +25,7 @@
* </ul>
*
* E.g.,
- * <code>https://www.example.org/a/../b//./select%2Dlang.php?lang=español#anchor<code>
+ * <code>https://www.example.org/a/../b//./select%2Dlang.php?lang=español#anchor</code>
* is normalized to <code>https://www.example.org/b/select-lang.php?lang=espa%C3%B1ol</code>
*
* Optional and configurable normalizations are:
diff --git a/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java b/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
index 11048c3..7ccc423 100644
--- a/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
+++ b/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
@@ -106,8 +106,13 @@ public class RegexURLNormalizer extends Configured implements URLNormalizer {
}
/**
- * Constructor which can be passed the file name, so it doesn't look in the
- * configuration files for it.
+ * Constructor which can be passed the configuration file name,
+ * so it doesn't look in other configuration files for it.
+ * @param conf A populated {@link Configuration}
+ * @param filename A specific configuration file
+ * @throws IOException if there is an error locating the specified input file
+ * @throws PatternSyntaxException If there is an error whilst interpreting
+ * rule patterns.
*/
public RegexURLNormalizer(Configuration conf, String filename)
throws IOException, PatternSyntaxException {
@@ -158,6 +163,9 @@ public class RegexURLNormalizer extends Configured implements URLNormalizer {
/**
* This function does the replacements by iterating through all the regex
* patterns. It accepts a string url as input and returns the altered string.
+ * @param urlString A url string to process
+ * @param scope The identifier for a specific scoped rule
+ * @return The altered string
*/
public String regexNormalize(String urlString, String scope) {
HashMap<String, List<Rule>> scopedRules = getScopedRules();
@@ -276,7 +284,13 @@ public class RegexURLNormalizer extends Configured implements URLNormalizer {
return rules;
}
- /** Spits out patterns and substitutions that are in the configuration file. */
+ /**
+ * Spits out patterns and substitutions that are in the configuration file.
+ * @param args accepts one argument which is a scope
+ * @throws IOException Can be thrown by {@link RegexURLNormalizer#normalize(String, String)}
+ * @throws PatternSyntaxException If there is an error with the provided scope
+ * rule pattern.
+ */
public static void main(String args[]) throws PatternSyntaxException,
IOException {
RegexURLNormalizer normalizer = new RegexURLNormalizer();