Posted to commits@nutch.apache.org by sn...@apache.org on 2021/05/14 10:08:55 UTC
svn commit: r1889891 [2/3] - in
/nutch/cms_site/trunk/content/apidocs/apidocs-1.18/resources: ./
configuration.xsl nutch-default.xml
Added: nutch/cms_site/trunk/content/apidocs/apidocs-1.18/resources/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/cms_site/trunk/content/apidocs/apidocs-1.18/resources/nutch-default.xml?rev=1889891&view=auto
==============================================================================
--- nutch/cms_site/trunk/content/apidocs/apidocs-1.18/resources/nutch-default.xml (added)
+++ nutch/cms_site/trunk/content/apidocs/apidocs-1.18/resources/nutch-default.xml Fri May 14 10:08:55 2021
@@ -0,0 +1,2764 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<!-- Do not modify this file directly. Instead, copy entries that you -->
+<!-- wish to modify from this file into nutch-site.xml and change them -->
+<!-- there. If nutch-site.xml does not already exist, create it. -->
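+
+<!-- Example (sketch, property value is a placeholder): to override a setting,
+     create conf/nutch-site.xml containing only the entries you change, e.g.
+     <configuration>
+       <property>
+         <name>http.agent.name</name>
+         <value>MyCrawler</value>
+       </property>
+     </configuration>
+     All other properties keep the defaults defined in this file. -->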
+
+<configuration>
+
+<!-- general properties -->
+
+<property>
+ <name>store.ip.address</name>
+ <value>false</value>
+ <description>Enables capturing the specific IP address
+ (InetSocketAddress) of the host to which we connect via
+ the given protocol. Currently supported by protocol-ftp and
+ protocol-http.
+ </description>
+</property>
+
+<!-- file properties -->
+
+<property>
+ <name>file.content.limit</name>
+ <value>1048576</value>
+ <description>The length limit for downloaded content using the file://
+ protocol, in bytes. If this value is non-negative (>=0), content longer
+ than it will be truncated; otherwise, no truncation at all. Do not
+ confuse this setting with the http.content.limit setting.
+ </description>
+</property>
+
+<property>
+ <name>file.crawl.parent</name>
+ <value>true</value>
+ <description>By default the crawler is not restricted to the directories specified in the
+ URLs file but also follows links into the parent directories. For your own crawls you can
+ change this behavior (set to false) so that only directories beneath the ones you specify get
+ crawled.</description>
+</property>
+
+<property>
+ <name>file.crawl.redirect_noncanonical</name>
+ <value>true</value>
+ <description>
+ If true, protocol-file treats non-canonical file names as
+ redirects and does not canonicalize file names internally. A file
+ name containing symbolic links as path elements is then not
+ resolved and "fetched" but recorded as a redirect whose target is
+ the canonical name (with all symbolic links on the path
+ resolved).
+ </description>
+</property>
+
+<property>
+ <name>file.content.ignored</name>
+ <value>true</value>
+ <description>If true, no file content will be saved during fetch.
+ This is probably what we want most of the time, since file:// URLs
+ are meant to be local and can always be read directly during the parsing
+ and indexing stages. Otherwise file contents will be saved.
+ !! NOT IMPLEMENTED YET !!
+ </description>
+</property>
+
+<!-- HTTP properties -->
+
+<property>
+ <name>http.agent.name</name>
+ <value></value>
+ <description>HTTP 'User-Agent' request header. MUST NOT be empty -
+ please set this to a single word uniquely related to your organization.
+
+ NOTE: You should also check other related properties:
+
+ http.robots.agents
+ http.agent.description
+ http.agent.url
+ http.agent.email
+ http.agent.version
+
+ and set their values appropriately.
+
+ </description>
+</property>
+
+<property>
+ <name>http.robots.agents</name>
+ <value></value>
+ <description>Any other agents, apart from 'http.agent.name', that the robots
+ parser should look for in robots.txt. Multiple agents can be provided using
+ a comma as delimiter, e.g. mybot,foo-spider,bar-crawler
+
+ The ordering of agents does NOT matter; the robots parser makes its
+ decision based on the first agent that matches the robots rules.
+ Also, there is NO need to add a wildcard (i.e. "*") to this string as the
+ robots parser handles the no-match situation automatically.
+
+ If no value is specified, the HTTP agent (i.e. 'http.agent.name')
+ is used for user agent matching by the robots parser.
+ </description>
+</property>
+
+<property>
+ <name>http.robot.rules.whitelist</name>
+ <value></value>
+ <description>Comma-separated list of hostnames or IP addresses for which
+ robots.txt rules are ignored. Use with care and only if you are explicitly
+ allowed by the site owner to ignore the site's robots.txt!
+ </description>
+</property>
+
+<property>
+ <name>http.robots.403.allow</name>
+ <value>true</value>
+ <description>Some servers return HTTP status 403 (Forbidden) if
+ /robots.txt doesn't exist. This should probably mean that we are
+ allowed to crawl the site nonetheless. If this is set to false,
+ then such sites will be treated as forbidden.</description>
+</property>
+
+<property>
+ <name>http.agent.description</name>
+ <value></value>
+ <description>Further description of our bot; this text is used in
+ the User-Agent header. It appears in parentheses after the agent name.
+ </description>
+</property>
+
+<property>
+ <name>http.agent.url</name>
+ <value></value>
+ <description>A URL to advertise in the User-Agent header. This will
+ appear in parentheses after the agent name. Convention dictates that this
+ should be the URL of a page explaining the purpose and behavior of this
+ crawler.
+ </description>
+</property>
+
+<property>
+ <name>http.agent.email</name>
+ <value></value>
+ <description>An email address to advertise in the HTTP 'From' request
+ header and User-Agent header. A good practice is to mangle this
+ address (e.g. 'info at example dot com') to avoid address harvesting by spammers.
+ </description>
+</property>
+
+<property>
+ <name>http.agent.version</name>
+ <value>Nutch-1.18</value>
+ <description>A version string to advertise in the User-Agent
+ header.</description>
+</property>
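+
+<!-- Illustration (not normative): with the http.agent.* properties set, the
+     User-Agent header is assembled roughly as "name/version (description; url; email)".
+     For example, assuming the placeholder values name=MyCrawler, version=Nutch-1.18,
+     url=https://example.org/bot and email=info at example dot com, the header sent
+     would look similar to:
+       MyCrawler/Nutch-1.18 (research crawler; https://example.org/bot; info at example dot com)
+     The exact formatting is determined by the http protocol plugin in use. -->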
+
+<property>
+ <name>http.agent.rotate</name>
+ <value>false</value>
+ <description>
+ If true, instead of http.agent.name, alternating agent names are
+ chosen from a list provided via http.agent.rotate.file.
+ </description>
+</property>
+
+<property>
+ <name>http.agent.rotate.file</name>
+ <value>agents.txt</value>
+ <description>
+ File containing alternative user agent names to be used instead of
+ http.agent.name on a rotating basis if http.agent.rotate is true.
+ Each line of the file should contain exactly one agent
+ specification including name, version, description, URL, etc.
+ </description>
+</property>
+
+<property>
+ <name>http.agent.host.cookie.file</name>
+ <value>cookies.txt</value>
+ <description>
+ File containing per-host configured cookies.
+ </description>
+</property>
+
+<property>
+ <name>http.agent.host</name>
+ <value></value>
+ <description>Name or IP address of the host on which the Nutch crawler
+ is running. Currently this is used by the 'protocol-httpclient'
+ plugin.
+ </description>
+</property>
+
+<property>
+ <name>http.timeout</name>
+ <value>10000</value>
+ <description>The default network timeout, in milliseconds.</description>
+</property>
+
+<property>
+ <name>http.content.limit</name>
+ <value>1048576</value>
+ <description>The length limit for downloaded content using the http/https
+ protocols, in bytes. If this value is non-negative (>=0), content longer
+ than it will be truncated; otherwise, no truncation at all. Do not
+ confuse this setting with the file.content.limit setting.
+ </description>
+</property>
+
+<property>
+ <name>http.time.limit</name>
+ <value>-1</value>
+ <description>The time limit in seconds to fetch a single document.
+ If this value is non-negative (>=0), the HTTP protocol implementation
+ will stop reading from a socket after http.time.limit seconds have
+ been spent for fetching this document. The HTTP response is then
+ marked as truncated. The http.time.limit should be set to a longer
+ time period than http.timeout, as it applies to the entire duration
+ to fetch a document, not only the network timeout of a single I/O
+ operation. Note: supported only by protocol-okhttp.
+ </description>
+</property>
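+
+<!-- Illustration: http.timeout bounds a single network operation, while
+     http.time.limit bounds the whole fetch of one document. For example, with
+     http.timeout=10000 (10 s per read) and a hypothetical http.time.limit=60,
+     a slowly trickling response is cut off after 60 seconds in total and marked
+     as truncated, even though no single read exceeded the 10 second timeout. -->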
+
+<property>
+ <name>http.partial.truncated</name>
+ <value>false</value>
+ <description>
+ If true the HTTP protocol implementation may store the content of
+ partial fetches and mark the response as truncated instead of
+ throwing an exception which will cause the fetch to fail. This
+ allows to use the data which has already been fetched, instead of
+ retrying the fetch later. Note: supported only by protocol-okhttp.
+ </description>
+</property>
+
+<property>
+ <name>http.tls.certificates.check</name>
+ <value>false</value>
+ <description>
+ Whether to check TLS/SSL server certificates for validity.
+ If true, invalid (e.g., self-signed or expired) certificates are
+ rejected and the https connection fails. If false, insecure
+ TLS/SSL connections are allowed. Note that this property is
+ currently not supported by all http/https protocol plugins.
+ </description>
+</property>
+
+<property>
+ <name>http.proxy.host</name>
+ <value></value>
+ <description>The proxy hostname. If empty, no proxy is used.</description>
+</property>
+
+<property>
+ <name>http.proxy.port</name>
+ <value></value>
+ <description>The proxy port.</description>
+</property>
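+
+<!-- Example (placeholder values): to route requests through a proxy, copy the
+     two properties below into nutch-site.xml and adjust host and port, e.g.
+       <property><name>http.proxy.host</name><value>proxy.example.org</value></property>
+       <property><name>http.proxy.port</name><value>3128</value></property>
+     Leaving http.proxy.host empty disables proxying. -->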
+
+<property>
+ <name>http.proxy.username</name>
+ <value></value>
+ <description>Username for proxy. This will be used by
+ 'protocol-httpclient', if the proxy server requests basic, digest
+ and/or NTLM authentication. To use this, 'protocol-httpclient' must
+ be present in the value of 'plugin.includes' property.
+ NOTE: For NTLM authentication, do not prefix the username with the
+ domain, i.e. 'susam' is correct whereas 'DOMAIN\susam' is incorrect.
+ </description>
+</property>
+
+<property>
+ <name>http.proxy.password</name>
+ <value></value>
+ <description>Password for proxy. This will be used by
+ 'protocol-httpclient', if the proxy server requests basic, digest
+ and/or NTLM authentication. To use this, 'protocol-httpclient' must
+ be present in the value of 'plugin.includes' property.
+ </description>
+</property>
+
+<property>
+ <name>http.proxy.realm</name>
+ <value></value>
+ <description>Authentication realm for proxy. Do not define a value
+ if realm is not required or authentication should take place for any
+ realm. NTLM does not use the notion of realms. Specify the domain name
+ of NTLM authentication as the value for this property. To use this,
+ 'protocol-httpclient' must be present in the value of
+ 'plugin.includes' property.
+ </description>
+</property>
+
+<property>
+ <name>http.auth.file</name>
+ <value>httpclient-auth.xml</value>
+ <description>Authentication configuration file for
+ 'protocol-httpclient' plugin.
+ </description>
+</property>
+
+<property>
+ <name>http.proxy.type</name>
+ <value>HTTP</value>
+ <description>
+ Proxy type: HTTP or SOCKS (cf. java.net.Proxy.Type).
+ Note: supported by protocol-okhttp.
+ </description>
+</property>
+
+<property>
+ <name>http.proxy.exception.list</name>
+ <value></value>
+ <description>A comma separated list of hosts that don't use the proxy
+ (e.g. intranets). Example: www.apache.org</description>
+</property>
+
+<property>
+ <name>http.useHttp11</name>
+ <value>true</value>
+ <description>
+ If true, use HTTP/1.1; if false, use HTTP/1.0.
+ </description>
+</property>
+
+<property>
+ <name>http.useHttp2</name>
+ <value>false</value>
+ <description>
+ If true, try HTTP/2 and fall back to HTTP/1.1 if HTTP/2 is not
+ supported; if false, always use HTTP/1.1.
+
+ NOTE: HTTP/2 is currently only supported by protocol-okhttp and
+ requires at runtime Java 9 or a modified Java 8 with support for
+ ALPN (Application Layer Protocol Negotiation).
+ </description>
+</property>
+
+<property>
+ <name>http.accept.language</name>
+ <value>en-us,en-gb,en;q=0.7,*;q=0.3</value>
+ <description>Value of the "Accept-Language" request header field.
+ This allows selecting non-English language as default one to retrieve.
+ It is a useful setting for search engines build for certain national group.
+ To send requests without "Accept-Language" header field, thi property must
+ be configured to contain a space character because an empty property does
+ not overwrite the default.
+ </description>
+</property>
+
+<property>
+ <name>http.accept</name>
+ <value>text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8</value>
+ <description>Value of the "Accept" request header field. A space character
+ as value will cause that no "Accept" header field is sent in the request.
+ </description>
+</property>
+
+<property>
+ <name>http.accept.charset</name>
+ <value>utf-8,iso-8859-1;q=0.7,*;q=0.7</value>
+ <description>Value of the "Accept-Charset" request header field. A space character
+ as value will cause that no "Accept-Charset" header field is sent in the request.
+ </description>
+</property>
+
+<property>
+ <name>http.store.responsetime</name>
+ <value>true</value>
+ <description>Enables recording of the response time of a host, i.e. the
+ time elapsed between opening and closing the connection to a page's
+ host. The response time in milliseconds is stored in the CrawlDb in the
+ CrawlDatum's metadata under the key "_rs_".
+ </description>
+</property>
+
+<property>
+ <name>http.enable.if.modified.since.header</name>
+ <value>true</value>
+ <description>Whether Nutch sends an HTTP If-Modified-Since header. It reduces
+ bandwidth when enabled by not downloading pages that respond with an HTTP
+ Not-Modified header. URLs that are not downloaded are not passed through
+ parse or indexing filters. If you regularly modify filters, you should force
+ Nutch to also download unmodified pages by disabling this feature.
+ </description>
+</property>
+
+<property>
+ <name>http.enable.cookie.header</name>
+ <value>true</value>
+ <description>Whether Nutch sends an HTTP Cookie header. The cookie value
+ is read from the CrawlDatum Cookie metadata field.
+ </description>
+</property>
+
+<!-- FTP properties -->
+
+<property>
+ <name>ftp.username</name>
+ <value>anonymous</value>
+ <description>ftp login username.</description>
+</property>
+
+<property>
+ <name>ftp.password</name>
+ <value>anonymous@example.com</value>
+ <description>ftp login password.</description>
+</property>
+
+<property>
+ <name>ftp.content.limit</name>
+ <value>1048576</value>
+ <description>The length limit for downloaded content, in bytes.
+ If this value is non-negative (>=0), content longer than it will be truncated;
+ otherwise, no truncation at all.
+ Caution: the classic FTP RFCs never define partial transfers and, in fact,
+ some FTP servers out there do not handle client-side forced close-downs very
+ well. Our implementation tries its best to handle such situations smoothly.
+ </description>
+</property>
+
+<property>
+ <name>ftp.timeout</name>
+ <value>60000</value>
+ <description>Default timeout for the ftp client socket, in milliseconds.
+ Please also see ftp.keep.connection below.</description>
+</property>
+
+<property>
+ <name>ftp.server.timeout</name>
+ <value>100000</value>
+ <description>An estimate of the ftp server idle time, in milliseconds.
+ Typically it is 120000 milliseconds for many ftp servers out there.
+ Better be conservative here. Together with ftp.timeout, it is used to
+ decide whether we need to delete (annihilate) the current ftp.client instance
+ and start another ftp.client instance anew. This is necessary because
+ a fetcher thread may not be able to obtain the next request from the queue in
+ time (due to idleness) before our ftp client times out or the remote server
+ disconnects. Used only when ftp.keep.connection is true (please see below).
+ </description>
+</property>
+
+<property>
+ <name>ftp.keep.connection</name>
+ <value>false</value>
+ <description>Whether to keep the ftp connection open. Useful when crawling the
+ same host again and again. When set to true, it avoids connection, login and
+ dir list parser setup for subsequent URLs. If it is set to true, however, you
+ must make sure (roughly):
+ (1) ftp.timeout is less than ftp.server.timeout
+ (2) ftp.timeout is larger than (fetcher.threads.fetch * fetcher.server.delay)
+ Otherwise there will be too many "delete client because idled too long"
+ messages in thread logs.</description>
+</property>
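+
+<!-- Worked example with the defaults in this file: fetcher.threads.fetch (10)
+     times fetcher.server.delay (5.0 s) gives 50 s, which is below ftp.timeout
+     (60000 ms = 60 s), which in turn is below ftp.server.timeout (100000 ms =
+     100 s). So the default values already satisfy both conditions above if
+     ftp.keep.connection is enabled. -->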
+
+<property>
+ <name>ftp.follow.talk</name>
+ <value>false</value>
+ <description>Whether to log dialogue between our client and remote
+ server. Useful for debugging.</description>
+</property>
+
+<!-- web db properties -->
+<property>
+ <name>db.fetch.interval.default</name>
+ <value>2592000</value>
+ <description>The default number of seconds between re-fetches of a page (30 days).
+ </description>
+</property>
+
+<property>
+ <name>db.fetch.interval.max</name>
+ <value>7776000</value>
+ <description>The maximum number of seconds between re-fetches of a page
+ (90 days). After this period every page in the db will be re-tried, no
+ matter what its status is.
+ </description>
+</property>
+
+<property>
+ <name>db.fetch.schedule.class</name>
+ <value>org.apache.nutch.crawl.DefaultFetchSchedule</value>
+ <description>The implementation of fetch schedule. DefaultFetchSchedule simply
+ adds the original fetchInterval to the last fetch time, regardless of
+ page changes, whereas AdaptiveFetchSchedule (see below) tries to adapt
+ to the rate at which a given page is changed.
+ </description>
+</property>
+
+<property>
+ <name>db.fetch.schedule.adaptive.inc_rate</name>
+ <value>0.4</value>
+ <description>If a page is unmodified, its fetchInterval will be
+ increased by this rate. This value should not
+ exceed 0.5, otherwise the algorithm becomes unstable.</description>
+</property>
+
+<property>
+ <name>db.fetch.schedule.adaptive.dec_rate</name>
+ <value>0.2</value>
+ <description>If a page is modified, its fetchInterval will be
+ decreased by this rate. This value should not
+ exceed 0.5, otherwise the algorithm becomes unstable.</description>
+</property>
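+
+<!-- Worked example (AdaptiveFetchSchedule, assuming the defaults above): a page
+     with a 30 day interval that is found unmodified gets its interval increased
+     by inc_rate 0.4 to roughly 30 * 1.4 = 42 days; if it is found modified, the
+     interval is decreased by dec_rate 0.2 to roughly 30 * 0.8 = 24 days, always
+     clamped between the adaptive min_interval and max_interval defined below. -->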
+
+<property>
+ <name>db.fetch.schedule.adaptive.min_interval</name>
+ <value>60.0</value>
+ <description>Minimum fetchInterval, in seconds.</description>
+</property>
+
+<property>
+ <name>db.fetch.schedule.adaptive.max_interval</name>
+ <value>31536000.0</value>
+ <description>Maximum fetchInterval, in seconds (365 days).
+ NOTE: this is limited by db.fetch.interval.max. Pages with
+ fetchInterval larger than db.fetch.interval.max
+ will be fetched anyway.</description>
+</property>
+
+<property>
+ <name>db.fetch.schedule.adaptive.sync_delta</name>
+ <value>true</value>
+ <description>If true, try to synchronize with the time of page change
+ by shifting the next fetchTime by a fraction (sync_delta_rate) of the difference
+ between the last modification time and the last fetch time.</description>
+</property>
+
+<property>
+ <name>db.fetch.schedule.adaptive.sync_delta_rate</name>
+ <value>0.3</value>
+ <description>See sync_delta for description. This value should not
+ exceed 0.5, otherwise the algorithm becomes unstable.</description>
+</property>
+
+<property>
+ <name>db.fetch.schedule.mime.file</name>
+ <value>adaptive-mimetypes.txt</value>
+ <description>The configuration file for the MimeAdaptiveFetchSchedule.
+ </description>
+</property>
+
+<property>
+ <name>db.update.additions.allowed</name>
+ <value>true</value>
+ <description>If true, updatedb will add newly discovered URLs, if false
+ only already existing URLs in the CrawlDb will be updated and no new
+ URLs will be added.
+ </description>
+</property>
+
+<property>
+ <name>db.preserve.backup</name>
+ <value>true</value>
+ <description>If true, updatedb will keep a backup of the previous CrawlDB
+ version in the old directory. In case of disaster, one can rename old to
+ current and restore the CrawlDB to its previous state.
+ </description>
+</property>
+
+<property>
+ <name>db.update.purge.404</name>
+ <value>false</value>
+ <description>If true, updatedb will purge records with status DB_GONE (404)
+ from the CrawlDB.
+ </description>
+</property>
+
+<property>
+ <name>db.update.purge.orphans</name>
+ <value>false</value>
+ <description>If true, updatedb will permanently delete URLs marked
+ as orphan from the CrawlDb. The plugin scoring-orphan needs to be
+ activated to get records marked as orphan. See the plugin's options
+ elsewhere in this document.
+ </description>
+</property>
+
+<property>
+ <name>crawldb.url.normalizers</name>
+ <value>false</value>
+ <description>
+ !Temporary, can be overridden from the command line!
+ Normalize URLs when updating the crawldb.
+ </description>
+</property>
+
+<property>
+ <name>crawldb.url.filters</name>
+ <value>false</value>
+ <description>
+ !Temporary, can be overridden from the command line!
+ Filter URLs when updating the crawldb.
+ </description>
+</property>
+
+<property>
+ <name>db.update.max.inlinks</name>
+ <value>10000</value>
+ <description>Maximum number of inlinks to take into account when updating
+ a URL score in the crawlDB. Only the best scoring inlinks are kept.
+ </description>
+</property>
+
+<property>
+ <name>db.ignore.internal.links</name>
+ <value>false</value>
+ <description>If true, outlinks leading from a page to internal hosts or domains
+ will be ignored. This is an effective way to limit the crawl to include
+ only initially injected hosts or domains, without creating complex URLFilters.
+ See 'db.ignore.external.links.mode'.
+ </description>
+</property>
+
+<property>
+ <name>db.ignore.external.links</name>
+ <value>false</value>
+ <description>If true, outlinks leading from a page to external hosts or domains
+ will be ignored. This is an effective way to limit the crawl to include
+ only initially injected hosts or domains, without creating complex URLFilters.
+ See 'db.ignore.external.links.mode'.
+ </description>
+</property>
+
+<property>
+ <name>db.ignore.also.redirects</name>
+ <value>true</value>
+ <description>If true, the fetcher checks redirects the same way as
+ links when ignoring internal or external links. Set to false to
+ follow redirects despite the values for db.ignore.external.links and
+ db.ignore.internal.links.
+ </description>
+</property>
+
+<property>
+ <name>db.ignore.external.links.mode</name>
+ <value>byHost</value>
+ <description>Alternative value is byDomain</description>
+</property>
+
+<property>
+ <name>db.ignore.external.exemptions.file</name>
+ <value>db-ignore-external-exemptions.txt</value>
+ <description>
+ This file contains the exemption rules used by the 'urlfilter-ignoreexempt' plugin.
+ </description>
+</property>
+
+<property>
+ <name>db.injector.overwrite</name>
+ <value>false</value>
+ <description>Whether existing records in the CrawlDB will be overwritten
+ by injected records.
+ </description>
+</property>
+
+<property>
+ <name>db.injector.update</name>
+ <value>false</value>
+ <description>If true, existing records in the CrawlDB will be updated with
+ injected records. Old meta data is preserved. The db.injector.overwrite
+ parameter has precedence.
+ </description>
+</property>
+
+<property>
+ <name>db.score.injected</name>
+ <value>1.0</value>
+ <description>The score of new pages added by the injector.
+ </description>
+</property>
+
+<property>
+ <name>db.score.link.external</name>
+ <value>1.0</value>
+ <description>The score factor for new pages added due to a link from
+ another host relative to the referencing page's score. Scoring plugins
+ may use this value to affect initial scores of external links.
+ </description>
+</property>
+
+<property>
+ <name>db.score.link.internal</name>
+ <value>1.0</value>
+ <description>The score factor for pages added due to a link from the
+ same host, relative to the referencing page's score. Scoring plugins
+ may use this value to affect initial scores of internal links.
+ </description>
+</property>
+
+<property>
+ <name>db.score.count.filtered</name>
+ <value>false</value>
+ <description>The score value passed to newly discovered pages is
+ calculated as a fraction of the original page score divided by the
+ number of outlinks. If this option is false, only the outlinks that passed
+ the URLFilters will count; if it is true, all outlinks will count.
+ </description>
+</property>
+
+<property>
+ <name>db.max.outlinks.per.page</name>
+ <value>100</value>
+ <description>The maximum number of outlinks that we'll process for a page.
+ If this value is nonnegative (>=0), at most db.max.outlinks.per.page outlinks
+ will be processed for a page; otherwise, all outlinks will be processed.
+ </description>
+</property>
+
+<property>
+ <name>db.max.outlink.length</name>
+ <value>4096</value>
+ <description>
+ The maximum length in characters accepted for outlinks before
+ applying URL normalizers and filters. If this value is
+ nonnegative (>=0), only URLs with a length in characters less or
+ equal than db.max.outlink.length are accepted and then passed to
+ URL normalizers and filters. Doing the length check beforehand
+ prevents normalizers or filters from hanging on overlong URLs.
+ Note: this property is only used to check URLs found as outlinks
+ and redirects, but not for injected URLs.
+ </description>
+</property>
+
+<property>
+ <name>db.parsemeta.to.crawldb</name>
+ <value></value>
+ <description>Comma-separated list of parse metadata keys to transfer to the crawldb (NUTCH-779).
+ Assuming for instance that the languageidentifier plugin is enabled, setting the value to 'lang'
+ will copy both the key 'lang' and its value to the corresponding entry in the crawldb.
+ </description>
+</property>
+
+<property>
+ <name>db.fetch.retry.max</name>
+ <value>3</value>
+ <description>The maximum number of times a URL that has encountered
+ recoverable errors is generated for fetch.</description>
+</property>
+
+<property>
+ <name>db.signature.class</name>
+ <value>org.apache.nutch.crawl.MD5Signature</value>
+ <description>The default implementation of a page signature. Signatures
+ created with this implementation will be used for duplicate detection
+ and removal.</description>
+</property>
+
+<property>
+ <name>db.signature.text_profile.min_token_len</name>
+ <value>2</value>
+ <description>Minimum token length to be included in the signature.
+ </description>
+</property>
+
+<property>
+ <name>db.signature.text_profile.quant_rate</name>
+ <value>0.01</value>
+ <description>Profile frequencies will be rounded down to a multiple of
+ QUANT = (int)(QUANT_RATE * maxFreq), where maxFreq is a maximum token
+ frequency. If maxFreq > 1 then QUANT will be at least 2, which means that
+ for longer texts tokens with frequency 1 will always be discarded.
+ </description>
+</property>
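+
+<!-- Worked example (TextProfileSignature, defaults above): for a document whose
+     most frequent token occurs maxFreq = 350 times, QUANT = (int)(0.01 * 350) = 3,
+     so token frequencies are rounded down to multiples of 3 and tokens occurring
+     only once or twice drop out of the signature. For short texts where
+     (int)(0.01 * maxFreq) would be 0 or 1, QUANT is still at least 2 as described
+     above. -->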
+
+<property>
+ <name>db.stats.score.quantiles</name>
+ <value>.01,.05,.1,.2,.25,.3,.4,.5,.6,.7,.75,.8,.9,.95,.99</value>
+ <description>
+ Quantiles of the distribution of CrawlDatum scores shown in the
+ CrawlDb statistics (command `readdb -stats'). Comma-separated
+ list of floating point numbers.
+ </description>
+</property>
+
+<!-- linkdb properties -->
+
+<property>
+ <name>linkdb.max.inlinks</name>
+ <value>10000</value>
+ <description>Maximum number of inlinks per URL to be kept in LinkDb.
+ If "invertlinks" finds more inlinks than this number, only the first
+ N inlinks will be stored, and the rest will be discarded.
+ </description>
+</property>
+
+<property>
+ <name>linkdb.ignore.internal.links</name>
+ <value>true</value>
+ <description>If true, when adding new links to a page, links from
+ the same host are ignored. This is an effective way to limit the
+ size of the link database, keeping only the highest quality
+ links.
+ </description>
+</property>
+
+<property>
+ <name>linkdb.ignore.external.links</name>
+ <value>false</value>
+ <description>If true, when adding new links to a page, links from
+ a different host are ignored.
+ </description>
+</property>
+
+<property>
+ <name>linkdb.max.anchor.length</name>
+ <value>100</value>
+ <description>
+ The maximum number of characters permitted for anchor texts stored
+ in LinkDb.
+ </description>
+</property>
+
+<!-- generate properties -->
+
+<property>
+ <name>generate.max.count</name>
+ <value>-1</value>
+ <description>The maximum number of URLs in a single
+ fetchlist. -1 if unlimited. The URLs are counted according
+ to the value of the parameter generate.count.mode.
+ </description>
+</property>
+
+<property>
+ <name>generate.count.mode</name>
+ <value>host</value>
+ <description>Determines how the URLs are counted for generate.max.count.
+ Default value is 'host' but can be 'domain'. Note that we do not count
+ per IP in the new version of the Generator.
+ </description>
+</property>
+
+<property>
+ <name>generate.update.crawldb</name>
+ <value>false</value>
+ <description>For highly-concurrent environments, where several
+ generate/fetch/update cycles may overlap, setting this to true ensures
+ that generate will create different fetchlists even without intervening
+ updatedb-s, at the cost of running an additional job to update CrawlDB.
+ If false, running generate twice without intervening updatedb will
+ generate identical fetchlists. See also crawl.gen.delay which defines
+ how long items already generated are blocked.</description>
+</property>
+
+<property>
+ <name>generate.min.score</name>
+ <value>0</value>
+ <description>Select only entries with a score larger than
+ generate.min.score.</description>
+</property>
+
+<property>
+ <name>generate.min.interval</name>
+ <value>-1</value>
+ <description>Select only entries with a retry interval lower than
+ generate.min.interval. A value of -1 disables this check.</description>
+</property>
+
+<property>
+ <name>generate.hostdb</name>
+ <value></value>
+ <description>Path to HostDB, required for the generate.max.count.expr
+ and generate.fetch.delay.expr properties.
+ See https://issues.apache.org/jira/browse/NUTCH-2368</description>
+</property>
+
+<property>
+ <name>generate.fetch.delay.expr</name>
+ <value></value>
+ <description>Controls variable fetcher.server.delay via a Jexl expression and
+ HostDB information. It allows you to alter fetch delay based on HostDB data.
+ See https://issues.apache.org/jira/browse/NUTCH-2368</description>
+</property>
+
+<property>
+ <name>generate.max.count.expr</name>
+ <value></value>
+ <description>Controls variable generate.max.count via a Jexl expression and
+ HostDB information. It allows you to alter maxCount based on HostDB data.
+ See https://issues.apache.org/jira/browse/NUTCH-2368</description>
+</property>
+
+<property>
+ <name>generate.restrict.status</name>
+ <value></value>
+ <description>Select only entries of this status, see
+ https://issues.apache.org/jira/browse/NUTCH-1248</description>
+</property>
+
+<!-- urlpartitioner properties -->
+
+<property>
+ <name>partition.url.mode</name>
+ <value>byHost</value>
+ <description>Determines how to partition URLs. Default value is 'byHost',
+ also takes 'byDomain' or 'byIP'.
+ </description>
+</property>
+
+<property>
+ <name>crawl.gen.delay</name>
+ <value>604800000</value>
+ <description>
+ This value, expressed in milliseconds, defines how long we should keep the lock on records
+ in CrawlDb that were just selected for fetching. If these records are not updated
+ in the meantime, the lock is canceled, i.e. they become eligible for selecting again.
+ Default value of this is 7 days (604800000 ms). If generate.update.crawldb is false
+ the property crawl.gen.delay has no effect.
+ </description>
+</property>
+
+<!-- fetcher properties -->
+
+<property>
+ <name>fetcher.server.delay</name>
+ <value>5.0</value>
+ <description>The number of seconds the fetcher will delay between
+ successive requests to the same server. Note that this might get
+ overridden by a Crawl-Delay from a robots.txt and is used ONLY if
+ fetcher.threads.per.queue is set to 1.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.server.min.delay</name>
+ <value>0.0</value>
+ <description>The minimum number of seconds the fetcher will delay between
+ successive requests to the same server. This value is applicable ONLY
+ if fetcher.threads.per.queue is greater than 1 (i.e. the host blocking
+ is turned off).</description>
+</property>
+
+<property>
+ <name>fetcher.max.crawl.delay</name>
+ <value>30</value>
+ <description>
+ If the Crawl-Delay in robots.txt is set to greater than this value (in
+ seconds) then the fetcher will skip this page, generating an error report.
+ If set to -1 the fetcher will never skip such pages and will wait the
+ amount of time retrieved from robots.txt Crawl-Delay, however long that
+ might be.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.min.crawl.delay</name>
+ <value>${fetcher.server.delay}</value>
+ <description>
+ Minimum Crawl-Delay (in seconds) accepted in robots.txt, even if the
+ robots.txt specifies a shorter delay. By default the minimum Crawl-Delay
+ is set to the value of `fetcher.server.delay` which guarantees that
+ a value set in the robots.txt cannot make the crawler more aggressive
+ than the default configuration.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.threads.fetch</name>
+ <value>10</value>
+ <description>The number of FetcherThreads the fetcher should use.
+ This also determines the maximum number of requests that are
+ made at once (each FetcherThread handles one connection). The total
+ number of threads running in distributed mode will be the number of
+ fetcher threads * number of nodes as fetcher has one map task per node.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.threads.per.queue</name>
+ <value>1</value>
+ <description>This number is the maximum number of threads that
+ should be allowed to access a queue at one time. Setting it to
+ a value > 1 will cause the Crawl-Delay value from robots.txt to
+ be ignored and the value of fetcher.server.min.delay to be used
+ as a delay between successive requests to the same server instead
+ of fetcher.server.delay.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.queue.mode</name>
+ <value>byHost</value>
+ <description>Determines how to put URLs into queues. Default value
+ is 'byHost', also takes 'byDomain' or 'byIP'. Crawl delays are
+ implemented on the level of fetcher queues.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.verbose</name>
+ <value>false</value>
+ <description>If true, fetcher will log more verbosely.</description>
+</property>
+
+<property>
+ <name>http.log.exceptions.suppress.stack</name>
+ <value>java.net.UnknownHostException,java.net.NoRouteToHostException</value>
+ <description>Comma-separated list of exceptions not shown with full
+ stack trace in logs of fetcher and HTTP protocol implementations.
+ The logs may shrink in size significantly, e.g., when during a large
+ unrestricted web crawl unknown hosts are logged briefly without a full
+ stack trace. The full class name of the exception class (extending
+ Throwable) including the package path must be specified.</description>
+</property>
+
+<property>
+ <name>fetcher.parse</name>
+ <value>false</value>
+ <description>If true, fetcher will parse content. Default is false, which means
+ that a separate parsing step is required after fetching is finished.</description>
+</property>
+
+<property>
+ <name>fetcher.store.content</name>
+ <value>true</value>
+ <description>If true, fetcher will store content.</description>
+</property>
+
+<property>
+ <name>fetcher.signature</name>
+ <value>false</value>
+ <description>If true, fetcher will generate the signature for
+ successfully fetched documents even if the content is not parsed by
+ fetcher (see property fetcher.parse). Default is false, which means
+ that the signature is calculated when parsing either by the fetcher
+ or during the parsing step. Note that a non-parsing fetcher can
+ only generate signatures based on the binary content and not on the
+ textual content. An appropriate signature class should be chosen
+ (see property db.signature.class).
+ </description>
+</property>
+
+<property>
+ <name>fetcher.timelimit.mins</name>
+ <value>-1</value>
+ <description>This is the number of minutes allocated to the fetching.
+ Once this value is reached, any remaining entry from the input URL list is skipped
+ and all active queues are emptied. The default value of -1 deactivates the time limit.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.max.exceptions.per.queue</name>
+ <value>-1</value>
+ <description>The maximum number of protocol-level exceptions (e.g. timeouts) per
+ host (or IP) queue. Once this value is reached, any remaining entries from this
+ queue are purged, effectively stopping the fetching from this host/IP. The default
+ value of -1 deactivates this limit.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.throughput.threshold.pages</name>
+ <value>-1</value>
+ <description>The threshold of minimum pages per second. If the fetcher downloads fewer
+ pages per second than the configured threshold, the fetcher stops, preventing slow queues
+ from stalling the throughput. This threshold must be an integer. This can be useful when
+ fetcher.timelimit.mins is hard to determine. The default value of -1 disables this check.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.throughput.threshold.retries</name>
+ <value>5</value>
+ <description>The number of times the fetcher.throughput.threshold.pages is allowed to be exceeded.
+ This setting prevents accidental slowdowns from immediately killing the fetcher thread.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.throughput.threshold.check.after</name>
+ <value>5</value>
+ <description>The number of minutes after which the throughput check is enabled.</description>
+</property>
+
+<property>
+ <name>fetcher.threads.timeout.divisor</name>
+ <value>2</value>
+ <description>(EXPERT) The thread time-out divisor to use. By default threads have a time-out
+ value of mapreduce.task.timeout / 2. Increase this setting if the fetcher waits too
+ long before killing hung threads. Be careful, a setting that is too high (8 or more) will most
+ likely kill the fetcher threads prematurely.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.queue.depth.multiplier</name>
+ <value>50</value>
+ <description>(EXPERT) The fetcher buffers the incoming URLs into queues based on the [host|domain|IP]
+ (see param fetcher.queue.mode). The depth of the queue is the number of threads times the value of this parameter.
+ A large value requires more memory but can improve the performance of the fetch when the order of the URLs in the fetch list
+ is not optimal.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.follow.outlinks.depth</name>
+ <value>-1</value>
+ <description>(EXPERT) When fetcher.parse is true and this value is greater than 0 the fetcher will extract outlinks
+ and follow them until the desired depth is reached. A value of 1 means all generated pages are fetched and their first-degree
+ outlinks are fetched and parsed too. Be careful, this feature is in itself agnostic of the state of the CrawlDB and does not
+ know about already fetched pages. A setting larger than 2 will most likely fetch home pages twice in the same fetch cycle.
+ It is highly recommended to set db.ignore.external.links to true to restrict the outlink follower to URLs within the same
+ domain. If db.ignore.external.links is false, the feature is likely to follow duplicates even when depth=1.
+ A value of -1 or 0 disables this feature.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.follow.outlinks.num.links</name>
+ <value>4</value>
+ <description>(EXPERT) The number of outlinks to follow when fetcher.follow.outlinks.depth is enabled. Be careful, this can multiply
+ the total number of pages to fetch. This works together with fetcher.follow.outlinks.depth.divisor; with the default settings the number
+ of followed outlinks at depth 1 is 8, not 4.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.follow.outlinks.depth.divisor</name>
+ <value>2</value>
+ <description>(EXPERT) The divisor of fetcher.follow.outlinks.num.links per fetcher.follow.outlinks.depth. This decreases the number
+ of outlinks to follow by increasing depth. The formula used is: outlinks = floor(divisor / depth * num.links). This prevents
+ exponential growth of the fetch list.
+ </description>
+</property>
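+
+<!-- Worked example with the defaults above (num.links = 4, divisor = 2):
+     outlinks = floor(divisor / depth * num.links), so depth 1 follows
+     floor(2/1 * 4) = 8 outlinks, depth 2 follows floor(2/2 * 4) = 4, and
+     depth 3 follows floor(2/3 * 4) = 2, consistent with the note above that
+     8 outlinks (not 4) are followed at depth 1. -->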
+
+<property>
+ <name>fetcher.follow.outlinks.ignore.external</name>
+ <value>true</value>
+ <description>Whether to ignore or follow external links. Set db.ignore.external.links to false and this to true to store outlinks
+ in the output but not follow them. If db.ignore.external.links is true this directive is ignored.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.bandwidth.target</name>
+ <value>-1</value>
+ <description>Target bandwidth in kilobits per sec for each mapper instance. This is used to adjust the number of
+ fetching threads automatically (up to fetcher.maxNum.threads). A value of -1 deactivates the functionality, in which case
+ the number of fetching threads is fixed (see fetcher.threads.fetch).</description>
+</property>
+
+<property>
+ <name>fetcher.maxNum.threads</name>
+ <value>25</value>
+ <description>Max number of fetch threads allowed when using fetcher.bandwidth.target. Defaults to fetcher.threads.fetch if unspecified or
+ set to a value lower than it. </description>
+</property>
+
+<property>
+ <name>fetcher.bandwidth.target.check.everyNSecs</name>
+ <value>30</value>
+ <description>(EXPERT) Value in seconds which determines how frequently we should reassess the optimal number of fetch threads when using
+ fetcher.bandwidth.target. Defaults to 30 and must be at least 1.</description>
+</property>
+
+<property>
+ <name>fetcher.store.robotstxt</name>
+ <value>false</value>
+ <description>If true (and fetcher.store.content is also true),
+ fetcher will store the robots.txt response content and status for
+ debugging or archival purposes. The robots.txt is added to the
+ content/ folder of the fetched segment.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.publisher</name>
+ <value>false</value>
+ <description>Set this value to true if you want to use an implementation of the Publisher/Subscriber model. Make sure to set the
+ corresponding publisher-implementation-specific properties.</description>
+</property>
+
+<property>
+ <name>fetcher.filter.urls</name>
+ <value>false</value>
+ <description>Whether fetcher will filter URLs (with the configured URL filters).</description>
+</property>
+
+<property>
+ <name>fetcher.normalize.urls</name>
+ <value>false</value>
+ <description>Whether fetcher will normalize URLs (with the configured URL normalizers).</description>
+</property>
+
+<property>
+ <name>http.redirect.max</name>
+ <value>0</value>
+ <description>The maximum number of redirects the fetcher will follow when
+ trying to fetch a page. If set to negative or 0, fetcher won't immediately
+ follow redirected URLs, instead it will record them for later fetching.
+ </description>
+</property>
+
+<property>
+ <name>http.redirect.max.exceeded.skip</name>
+ <value>false</value>
+ <description>
+ Whether to skip the last URL in a redirect chain when redirects
+ are followed (http.redirect.max > 0) and the maximum number of redirects
+ in a chain is exceeded (redirect_count > http.redirect.max).
+ If not skipped the redirect target URLs are stored as `linked`
+ and fetched in one of the following cycles. See also NUTCH-2748.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.redirect.dedupcache.seconds</name>
+ <value>-1</value>
+ <description>
+ The maximum time in seconds the fetcher will cache redirects for
+ deduplication. If the same redirect URL is seen again within
+ this time it is skipped. This helps to avoid pathological cases
+ where many or most of the URLs of a host are redirected to the
+ same URL, e.g. a login page, a cookie consent page, or an error
+ page. A value less than or equal to zero disables redirect deduplication.
+ Caveat: This may break setting cookies via recursive redirect chains.
+ </description>
+</property>
+
+<property>
+ <name>fetcher.redirect.dedupcache.size</name>
+ <value>1000</value>
+ <description>
+ The maximum size of the cache to deduplicate redirects,
+ see `fetcher.redirect.dedupcache.seconds`.
+ </description>
+</property>
+
+
+<!-- SegmentReader -->
+<property>
+ <name>segment.reader.content.recode</name>
+ <value>false</value>
+ <description>
+ Used by SegmentReader when dumping segments: if true, try to recode the content
+ of HTML documents from the original encoding to UTF-8. Note, this
+ property can be overridden by SegmentReader command-line options.
+ </description>
+</property>
+
+
+
+<!-- any23 plugin properties -->
+
+<property>
+ <name>any23.extractors</name>
+ <value>html-microdata</value>
+ <description>Comma-separated list of Any23 extractors (a list of extractors is available here: http://any23.apache.org/getting-started.html)</description>
+</property>
+
+<property>
+ <name>any23.content_types</name>
+ <value>text/html,application/xhtml+xml</value>
+ <description>Comma-separated list of content-types onto which Any23 extractors should be applied (see http://www.iana.org/assignments/media-types/). If empty, all content-types are supported.</description>
+</property>
+
+<!-- moreindexingfilter plugin properties -->
+
+<property>
+ <name>moreIndexingFilter.indexMimeTypeParts</name>
+ <value>true</value>
+ <description>Determines whether the index-more plugin will split the mime-type
+ into sub-parts; this requires the type field to be multi-valued. Set to true for backward
+ compatibility. If false, the mime-type will not be split.
+ </description>
+</property>
+
+<property>
+ <name>moreIndexingFilter.mapMimeTypes</name>
+ <value>false</value>
+ <description>Determines whether MIME-type mapping is enabled. It takes a
+ plain text file with mapped MIME-types. With it the user can map both
+ application/xhtml+xml and text/html to the same target MIME-type so it
+ can be treated equally in an index. See conf/contenttype-mapping.txt.
+ </description>
+</property>
+
+<property>
+ <name>moreIndexingFilter.mapMimeTypes.field</name>
+ <value></value>
+ <description>Used only if moreIndexingFilter.mapMimeTypes is true. Indicates the field
+ to which the mapped MIME-type is written. If it is empty or unset, the content of the field "type"
+ will be replaced by the mapped MIME-type.
+ </description>
+</property>
+
+<!-- AnchorIndexing filter plugin properties -->
+
+<property>
+ <name>anchorIndexingFilter.deduplicate</name>
+ <value>false</value>
+ <description>With this enabled, the indexer will deduplicate anchors case-insensitively
+ before indexing. This prevents potentially hundreds or thousands of identical anchors for
+ a given page from being indexed but will affect the search scoring (i.e. tf=1.0f).
+ </description>
+</property>
+
+<!-- indexingfilter plugin properties -->
+
+<property>
+ <name>indexingfilter.order</name>
+ <value></value>
+ <description>The order by which index filters are applied.
+ If empty, all available index filters (as dictated by properties
+ plugin.includes and plugin.excludes) are loaded and applied in system
+ defined order. If not empty, only named filters are loaded and applied
+ in given order. For example, if this property has value:
+ org.apache.nutch.indexer.basic.BasicIndexingFilter org.apache.nutch.indexer.more.MoreIndexingFilter
+ then BasicIndexingFilter is applied first, and MoreIndexingFilter second.
+
+ Filter ordering might have impact on result if one filter depends on output of
+ another filter.
+ </description>
+</property>
+
+<property>
+ <name>indexer.score.power</name>
+ <value>0.5</value>
+ <description>Determines the power of link analysis scores. The boost
+ of each page is set to <i>score<sup>scorePower</sup></i> where
+ <i>score</i> is its link analysis score and <i>scorePower</i> is the
+ value of this parameter. This is compiled into indexes, so, when
+ this is changed, pages must be re-indexed for it to take
+ effect.</description>
+</property>
+
+<property>
+ <name>indexer.max.title.length</name>
+ <value>100</value>
+ <description>The maximum number of characters of a title that are indexed. A value of -1 disables this check.
+ </description>
+</property>
+
+<property>
+ <name>indexer.max.content.length</name>
+ <value>-1</value>
+ <description>The maximum number of characters of the content that are indexed.
+ Content beyond the limit is truncated. A value of -1 disables this check.
+ </description>
+</property>
+
+<property>
+ <name>indexer.add.domain</name>
+ <value>false</value>
+ <description>Whether to add the domain field to a NutchDocument.</description>
+</property>
+
+<property>
+ <name>indexer.skip.notmodified</name>
+ <value>false</value>
+ <description>Whether the indexer will skip records with a db_notmodified status.
+ </description>
+</property>
+
+<property>
+ <name>indexer.delete.robots.noindex</name>
+ <value>false</value>
+ <description>Whether the indexer will delete documents marked by robots=noindex
+ </description>
+</property>
+
+<property>
+ <name>indexer.delete.skipped.by.indexingfilter</name>
+ <value>false</value>
+ <description>Whether the indexer will delete documents that were skipped by indexing filters
+ </description>
+</property>
+
+<property>
+ <name>indexer.indexwriters.file</name>
+ <value>index-writers.xml</value>
+ <description>The configuration file for index writers.</description>
+</property>
+
+<!-- Exchanges properties -->
+
+<property>
+ <name>exchanges.exchanges.file</name>
+ <value>exchanges.xml</value>
+ <description>The configuration file used by the Exchange component.</description>
+</property>
+
+<!-- URL normalizer properties -->
+
+<property>
+ <name>urlnormalizer.order</name>
+ <value>org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer</value>
+ <description>Order in which normalizers will run. If any of these isn't
+ activated it will be silently skipped. If other normalizers not on the
+ list are activated, they will run in random order after the ones
+ specified here are run.
+ </description>
+</property>
+
+<property>
+ <name>urlnormalizer.regex.file</name>
+ <value>regex-normalize.xml</value>
+ <description>Name of the config file used by the RegexUrlNormalizer class.
+ </description>
+</property>
+
+<property>
+ <name>urlnormalizer.loop.count</name>
+ <value>1</value>
+ <description>Optionally loop through normalizers several times, to make
+ sure that all transformations have been performed.
+ </description>
+</property>
+
+<property>
+ <name>urlnormalizer.basic.host.idn</name>
+ <value></value>
+ <description>Let urlnormalizer-basic
+ (org.apache.nutch.net.urlnormalizer.basic.BasicURLNormalizer)
+ normalize Internationalized Domain Names (IDNs). Possible values
+ are: `toAscii` - convert the Unicode form to the ASCII (Punycode)
+ representation, `toUnicode` - convert ASCII (Punycode) to Unicode,
+ or if left empty no normalization of IDNs is performed.
+ </description>
+</property>
+
+<property>
+ <name>urlnormalizer.basic.host.trim-trailing-dot</name>
+ <value>false</value>
+ <description>urlnormalizer-basic: Trim a trailing dot in host names:
+ `https://example.org./` is normalized to `https://example.org/`.
+ </description>
+</property>
+
+<!-- mime properties -->
+
+<!--
+<property>
+ <name>mime.types.file</name>
+ <value>tika-mimetypes.xml</value>
+ <description>Name of file in CLASSPATH containing filename extension and
+ magic sequence to mime types mapping information. Overrides the default Tika config
+ if specified.
+ </description>
+</property>
+-->
+
+<property>
+ <name>mime.type.magic</name>
+ <value>true</value>
+ <description>Defines if the mime content type detector uses magic resolution.
+ </description>
+</property>
+
+<!-- plugin properties -->
+
+<property>
+ <name>plugin.folders</name>
+ <value>plugins</value>
+ <description>Directories where Nutch plugins are located. Each
+ element may be a relative or absolute path. If absolute, it is used
+ as is. If relative, it is searched for on the classpath.</description>
+</property>
+
+<property>
+ <name>plugin.auto-activation</name>
+ <value>true</value>
+ <description>Defines whether plugins that are not activated according to
+ the plugin.includes and plugin.excludes properties should be automatically
+ activated if they are needed by some active plugins.
+ </description>
+</property>
+
+<property>
+ <name>plugin.includes</name>
+ <value>protocol-http|urlfilter-(regex|validator)|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
+ <description>Regular expression naming plugin directory names to
+ include. Any plugin not matching this expression is excluded.
+ By default Nutch includes plugins to crawl HTML and various other
+ document formats via HTTP/HTTPS and to index the crawled content
+ into Solr. More plugins are available to support more indexing
+ backends, to fetch ftp:// and file:// URLs, for focused crawling,
+ and many other use cases.
+ </description>
+</property>
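+
+<!-- Example (sketch): to additionally fetch ftp:// and file:// URLs, one could
+     override plugin.includes in nutch-site.xml and extend the protocol group of
+     the regular expression, e.g.
+       protocol-(http|httpclient|ftp|file)|urlfilter-(regex|validator)|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)
+     The plugin directory names must match the regular expression exactly. -->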
+
+<property>
+ <name>plugin.excludes</name>
+ <value></value>
+ <description>Regular expression naming plugin directory names to exclude.
+ </description>
+</property>
+
+<property>
+ <name>urlmeta.tags</name>
+ <value></value>
+ <description>
+ To be used in conjunction with the features introduced in NUTCH-655, which allow
+ custom metatags to be injected alongside your crawl URLs. Specifying those
+ custom tags here will allow for their propagation into a page's outlinks, as
+ well as allow for them to be included as part of an index.
+ Values should be comma-delimited ("tag1,tag2,tag3"). Do not pad the tags with
+ white-space at their boundaries if you are using anything earlier than Hadoop-0.21.
+ </description>
+</property>
+
+<!-- parser properties -->
+
+<property>
+ <name>parse.plugin.file</name>
+ <value>parse-plugins.xml</value>
+ <description>The name of the file that defines the associations between
+ content-types and parsers.</description>
+</property>
+
+<property>
+ <name>parser.character.encoding.default</name>
+ <value>windows-1252</value>
+ <description>The character encoding to fall back to when no other information
+ is available.</description>
+</property>
+
+<property>
+ <name>encodingdetector.charset.min.confidence</name>
+ <value>-1</value>
+ <description>An integer between 0 and 100 indicating the minimum confidence value
+ for charset auto-detection. Any negative value disables auto-detection.
+ </description>
+</property>
+
+<property>
+ <name>parser.caching.forbidden.policy</name>
+ <value>content</value>
+ <description>If a site (or a page) requests through its robot metatags
+ that it should not be shown as cached content, apply this policy. Currently
+ three keywords are recognized: "none" ignores any "noarchive" directives.
+ "content" doesn't show the content, but shows summaries (snippets).
+ "all" doesn't show either content or summaries.</description>
+</property>
+
+<property>
+ <name>parser.html.impl</name>
+ <value>neko</value>
+ <description>HTML Parser implementation. Currently the following keywords
+ are recognized: "neko" uses NekoHTML, "tagsoup" uses TagSoup.
+ </description>
+</property>
+
+<property>
+ <name>parser.html.form.use_action</name>
+ <value>false</value>
+ <description>If true, HTML parser will collect URLs from form action
+ attributes. This may lead to undesirable behavior (submitting empty
+ forms during next fetch cycle). If false, form action attribute will
+ be ignored.</description>
+</property>
+
+<property>
+ <name>parser.html.outlinks.ignore_tags</name>
+ <value></value>
+ <description>Comma-separated list of HTML tags from which outlinks
+ shouldn't be extracted. Nutch takes links from: a, area, form, frame,
+ iframe, script, link, img. If you add any of those tags here, links
+ will not be taken from it. The default is an empty list. A reasonable value
+ for most people would be "img,script,link".</description>
+</property>
+
+<property>
+ <name>parser.html.outlinks.htmlnode_metadata_name</name>
+ <value></value>
+ <description>If not empty, the name of the HTML node in which an outlink
+ was found will be stored in the outlink's metadata under this key.</description>
+</property>
+
+<property>
+ <name>parser.html.line.separators</name>
+ <value>article,aside,blockquote,canvas,dd,div,dl,dt,fieldset,figcaption,figure,footer,form,h1,h2,h3,h4,h5,h6,header,hr,li,main,nav,noscript,ol,output,p,pre,section,table,tfoot,ul,video</value>
+ <description>Comma separated list of HTML tags. A newline will be added to the
+ parsed text after these tags.
+ The default list above contains the block-level HTML elements.
+ Tags must be in lower case.
+ To disable this feature, leave the list empty.</description>
+</property>
+
+<property>
+ <name>htmlparsefilter.order</name>
+ <value></value>
+ <description>The order by which HTMLParse filters are applied.
+ If empty, all available HTMLParse filters (as dictated by the properties
+ plugin.includes and plugin.excludes above) are loaded and applied in system
+ defined order. If not empty, only named filters are loaded and applied
+ in given order.
+ HTMLParse filter ordering MAY have an impact
+ on end result, as some filters could rely on the metadata generated by a previous filter.
+ </description>
+</property>
+
+<property>
+ <name>parsefilter.naivebayes.trainfile</name>
+ <value>naivebayes-train.txt</value>
+ <description>Set the name of the file to be used for Naive Bayes training. The format is:
+Each line contains two tab-separated parts:
+1. "1" or "0", "1" for relevant and "0" for irrelevant documents.
+2. Text (the text that will be used for training)
+
+Each line is treated as a separate "document" by the classifier.
+CAUTION: Set parser.timeout to -1 or to a value larger than 30 when using this classifier.
+ </description>
+</property>
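+
+<!-- Illustrative training file content (a sketch following the format
+ described above, with \t standing for a literal tab character):
+ 1 \t Apache Nutch is a highly extensible and scalable web crawler.
+ 0 \t Text about an unrelated topic, labelled as irrelevant.
+-->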
+
+<property>
+ <name>parsefilter.naivebayes.wordlist</name>
+ <value>naivebayes-wordlist.txt</value>
+ <description>Name of the file containing the list of important words to be
+ matched in the URL by the model filter. The format is one word per line.
+ </description>
+</property>
+
+<property>
+ <name>parser.timeout</name>
+ <value>30</value>
+ <description>Timeout in seconds for the parsing of a document. If exceeded, the document
+ is treated as a parsing failure and the parser moves on to the following documents.
+ This parameter is applied to any Parser implementation.
+ Set to -1 to deactivate, bearing in mind that this could cause
+ the parsing to crash because of a very long or corrupted document.
+ </description>
+</property>
+
+<property>
+ <name>parse.filter.urls</name>
+ <value>true</value>
+ <description>Whether the parser will filter URLs (with the configured URL filters).</description>
+</property>
+
+<property>
+ <name>parse.normalize.urls</name>
+ <value>true</value>
+ <description>Whether the parser will normalize URLs (with the configured URL normalizers).</description>
+</property>
+
+<property>
+ <name>parser.skip.truncated</name>
+ <value>true</value>
+ <description>Boolean value for whether we should skip parsing for truncated documents. By default this
+ property is activated because parsing truncated documents can sometimes consume extremely high levels of CPU.
+ </description>
+</property>
+
+<property>
+ <name>parser.store.text</name>
+ <value>true</value>
+ <description>If true (default value), parser will store parse text (parse_text directory within the segment).</description>
+</property>
+
+
+<!--
+<property>
+ <name>tika.htmlmapper.classname</name>
+ <value>org.apache.tika.parser.html.IdentityHtmlMapper</value>
+ <description>Classname of Tika HTMLMapper to use. Influences the elements included in the DOM and hence
+ the behavior of the HTMLParseFilters.
+ </description>
+</property>
+-->
+
+<property>
+ <name>tika.config.file</name>
+ <value>tika-config.xml</value>
+ <description>Nutch-specific Tika config file</description>
+</property>
+
+<property>
+ <name>tika.uppercase.element.names</name>
+ <value>true</value>
+ <description>Determines whether TikaParser should uppercase the element name while generating the DOM
+ for a page, as done by Neko (used by default by parse-html); see NUTCH-1592.
+ </description>
+</property>
+
+<property>
+ <name>tika.extractor</name>
+ <value>none</value>
+ <description>
+ Which text extraction algorithm to use. Valid values are: boilerpipe or none.
+ </description>
+</property>
+
+<property>
+ <name>tika.extractor.boilerpipe.algorithm</name>
+ <value>ArticleExtractor</value>
+ <description>
+ Which Boilerpipe algorithm to use. Valid values are: DefaultExtractor, ArticleExtractor
+ or CanolaExtractor.
+ </description>
+</property>
+
+<property>
+ <name>tika.extractor.boilerpipe.mime.types</name>
+ <value>text/html,application/xhtml+xml</value>
+ <description>
+ Comma-separated list of MIME types accepted for Boilerpipe extraction,
+ documents of other MIME types are not passed to the Boilerpipe extractor.
+ </description>
+</property>
+
+<property>
+ <name>tika.parse.embedded</name>
+ <value>true</value>
+ <description>
+ Whether parse-tika shall parse embedded documents (even recursively).
+ </description>
+</property>
+
+<!-- urlfilter plugin properties -->
+
+<property>
+ <name>urlfilter.domain.file</name>
+ <value>domain-urlfilter.txt</value>
+ <description>Name of file on CLASSPATH containing either top level domains or
+ hostnames used by urlfilter-domain (DomainURLFilter) plugin.</description>
+</property>
+
+<property>
+ <name>urlfilter.regex.file</name>
+ <value>regex-urlfilter.txt</value>
+ <description>Name of file on CLASSPATH containing regular expressions
+ used by urlfilter-regex (RegexURLFilter) plugin.</description>
+</property>
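+
+<!-- Illustrative regex-urlfilter.txt content (a sketch of the rule format):
+ each line starts with '+' (accept) or '-' (reject) followed by a Java
+ regular expression, '#' starts a comment, and the first matching rule wins.
+ # skip image and style resources
+ -\.(gif|jpg|png|css)$
+ # accept everything else
+ +.
+-->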
+
+<property>
+ <name>urlfilter.automaton.file</name>
+ <value>automaton-urlfilter.txt</value>
+ <description>Name of file on CLASSPATH containing regular expressions
+ used by urlfilter-automaton (AutomatonURLFilter) plugin.</description>
+</property>
+
+<property>
+ <name>urlfilter.prefix.file</name>
+ <value>prefix-urlfilter.txt</value>
+ <description>Name of file on CLASSPATH containing url prefixes
+ used by urlfilter-prefix (PrefixURLFilter) plugin.</description>
+</property>
+
+<property>
+ <name>urlfilter.suffix.file</name>
+ <value>suffix-urlfilter.txt</value>
+ <description>Name of file on CLASSPATH containing url suffixes
+ used by urlfilter-suffix (SuffixURLFilter) plugin.</description>
+</property>
+
+<property>
+ <name>urlfilter.fast.file</name>
+ <value>fast-urlfilter.txt</value>
+ <description>Name of file on CLASSPATH containing regular expressions
+ used by urlfilter-fast (FastURLFilter) plugin.</description>
+</property>
+
+<property>
+ <name>urlfilter.order</name>
+ <value></value>
+ <description>The order by which URL filters are applied.
+ If empty, all available URL filters (as dictated by the properties
+ plugin.includes and plugin.excludes above) are loaded and applied in system
+ defined order. If not empty, only named filters are loaded and applied
+ in given order. For example, if this property has value:
+ org.apache.nutch.urlfilter.regex.RegexURLFilter org.apache.nutch.urlfilter.prefix.PrefixURLFilter
+ then RegexURLFilter is applied first, and PrefixURLFilter second.
+ Since all filters are AND'ed, filter ordering does not have an impact
+ on the end result, but it may have performance implications, depending
+ on the relative expensiveness of the filters.
+ </description>
+</property>
+
+<!-- scoring filters properties -->
+
+<property>
+ <name>scoring.filter.order</name>
+ <value></value>
+ <description>The order in which scoring filters are applied. This
+ may be left empty (in which case all available scoring filters will
+ be applied in system defined order), or a space separated list of
+ implementation classes.
+ </description>
+</property>
+
+<!-- scoring-depth properties
+ Add 'scoring-depth' to the list of active plugins
+ in the parameter 'plugin.includes' in order to use it.
+ -->
+
+<property>
+ <name>scoring.depth.max</name>
+ <value>1000</value>
+ <description>Max depth value from seed allowed by default.
+ Can be overridden on a per-seed basis by specifying "_maxdepth_=VALUE"
+ as a seed metadata. This plugin adds a "_depth_" metadatum to the pages
+ to track the distance from the seed it was found from.
+ The depth is used to prioritise URLs in the generation step so that
+ shallower pages are fetched first.
+ </description>
+</property>
+
+<!-- scoring similarity properties
+Add scoring-similarity to the list of active plugins
+ in the parameter 'plugin.includes' in order to use it.
+For more detailed information on the working of this filter
+visit https://cwiki.apache.org/confluence/display/NUTCH/SimilarityScoringFilter -->
+
+<property>
+ <name>scoring.similarity.model</name>
+ <value>cosine</value>
+ <description>The type of similarity metric to use, e.g. cosine (which is currently the only available model).
+ Please make sure to set the model specific properties for the scoring to function properly.
+ Description of these properties can be found on the wiki.
+ </description>
+</property>
+
+ <property>
+ <name>scoring.similarity.ngrams</name>
+ <value>1,1</value>
+ <description>Specifies the min 'n' and max 'n' of the ngrams as comma-separated values.
+ If a single value 'n' is specified, it is used for both the min 'n' and max 'n' of the ngrams.
+ </description>
+</property>
+
+<property>
+ <name>cosine.goldstandard.file</name>
+ <value>goldstandard.txt</value>
+ <description>Path to the gold standard file which contains all the relevant text and terms,
+ pertaining to the domain.
+ </description>
+</property>
+
+ <property>
+ <name>scoring.similarity.stopword.file</name>
+ <value>stopwords.txt</value>
+ <description>Name of the stopword text file. The user can specify a custom list of stop words
+ in a text file. Each new stopword should be on a new line.
+ </description>
+</property>
+
+<!-- scoring filter orphan properties -->
+
+<property>
+ <name>scoring.orphan.mark.gone.after</name>
+ <value>2592000</value>
+ <description>Time in seconds after which orphaned
+ pages are marked as gone. Default is 30 days.
+ </description>
+</property>
+
+<property>
+ <name>scoring.orphan.mark.orphan.after</name>
+ <value>3456000</value>
+ <description>Time in seconds after which orphaned
+ pages are marked as orphan. Default is 40 days.
+ </description>
+</property>
+
+<!-- language-identifier plugin properties -->
+
+<property>
+ <name>lang.analyze.max.length</name>
+ <value>2048</value>
+ <description> The maximum number of bytes used to identify
+ the language (0 means full content analysis).
+ The larger this value is, the better the analysis, but the
+ slower it is.
+ </description>
+</property>
+
+<property>
+ <name>lang.extraction.policy</name>
+ <value>detect,identify</value>
+ <description>This determines when the plugin uses detection and
+ statistical identification mechanisms. The order in which the
+ detect and identify are written will determine the extraction
+ policy. Default case (detect,identify) means the plugin will
+ first try to extract language info from page headers and metadata,
+ if this is not successful it will try using tika language
+ identification. Possible values are:
+ detect
+ identify
+ detect,identify
+ identify,detect
+ </description>
+</property>
+
+<property>
+ <name>lang.identification.only.certain</name>
+ <value>false</value>
+ <description>If set to true with lang.extraction.policy containing identify,
+ the language code returned by Tika will be assigned to the document ONLY
+ if it is deemed certain by Tika.
+ </description>
+</property>
+
+<property>
+ <name>lang.index.languages</name>
+ <value></value>
+ <description>If not empty, should be a comma separated list of language codes.
+ Only documents with one of these language codes will be indexed.
+ "unknown" is a valid language code, will match documents where language
+ detection failed.
+ </description>
+</property>
+
+<!-- index-jexl-filter plugin properties -->
+
+<property>
+ <name>index.jexl.filter</name>
+ <value></value>
+ <description> A JEXL expression. If it evaluates to false,
+ the document will not be indexed.
+ Available primitives in the JEXL context:
+ * status, fetchTime, modifiedTime, retries, interval, score, signature, url, text, title
+ Available objects in the JEXL context:
+ * httpStatus - contains majorCode, minorCode, message
+ * documentMeta, contentMeta, parseMeta - contain all the Metadata properties.
+ each property value is always an array of Strings (so if you expect one value, use [0])
+ * doc - contains all the NutchFields from the NutchDocument.
+ each property value is always an array of Objects.
+ </description>
+</property>
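+
+<!-- Illustrative example (a sketch using the primitives listed above, not a
+ shipped default): only index documents that have a title and a minimum
+ amount of extracted text.
+<property>
+ <name>index.jexl.filter</name>
+ <value>title != null and text.length() > 100</value>
+</property>
+-->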
+
+<!-- index-static plugin properties -->
+
+<property>
+ <name>index.static</name>
+ <value></value>
+ <description>
+ Used by the plugin index-static to add fields with static data at indexing time.
+ You can specify a comma-separated list of fieldname:fieldcontent per Nutch job.
+ Each fieldcontent can have multiple values separated by space, e.g.,
+ field1:value1.1 value1.2 value1.3,field2:value2.1 value2.2 ...
+ It can be useful when collections can't be defined by URL patterns
+ (as in the subcollection plugin) but only on a per-job basis.
+ </description>
+</property>
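+
+<!-- Illustrative example (a sketch; "source" and "collection" are just
+ example field names): add a single-valued "source" field and a
+ multi-valued "collection" field to every document indexed by the job:
+<property>
+ <name>index.static</name>
+ <value>source:nutch,collection:news sports</value>
+</property>
+-->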
+
+<property>
+ <name>index.static.fieldsep</name>
+ <value>,</value>
+ <description>
+ Used by plugin index-static to parse the property index.static. Default: comma.
+ This delimiter is used to separate individual field specifications in the property.
+ </description>
+</property>
+
+<property>
+ <name>index.static.keysep</name>
+ <value>:</value>
+ <description>
+ Used by plugin index-static to parse the property index.static. Default: colon.
+ This delimiter is used to separate the field name from the field value in the field specification.
+ </description>
+</property>
+
+<property>
+ <name>index.static.valuesep</name>
+ <value> </value>
+ <description>
+ Used by plugin index-static to parse the property index.static. Default: space.
+ This delimiter is used to separate multiple field values in the value setting of the field specification.
+ </description>
+</property>
+
+
+<!-- index-metadata plugin properties -->
+
+<property>
+ <name>index.parse.md</name>
+ <value>metatag.description,metatag.keywords</value>
+ <description>
+ Comma-separated list of keys to be taken from the parse metadata to generate fields.
+ Can be used e.g. for 'description' or 'keywords' provided that these values are generated
+ by a parser (see parse-metatags plugin)
+ </description>
+</property>
+
+<property>
+ <name>index.content.md</name>
+ <value></value>
+ <description>
+ Comma-separated list of keys to be taken from the content metadata to generate fields.
+ </description>
+</property>
+
+<property>
+ <name>index.db.md</name>
+ <value></value>
+ <description>
+ Comma-separated list of keys to be taken from the crawldb metadata to generate fields.
+ Can be used to index values propagated from the seeds with the plugin urlmeta
+ </description>
+</property>
+
+<property>
+ <name>index.metadata.separator</name>
+ <value></value>
+ <description>
+ Separator to use if you want to index multiple values for a given field. Leave empty to
+ treat each value as a single value.
+ </description>
+</property>
+
+<!-- index-geoip plugin properties -->
+<property>
+ <name>index.geoip.usage</name>
+ <value>insightsService</value>
+ <description>
+ A string representing the information source to be used for GeoIP information
+ association. Either enter 'cityDatabase', 'connectionTypeDatabase',
+ 'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the
+ Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb,
+ GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the classpath and
+ available at runtime.
+ </description>
+</property>
+
+<property>
+ <name>index.geoip.userid</name>
+ <value></value>
+ <description>
+ The userId associated with the GeoIP2 Precision Services account.
+ </description>
+</property>
+
+<property>
+ <name>index.geoip.licensekey</name>
+ <value></value>
+ <description>
+ The license key associated with the GeoIP2 Precision Services account.
+ </description>
+</property>
+
+<property>
+ <name>index.replace.regexp</name>
+ <value/>
+ <description>Allows indexing-time regexp replace manipulation of metadata fields.
+ The format of the property is a list of regexp replacements, one line per field being
+ modified. Include index-replace in your plugin.includes.
+
+ Example:
+ hostmatch=.*somedomain.com
+ fldname1=/regexp/replacement/flags
+ fldname2=/regexp/replacement/flags
+
+ Field names would be one of those from
+ https://cwiki.apache.org/confluence/display/NUTCH/IndexStructure
+ See https://cwiki.apache.org/confluence/display/NUTCH/IndexReplace for further details.
+ </description>
+</property>
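+
+<!-- Illustrative example (a sketch based on the format described above;
+ "title" is just an example field name): trim trailing whitespace from the
+ title field of pages on somedomain.com:
+<property>
+ <name>index.replace.regexp</name>
+ <value>
+ hostmatch=.*somedomain.com
+ title=/\s+$//
+ </value>
+</property>
+-->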
+
+<!-- parse-metatags plugin properties -->
+<property>
+ <name>metatags.names</name>
+ <value>description,keywords</value>
+ <description> Names of the metatags to extract, separated by ','.
+ Use '*' to extract all metatags. The names are prefixed with 'metatag.'
+ in the parse metadata. For instance, to index description and keywords,
+ you need to activate the plugin index-metadata and set the value of the
+ parameter 'index.parse.md' to 'metatag.description,metatag.keywords'.
+ </description>
+</property>
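+
+<!-- Illustrative example (a sketch, not part of the shipped defaults): to
+ index the description and keywords metatags, activate parse-metatags and
+ index-metadata in plugin.includes and keep the two properties aligned:
+<property>
+ <name>metatags.names</name>
+ <value>description,keywords</value>
+</property>
+<property>
+ <name>index.parse.md</name>
+ <value>metatag.description,metatag.keywords</value>
+</property>
+-->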
+
+<!-- Temporary Hadoop 0.17.x workaround. -->
+
+<property>
+ <name>hadoop.job.history.user.location</name>
+ <value>${hadoop.log.dir}/history/user</value>
+ <description>Hadoop 0.17.x comes with a default setting to create
+ user logs inside the output path of the job. This breaks some
+ Hadoop classes, which expect the output to contain only
+ part-XXXXX files. This setting changes the output to a
+ subdirectory of the regular log directory.
+ </description>
+</property>
+
+<property>
+ <name>io.serializations</name>
+ <value>org.apache.hadoop.io.serializer.WritableSerialization,org.apache.hadoop.io.serializer.JavaSerialization</value>
+ <!-- org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization,
+ org.apache.hadoop.io.serializer.avro.AvroReflectSerialization,
+ org.apache.hadoop.io.serializer.avro.AvroGenericSerialization, -->
+ <description>A list of serialization classes that can be used for
+ obtaining serializers and deserializers.</description>
+</property>
+
+<!-- linkrank scoring properties -->
+
+<property>
+ <name>link.ignore.internal.host</name>
+ <value>true</value>
+ <description>Ignore outlinks to the same hostname.</description>
+</property>
+
+<property>
+ <name>link.ignore.internal.domain</name>
+ <value>true</value>
+ <description>Ignore outlinks to the same domain.</description>
+</property>
+
+<property>
+ <name>link.ignore.limit.page</name>
+ <value>true</value>
+ <description>Limit to only a single outlink to the same page.</description>
+</property>
+
+<property>
+ <name>link.ignore.limit.domain</name>
+ <value>true</value>
+ <description>Limit to only a single outlink to the same domain.</description>
+</property>
+
+<property>
+ <name>link.analyze.num.iterations</name>
+ <value>10</value>
+ <description>The number of LinkRank iterations to run.</description>
+</property>
+
+<property>
+ <name>link.analyze.initial.score</name>
+ <value>1.0f</value>
+ <description>The initial score.</description>
+</property>
+
+<property>
+ <name>link.analyze.damping.factor</name>
+ <value>0.85f</value>
+ <description>The damping factor.</description>
+</property>
+
+<property>
+ <name>link.delete.gone</name>
+ <value>false</value>
+ <description>Whether to delete gone pages from the web graph.</description>
+</property>
+
+<property>
+ <name>link.loops.depth</name>
+ <value>2</value>
+ <description>The depth for the loops algorithm.</description>
+</property>
+
+<property>
+ <name>link.score.updater.clear.score</name>
+ <value>0.0f</value>
+ <description>The default score for URLs that are not in the web graph.</description>
+</property>
+
+<property>
+ <name>mapreduce.fileoutputcommitter.marksuccessfuljobs</name>
+ <value>false</value>
+ <description>Hadoop >= 0.21 generates _SUCCESS files in the output which can crash
+ the readers. This should not be an issue once Nutch is ported to the new MapReduce API,
+ but for now this parameter should prevent such cases.
+ </description>
+</property>
+
+<!-- subcollection properties -->
+
+<property>
+ <name>subcollection.default.fieldname</name>
+ <value>subcollection</value>
+ <description>
+ The default field name for the subcollections.
+ </description>
+</property>
+
+<property>
+ <name>subcollection.case.insensitive</name>
+ <value>false</value>
+ <description>
+ Whether the URL prefixes are to be treated case insensitive.
+ </description>
+</property>
+
+<!-- Headings plugin properties -->
+
+<property>
+ <name>headings</name>
+ <value>h1,h2</value>
+ <description>Comma separated list of headings to retrieve from the document</description>
+</property>
+
+<property>
+ <name>headings.multivalued</name>
+ <value>false</value>
+ <description>Whether to support multivalued headings.</description>
+</property>
+
+<!-- mimetype-filter plugin properties -->
+
+<property>
+ <name>mimetype.filter.file</name>
+ <value>mimetype-filter.txt</value>
+ <description>
+ The configuration file for the mimetype-filter plugin. This file contains
+ the rules used to allow or deny the indexing of certain documents.
+ </description>
+</property>
+
+<!-- plugin properties that applies to lib-selenium, protocol-selenium,
+ protocol-interactiveselenium, lib-htmlunit, protocol-htmlunit -->
+
+<property>
+ <name>page.load.delay</name>
+ <value>3</value>
+ <description>
+ The delay in seconds to use when loading a page with htmlunit or selenium.
+ </description>
+</property>
+
+<property>
+ <name>take.screenshot</name>
+ <value>false</value>
+ <description>
+ Boolean property determining whether the protocol-htmlunit
+ WebDriver should capture a screenshot of the URL. If set to
+ true remember to define the 'screenshot.location'
+ property as this determines the location screenshots should be
+ persisted to on HDFS. If that property is not set, screenshots
+ are simply discarded.
+ </description>
+</property>
+
+<property>
+ <name>screenshot.location</name>
+ <value></value>
+ <description>
+ The location on disk where a URL screenshot should be saved
+ to if the 'take.screenshot' property is set to true.
+ By default this is null; in this case screenshots held in memory
+ are simply discarded.
+ </description>
+</property>
+
+<!-- lib-htmlunit plugin properties; applies to protocol-htmlunit -->
+
+<property>
+ <name>htmlunit.enable.javascript</name>
+ <value>true</value>
+ <description>
+ A Boolean value indicating whether javascript should
+ be enabled or disabled when using htmlunit. The default value is enabled.
+ </description>
+</property>
+
+<property>
+ <name>htmlunit.javascript.timeout</name>
+ <value>3500</value>
+ <description>
+ The timeout in milliseconds when loading javascript with lib-htmlunit. This
+ setting is used by protocol-htmlunit since it depends on
+ lib-htmlunit for fetching.
+ </description>
+</property>
+
+<property>
+ <name>htmlunit.enable.css</name>
+ <value>false</value>
+ <description>
+ A Boolean value indicating whether CSS should
+ be enabled or disabled when using htmlunit. The default value is disabled.
+ </description>
+</property>
+
+<!-- protocol-selenium plugin properties -->
+
+<property>
+ <name>selenium.driver</name>
+ <value>firefox</value>
+ <description>
+ A String value representing the flavour of Selenium
+ WebDriver() to use. Currently the following options
+ exist - 'firefox', 'chrome', 'safari', 'opera' and 'remote'.
+ If 'remote' is used it is essential to also set correct properties for
+ 'selenium.hub.port', 'selenium.hub.path', 'selenium.hub.host',
+ 'selenium.hub.protocol', 'selenium.grid.driver', 'selenium.grid.binary'
+ and 'selenium.enable.headless'.
+ </description>
+</property>
+
+<property>
+ <name>selenium.hub.port</name>
+ <value>4444</value>
+ <description>Selenium Hub Location connection port</description>
+</property>
+
+<property>
+ <name>selenium.hub.path</name>
+ <value>/wd/hub</value>
+ <description>Selenium Hub Location connection path</description>
+</property>
+
+<property>
+ <name>selenium.hub.host</name>
+ <value>localhost</value>
+ <description>Selenium Hub Location connection host</description>
+</property>
+
+<property>
+ <name>selenium.hub.protocol</name>
+ <value>http</value>
+ <description>Selenium Hub Location connection protocol</description>
+</property>
+
+<property>
+ <name>selenium.grid.driver</name>
+ <value>firefox</value>
+ <description>A String value representing the flavour of Selenium
+ WebDriver() used on the Selenium grid. You must set 'selenium.driver' to 'remote' first.
+ Currently the following options
+ exist: 'firefox', 'chrome', 'random'.</description>
+</property>
+
+<property>
+ <name>selenium.grid.binary</name>
+ <value></value>
+ <description>A String value representing the path to the browser binary
+ location for each node
+ </description>
+</property>
+
+<!-- headless options for Firefox and Chrome-->
+<property>
+ <name>selenium.enable.headless</name>
+ <value>false</value>
+ <description>A Boolean value representing the headless option
+ for Firefox and Chrome drivers
+ </description>
+</property>
+<!-- selenium firefox configuration;
+ applies to protocol-selenium and protocol-interactiveselenium plugins -->
+<property>
+ <name>selenium.firefox.allowed.hosts</name>
+ <value>localhost</value>
+ <description>A String value representing the allowed hosts preference
+ according to the operating system hosts file (example: /etc/hosts on Unix).
+ Currently this option exists only for 'firefox'.</description>
+</property>
+
+<property>
+ <name>selenium.firefox.binary.timeout</name>
+ <value>45</value>
+ <description>A Long value representing the timeout
+ for Firefox to become available for command execution. The value is in seconds.
+ Currently this option exists only for 'firefox'.</description>
+</property>
+
+<property>
+ <name>selenium.firefox.enable.flash</name>
+ <value>false</value>
+ <description>A Boolean value indicating whether Flash should
+ be enabled or disabled. The default value is disabled.
+ Currently this option exists only for 'firefox'.</description>
+</property>
+
+<property>
+ <name>selenium.firefox.load.image</name>
+ <value>1</value>
+ <description>An Integer value representing the restriction on
+ loading images. The default value is 1, i.e. no restriction: all images are loaded.
+ The available options are:
+ 1: Load all images, regardless of origin
+ 2: Block all images
+ 3: Prevent third-party images from loading
+ Currently this option exists only for 'firefox'.</description>
+</property>
+
+<property>
+ <name>selenium.firefox.load.stylesheet</name>
+ <value>1</value>
+ <description>An Integer value representing the restriction on
+ loading stylesheets. The default value is 1, i.e. no restriction: all
+ stylesheets are loaded.
+ The available options are:
+ 1: Load all stylesheets
+ 2: Block all stylesheets
+ Currently this option exists only for 'firefox'.</description>
+</property>
+
+<!-- selenium chrome configurations -->
+<property>
+ <name>webdriver.chrome.driver</name>
+ <value>/root/chromedriver</value>
+ <description>The path to the ChromeDriver binary</description>
+</property>
+<!-- end of selenium chrome configurations -->
+
+<!-- protocol-interactiveselenium configuration -->
+<property>
+ <name>interactiveselenium.handlers</name>
+ <value>DefaultHandler</value>
+ <description>
+ A comma separated list of Selenium handlers that should be run for a given
+ URL. The DefaultHandler provides the same functionality as protocol-selenium.
+ Custom handlers can be implemented in the plugin package and included here.
+ </description>
+</property>
+
+<property>
+ <name>store.http.request</name>
+ <value>false</value>
+ <description>
+ Store the raw request made by Nutch, required to use the CommonCrawlDataDumper
+ tool for the WARC format.
+ </description>
+</property>
+
+<property>
+ <name>store.http.headers</name>
+ <value>false</value>
+ <description>
+ Store the raw headers received by Nutch from the server, required to use the
+ CommonCrawlDataDumper tool for the WARC format.
+ </description>
+</property>
+
+<!-- index-links plugin -->
+
+<property>
+ <name>index.links.outlinks.host.ignore</name>
+ <value>false</value>
+ <description>
+ Ignore outlinks that point to the same host as the URL being indexed.
+ By default all outlinks are indexed. If db.ignore.internal.links is true (default
+ value), this setting does nothing since the internal links are already
+ ignored.
+ </description>
+</property>
+
+<property>
+ <name>index.links.inlinks.host.ignore</name>
+ <value>false</value>
+ <description>
+ Ignore inlinks coming from the same host as the URL being indexed. By default
+ all inlinks are indexed. If db.ignore.internal.links is true (default
+ value), this setting does nothing since the internal links are already
+ ignored.
+ </description>
+</property>
+
+<property>
+ <name>index.links.hosts.only</name>
+ <value>false</value>
+ <description>
+ This forces the index-links plugin to index only the host portion of the inlinks
+ or outlinks.
+ </description>
+</property>
+
+<!-- HostDB settings -->
+<property>
+ <name>hostdb.recheck.interval</name>
+ <value>86400000</value>
+ <description>
+ Interval between rechecks in milliseconds. The default value of 86400000 ms
+ corresponds to one day. The recheck interval is multiplied by the number of
+ DNS lookup failures for a given host.
+ </description>
+</property>
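+
+<!-- Worked example (based on the description above): with the default
+ interval of 86400000 ms (one day), a host with 2 failed DNS lookups is
+ rechecked after roughly 2 x 86400000 ms, i.e. about two days.
+-->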
+
+<property>
+ <name>hostdb.purge.failed.hosts.threshold</name>
+ <value>3</value>
+ <description>
+ If hosts have more failed DNS lookups than this threshold, they are
+ removed from the HostDB. Hosts can, of course, return if they are still
+ present in the CrawlDB.
+ </description>
+</property>
+
+<property>
+ <name>hostdb.num.resolvers.threads</name>
+ <value>25</value>
+ <description>
+ Number of resolver threads per reducer. Make sure your DNS resolver is
+ capable of handling this value multiplied by the number of reducers.
+ </description>
+</property>
+
+<property>
+ <name>hostdb.check.failed</name>
+ <value>true</value>
+ <description>
+ True if hosts for which DNS lookup failed are eligible for recheck. If
+ false, hosts whose DNS lookup has failed at least once are not eligible
+ for further DNS lookups.
+ </description>
+</property>
+
+<property>
+ <name>hostdb.check.new</name>
+ <value>true</value>
+ <description>
+ True if newly discovered hosts are eligible for a DNS lookup check. If false,
+ hosts that have just been added to the HostDB are not eligible for DNS lookup.
+ </description>
+</property>
+
+<property>
+ <name>hostdb.check.known</name>
+ <value>true</value>
+ <description>
+ True if already known hosts are eligible for a DNS lookup check. If false,
+ known hosts are not eligible for DNS lookup.
+ </description>
+</property>
+
+<property>
+ <name>hostdb.force.check</name>
+ <value>false</value>
+ <description>
[... 236 lines stripped ...]