Posted to user@nutch.apache.org by "Chaushu, Shani" <sh...@intel.com> on 2016/05/26 13:06:29 UTC

optimize configuration

Hi,
I'm running Nutch 1.9 on Hadoop & YARN, 3 nodes.
Is there a guide anywhere with an optimized configuration, so that Nutch will run in the most efficient way?
This is my current nutch-site.xml:


<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<!-- Put site-specific property overrides in this file. -->

<configuration>


<property>
  <name>http.redirect.max</name>
  <value>5</value>
  <description>The maximum number of redirects the fetcher will follow when
  trying to fetch a page. If set to a negative value or 0, the fetcher won't
  immediately follow redirected URLs; instead, it will record them for later fetching.
  </description>
</property>

<property>
  <name>solr.commit.size</name>
  <value>10000</value>
  <description>
  Defines the number of documents to send to Solr in a single update batch.
  Decrease when handling very large documents to prevent Nutch from running
  out of memory. NOTE: It does not explicitly trigger a server side commit.
  </description>
</property>

   <property>
      <name>http.agent.name</name>
      <value>Crawler</value>
   </property>

   <!-- plugin for eclipse
   <property>
      <name>plugin.folders</name>
      <value>/opt/apache-nutch-1.9/plugins</value>
      <description />
   </property>
   -->

   <property>
      <name>db.ignore.external.links</name>
      <value>true</value>
      <description>If true, outlinks leading from a page to external hosts
         will be ignored. This is an effective way to limit the crawl to include
         only initially injected hosts, without creating complex URLFilters.
      </description>
   </property>

<property>
  <name>db.ignore.internal.links</name>
  <value>false</value>
  <description>If true, when adding new links to a page, links from
  the same host are ignored.  This is an effective way to limit the
  size of the link database, keeping only the highest quality
  links.
  </description>
</property>


   <property>
      <name>db.max.outlinks.per.page</name>
      <value>-1</value>
      <description>The maximum number of outlinks that we'll process for a page.
         If this value is nonnegative (>=0), at most db.max.outlinks.per.page outlinks
         will be processed for a page; otherwise, all outlinks will be processed.
      </description>
   </property>


   <property>
      <name>fetcher.threads.fetch</name>
      <value>100</value>
      <description>The number of FetcherThreads the fetcher should use.
         This also determines the maximum number of requests that are
         made at once (each FetcherThread handles one connection). The total
         number of threads running in distributed mode will be the number of
         fetcher threads * number of nodes as fetcher has one map task per node.
      </description>
   </property>
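
   <!-- For illustration: with fetcher.threads.fetch=100 and the 3 nodes
        mentioned above (one fetcher map task per node), up to
        100 * 3 = 300 connections may be open at once across the cluster. -->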


   <property>
      <name>fetcher.queue.depth.multiplier</name>
      <value>150</value>
      <description>(EXPERT) The fetcher buffers the incoming URLs into queues based on the [host|domain|IP]
         (see param fetcher.queue.mode). The depth of the queue is the number of threads times the value of this parameter.
         A large value requires more memory but can improve the performance of the fetch when the order of the URLs in the fetch list
         is not optimal.
      </description>
   </property>
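
   <!-- For illustration: queue depth = fetcher.threads.fetch * this
        multiplier, so with the values above each fetcher map task keeps
        up to 100 * 150 = 15,000 URLs buffered in memory. -->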


   <property>
      <name>fetcher.threads.per.queue</name>
      <value>10</value>
       <description>This number is the maximum number of threads that
             should be allowed to access a queue at one time. Setting it to
             a value > 1 will cause the Crawl-Delay value from robots.txt to
             be ignored and the value of fetcher.server.min.delay to be used
             as a delay between successive requests to the same server instead
             of fetcher.server.delay.
          </description>
   </property>

   <property>
      <name>fetcher.server.min.delay</name>
      <value>0.0</value>
      <description>The minimum number of seconds the fetcher will delay between
         successive requests to the same server. This value is applicable ONLY
         if fetcher.threads.per.queue is greater than 1 (i.e. the host blocking
         is turned off).
      </description>
   </property>
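
   <!-- Combined effect of the two properties above, as their descriptions
        state: with fetcher.threads.per.queue=10, the robots.txt Crawl-Delay
        is ignored and fetcher.server.min.delay is used instead; at 0.0 this
        means up to 10 concurrent requests per host with no politeness delay. -->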


   <property>
      <name>fetcher.max.crawl.delay</name>
      <value>5</value>
      <description>
         If the Crawl-Delay in robots.txt is set to greater than this value (in
         seconds) then the fetcher will skip this page, generating an error report.
         If set to -1 the fetcher will never skip such pages and will wait the
         amount of time retrieved from robots.txt Crawl-Delay, however long that
         might be.
      </description>
   </property>
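
   <!-- Worked example of the rule above: a robots.txt with "Crawl-Delay: 10"
        exceeds the 5-second limit, so the fetcher skips that site's pages and
        reports an error; "Crawl-Delay: 3" would be honored and waited out. -->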


<property>
  <name>generate.max.count</name>
  <value>-1</value>
  <description>The maximum number of urls in a single
  fetchlist.  -1 if unlimited. The urls are counted according
  to the value of the parameter generate.count.mode.
  </description>
</property>
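
<!-- Illustrative sketch only; these values are assumptions for the example,
     not part of the running config. To cap each fetchlist at 1000 URLs per
     host instead of leaving it unlimited, something like this could be used:

<property>
  <name>generate.max.count</name>
  <value>1000</value>
</property>
<property>
  <name>generate.count.mode</name>
  <value>host</value>
</property>
-->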



<property>
  <name>plugin.includes</name>
 <value>protocol-selenium|protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)|protocol-httpclient</value>

<!--
  <value>protocol-selenium|protocol-http|urlfilter-regex|parse-(html|tika)|index-(basic|anchor)|indexer-solr|scoring-opic|urlnormalizer-(pass|regex|basic)|language-identifier|protocol-httpclient</value>
-->
<description>Regular expression naming plugin directory names to
  include.  Any plugin not matching this expression is excluded.
  In any case you need to at least include the nutch-extensionpoints plugin. By
  default Nutch includes crawling just HTML and plain text via HTTP,
  and basic indexing and search plugins. In order to use HTTPS please enable
  protocol-httpclient, but be aware of possible intermittent problems with the
  underlying commons-httpclient library.
  </description>
</property>

</configuration>

