You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@nutch.apache.org by devil devil <de...@gmx.com> on 2017/12/22 20:24:51 UTC

Nutch 2.x does not send index to ElasticSearch 2.3.3

Hello, 
    I am running Nutch 2.x and Elasticsearch 2.3.3 in two containers. I can log into the Nutch container and curl Elasticsearch, so connectivity is there. Inject/Fetch/etc. all work fine. However, when I get to "nutch index elasticsearch", all I get is:
 
    root@b211135e1be5:~/nutch/bin# ./nutch index elasticsearch -all
    IndexingJob: starting
    Active IndexWriters :
    ElasticIndexWriter
         elastic.cluster : elastic prefix cluster
        elastic.host : hostname
        elastic.port : port  (default 9300)
        elastic.index : elastic index command 
        elastic.max.bulk.docs : elastic bulk index doc counts. (default 250) 
        elastic.max.bulk.size : elastic bulk index length. (default 2500500 ~2.5MB)
 
   I tried various Elasticsearch versions and various combinations of settings, but I am still getting nowhere.
   My elasticsearch.conf is empty (should I have something here?).
   Below is my nutch-site.xml. (I was using indexer-elastic before but was getting "No indexwriters found" errors. Then I saw there is an indexer-elastic2 plugin.)
 

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<!--
  Every property name is defined exactly once below.
  Earlier revisions of this file defined parser.character.encoding.default
  twice (same value both times) and http.content.limit twice (first as -1,
  then as 6553600). Only the later http.content.limit value is kept.
  NOTE(review): this assumes that the last definition of a name within one
  resource wins when the file is loaded; if unlimited content (-1) was the
  intent, change the value in the HTTP section.
-->
<configuration>

  <!-- Parsing -->
  <property>
    <name>parser.character.encoding.default</name>
    <value>utf-8</value>
  </property>

  <!-- Active plugins (regular expression matched against plugin ids) -->
  <property>
    <name>plugin.includes</name>
    <value>protocol-http|urlfilter-regex|parse-(html|tika|text)|index-(basic|anchor)|scoring-opic|urlnormalizer-(pass|regex|basic)|indexer-elastic2</value>
    <description>plugins</description>
  </property>

  <!-- Storage backend -->
  <property>
    <name>storage.data.store.class</name>
    <value>org.apache.gora.hbase.store.HBaseStore</value>
    <description>Default class for storing data</description>
  </property>

  <!-- HTTP agent identity and protocol behaviour -->
  <property>
    <name>http.agent.name</name>
    <value>Crawler</value>
  </property>
  <property>
    <name>http.robots.agents</name>
    <value>Crawler,*</value>
  </property>
  <property>
    <name>http.robots.403.allow</name>
    <value>true</value>
  </property>
  <property>
    <name>http.timeout</name>
    <value>120000</value>
    <description>The default network timeout, in milliseconds.</description>
  </property>
  <property>
    <name>http.useHttp11</name>
    <value>true</value>
  </property>
  <property>
    <name>http.content.limit</name>
    <value>6553600</value>
  </property>
  <property>
    <name>file.content.limit</name>
    <value>-1</value>
  </property>

  <!-- Link handling -->
  <property>
    <name>db.ignore.external.links</name>
    <value>true</value>
  </property>
  <property>
    <name>db.ignore.external.links.mode</name>
    <value>byDomain</value>
  </property>
  <property>
    <name>db.ignore.internal.links</name>
    <value>false</value>
  </property>
  <property>
    <name>generate.update.crawldb</name>
    <value>true</value>
  </property>

  <!-- Fetcher threading and politeness -->
  <property>
    <name>fetcher.threads.fetch</name>
    <value>10</value>
  </property>
  <property>
    <name>fetcher.threads.per.queue</name>
    <value>10</value>
  </property>
  <property>
    <name>fetcher.server.delay</name>
    <value>1.0</value>
    <description>The number of seconds the fetcher will delay between 
     successive requests to the same server.</description>
  </property>
  <!--
    NOTE(review): fetcher.threads.per.queue above carries the same value.
    Confirm whether the Nutch version in use still reads
    fetcher.threads.per.host; if not, this entry can be dropped.
  -->
  <property>
    <name>fetcher.threads.per.host</name>
    <value>10</value>
    <description>This number is the maximum number of threads that
      should be allowed to access a host at one time.</description>
  </property>

  <!-- Re-fetch schedule -->
  <property>
    <name>db.fetch.interval.default</name>
    <value>18000</value>
    <description>The number of seconds between re-fetches of a page (5hours).</description>
  </property>
  <property>
    <name>db.fetch.interval.max</name>
    <value>43200</value>
  </property>

  <!-- Elasticsearch index writer -->
  <property>
    <name>elastic.host</name>
    <value>172.20.128.4</value>
  </property>
  <property>
    <name>elastic.port</name>
    <value>9300</value>
  </property>
  <property>
    <name>elastic.cluster</name>
    <value>elasticsearch</value>
  </property>
  <property>
    <name>elastic.index</name>
    <value>nutchindex</value>
  </property>
  <property>
    <name>elastic.max.bulk.docs</name>
    <value>250</value>
    <description>Maximum size of the bulk in number of documents.</description>
  </property>
  <property>
    <name>elastic.max.bulk.size</name>
    <value>2500500</value>
    <description>Maximum size of the bulk in bytes.</description>
  </property>
</configuration>