You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@nutch.apache.org by devil devil <de...@gmx.com> on 2017/12/22 20:24:51 UTC
Nutch 2.x does not send index to ElasticSearch 2.3.3
Hello,
I am running Nutch 2.x and Elasticsearch 2.3.3 in two containers. I can log into the Nutch container and curl Elasticsearch, so connectivity is there. Inject, fetch, etc. all work fine. However, when I get to "nutch index elasticsearch", all I get is:
root@b211135e1be5:~/nutch/bin# ./nutch index elasticsearch -all
IndexingJob: starting
Active IndexWriters :
ElasticIndexWriter
elastic.cluster : elastic prefix cluster
elastic.host : hostname
elastic.port : port (default 9300)
elastic.index : elastic index command
elastic.max.bulk.docs : elastic bulk index doc counts. (default 250)
elastic.max.bulk.size : elastic bulk index length. (default 2500500 ~2.5MB)
I have tried various Elasticsearch versions and various combinations of settings, but I am still getting nowhere.
My elasticsearch.conf is empty (should I have something here?)
Below is my nutch-site.xml. (I was using indexer-elastic before, but was getting "No indexwriters found" errors. Then I saw that there is an indexer-elastic2 plugin.)
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Site-specific property overrides for Nutch (nutch-site.xml).

  Each property name appears exactly once. The original file declared
  parser.character.encoding.default and http.content.limit twice; when a
  name is repeated inside one resource the later declaration wins, so the
  earlier entries were dead. They are removed here and the effective
  values are kept.
-->
<configuration>

  <!-- Plugins and storage -->
  <property>
    <name>plugin.includes</name>
    <value>protocol-http|urlfilter-regex|parse-(html|tika|text)|index-(basic|anchor)|scoring-opic|urlnormalizer-(pass|regex|basic)|indexer-elastic2</value>
    <description>plugins</description>
  </property>
  <property>
    <name>storage.data.store.class</name>
    <value>org.apache.gora.hbase.store.HBaseStore</value>
    <description>Default class for storing data</description>
  </property>

  <!-- Parsing -->
  <property>
    <name>parser.character.encoding.default</name>
    <value>utf-8</value>
  </property>

  <!-- HTTP protocol -->
  <property>
    <name>http.agent.name</name>
    <value>Crawler</value>
  </property>
  <property>
    <name>http.robots.agents</name>
    <value>Crawler,*</value>
  </property>
  <property>
    <name>http.robots.403.allow</name>
    <value>true</value>
  </property>
  <property>
    <name>http.timeout</name>
    <value>120000</value>
    <description>The default network timeout, in milliseconds.</description>
  </property>
  <property>
    <name>http.useHttp11</name>
    <value>true</value>
  </property>
  <!--
    The original file set this property to -1 first and to 6553600 further
    down. The later value is the one that took effect, so it is the one
    kept. NOTE(review): confirm whether unlimited (-1) was actually intended.
  -->
  <property>
    <name>http.content.limit</name>
    <value>6553600</value>
  </property>
  <property>
    <name>file.content.limit</name>
    <value>-1</value>
  </property>

  <!-- Link handling and crawl database -->
  <property>
    <name>db.ignore.external.links</name>
    <value>true</value>
  </property>
  <property>
    <name>db.ignore.external.links.mode</name>
    <value>byDomain</value>
  </property>
  <property>
    <name>db.ignore.internal.links</name>
    <value>false</value>
  </property>
  <property>
    <name>generate.update.crawldb</name>
    <value>true</value>
  </property>
  <property>
    <name>db.fetch.interval.default</name>
    <value>18000</value>
    <description>The number of seconds between re-fetches of a page (5hours).</description>
  </property>
  <property>
    <name>db.fetch.interval.max</name>
    <value>43200</value>
  </property>

  <!-- Fetcher -->
  <property>
    <name>fetcher.threads.fetch</name>
    <value>10</value>
  </property>
  <property>
    <name>fetcher.threads.per.queue</name>
    <value>10</value>
  </property>
  <property>
    <name>fetcher.server.delay</name>
    <value>1.0</value>
    <description>The number of seconds the fetcher will delay between successive requests to the same server.</description>
  </property>
  <!-- NOTE(review): presumably a legacy counterpart of fetcher.threads.per.queue; verify that this Nutch version still reads it. -->
  <property>
    <name>fetcher.threads.per.host</name>
    <value>10</value>
    <description>This number is the maximum number of threads that should be allowed to access a host at one time.</description>
  </property>

  <!-- Elasticsearch index writer -->
  <property>
    <name>elastic.host</name>
    <value>172.20.128.4</value>
  </property>
  <!-- NOTE(review): 9300 is presumably the Elasticsearch transport port, not the HTTP port that curl talks to; confirm it is reachable from the Nutch container. -->
  <property>
    <name>elastic.port</name>
    <value>9300</value>
  </property>
  <!-- NOTE(review): presumably has to match cluster.name on the Elasticsearch node; verify. -->
  <property>
    <name>elastic.cluster</name>
    <value>elasticsearch</value>
  </property>
  <property>
    <name>elastic.index</name>
    <value>nutchindex</value>
  </property>
  <property>
    <name>elastic.max.bulk.docs</name>
    <value>250</value>
    <description>Maximum size of the bulk in number of documents.</description>
  </property>
  <property>
    <name>elastic.max.bulk.size</name>
    <value>2500500</value>
    <description>Maximum size of the bulk in bytes.</description>
  </property>

</configuration>