You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@nutch.apache.org by hta9323 <ph...@gmail.com> on 2015/01/23 17:52:12 UTC

Nutch 1.7 indexing throws exception - java.lang.Exception: java.lang.NullPointerException

Hello,

I get the following in my hadoop log file when trying to crawl my site.
I've read there might be a disconnect between the solrindex-mapping.xml file
and solr schema.xml file, but upon inspection I do not see anything off...

I have been able to crawl my site before and am not sure why it has suddenly
stopped working. Can anyone point me in the right direction?

*solrindex-mapping file*
<fields>
		<field dest="content" source="content"/>
		<field dest="title" source="title"/>
		<field dest="host" source="host"/>
		<field dest="segment" source="segment"/>
		<field dest="boost" source="boost"/>
		<field dest="digest" source="digest"/>
		<field dest="tstamp" source="tstamp"/>
		<field dest="id" source="url"/>
		<field dest="tagids" source="metatag.tagids" />
		<field dest="system_type" source="metatag.system_type" />
		<field dest="published_date" source="metatag.published_date" />
                <field dest="published_by" source="metatag.published_by" />
                <field dest="begin_date" source="metatag.begin_date" />
                <field dest="end_date" source="metatag.end_date" />
                <field dest="city" source="metatag.city" />
                <field dest="state_name" source="metatag.state_name" />
                <field dest="state_code" source="metatag.state_code" />
		<copyField source="url" dest="url"/>
	</fields>
	<uniqueKey>id</uniqueKey>

*solr schema fields (those that relate to nutch)*
<field name="segment" type="string" stored="true" indexed="false"/>
  <field name="digest" type="string" stored="true" indexed="false"/>
  <field name="boost" type="float" stored="true" indexed="false"/>

   
  
  <field name="host" type="string" stored="false" indexed="true"/>
  <field name="url" type="url" stored="true" indexed="true"
required="false"/>
  <field name="content" type="text_general" stored="true" indexed="true"/>
  <field name="title" type="text_general" stored="true" indexed="true"/>
  <field name="cache" type="string" stored="true" indexed="false"/>
  <field name="tstamp" type="date" stored="true" indexed="false"/>

  
  <field name="anchor" type="string" stored="true" indexed="true"
multiValued="true"/>

  
  <field name="type" type="string" stored="true" indexed="true"
multiValued="true"/>
  <field name="contentLength" type="long" stored="true" indexed="false"/>
  <field name="lastModified" type="date" stored="true" indexed="false"/>
  <field name="date" type="date" stored="true" indexed="true"/>

  
  <field name="lang" type="string" stored="true" indexed="true"/>

  
  <field name="subcollection" type="string" stored="true" indexed="true"
multiValued="true"/>

  
  <field name="author" type="string" stored="true" indexed="true"/>
  <field name="tag" type="string" stored="true" indexed="true"
multiValued="true"/>
  <field name="feed" type="string" stored="true" indexed="true"/>
  <field name="publishedDate" type="date" stored="true" indexed="true"/>
  <field name="updatedDate" type="date" stored="true" indexed="true"/>

  
  <field name="cc" type="string" stored="true" indexed="true"
multiValued="true"/>
      
      
  <field name="tld" type="string" stored="false" indexed="false"/>

  
  <field name="metatag.description" type="text" stored="true"
indexed="true"/>
  <field name="metatag.keywords" type="text" stored="true" indexed="true"
multiValued="true"/>
  <field name="metatag.tagids" type="text" stored="true" indexed="true"
multiValued="true"/>

*nutch schema fields*
    <fields>
        <field name="id" type="string" stored="true" indexed="true"
required="true" />

        
        <field name="segment" type="string" stored="true" indexed="false"/>
        <field name="digest" type="string" stored="true" indexed="false"/>
        <field name="boost" type="float" stored="true" indexed="false"/>

        
        <field name="host" type="string" stored="false" indexed="true"/>
        <field name="url" type="url" stored="true" indexed="true"
            required="true"/>
        <field name="content" type="text_general" stored="true"
indexed="true"/>
        <field name="title" type="text_general" stored="true"
indexed="true"/>
        <field name="cache" type="string" stored="true" indexed="false"/>
        <field name="tstamp" type="date" stored="true" indexed="false"/>

        
        <field name="anchor" type="string" stored="true" indexed="true"
            multiValued="true"/>

        
        <field name="type" type="string" stored="true" indexed="true"
            multiValued="true"/>
        <field name="contentLength" type="long" stored="true"
            indexed="false"/>
        <field name="lastModified" type="date" stored="true"
            indexed="false"/>
        <field name="date" type="date" stored="true" indexed="true"/>

        
        <field name="lang" type="string" stored="true" indexed="true"/>

        
        <field name="subcollection" type="string" stored="true"
            indexed="true" multiValued="true"/>

        
        <field name="author" type="string" stored="true" indexed="true"/>
        <field name="tag" type="string" stored="true" indexed="true"
multiValued="true"/>
        <field name="feed" type="string" stored="true" indexed="true"/>
        <field name="publishedDate" type="date" stored="true"
            indexed="true"/>
        <field name="updatedDate" type="date" stored="true"
            indexed="true"/>

        
        <field name="cc" type="string" stored="true" indexed="true"
            multiValued="true"/>
            
            
        <field name="tld" type="string" stored="false" indexed="false"/>


    </fields>

*hadoop log*
SOLRIndexWriter
	solr.server.url : URL of the SOLR instance (mandatory)
	solr.commit.size : buffer size when sending to SOLR (default 1000)
	solr.mapping.file : name of the mapping file for fields (default
solrindex-mapping.xml)
	solr.auth : use authentication (default false)
	solr.auth.username : use authentication (default false)
	solr.auth : username for authentication
	solr.auth.password : password for authentication


2015-01-23 09:50:12,968 INFO  indexer.IndexerMapReduce - IndexerMapReduce:
crawldb: crawl-20150123094942/crawldb
2015-01-23 09:50:12,968 INFO  indexer.IndexerMapReduce - IndexerMapReduce:
linkdb: crawl-20150123094942/linkdb
2015-01-23 09:50:12,968 INFO  indexer.IndexerMapReduce - IndexerMapReduces:
adding segment:
file:/usr/share/apache-nutch-1.7/crawl-20150123094942/segments/20150123094955
2015-01-23 09:50:13,187 INFO  indexer.IndexingFilters - Adding
org.apache.nutch.indexer.basic.BasicIndexingFilter
2015-01-23 09:50:13,188 INFO  anchor.AnchorIndexingFilter - Anchor
deduplication is: off
2015-01-23 09:50:13,188 INFO  indexer.IndexingFilters - Adding
org.apache.nutch.indexer.anchor.AnchorIndexingFilter
2015-01-23 09:50:13,190 INFO  indexer.IndexingFilters - Adding
org.apache.nutch.indexer.metadata.MetadataIndexer
2015-01-23 09:50:13,268 INFO  indexer.IndexingFilters - Adding
org.apache.nutch.indexer.basic.BasicIndexingFilter
2015-01-23 09:50:13,268 INFO  anchor.AnchorIndexingFilter - Anchor
deduplication is: off
2015-01-23 09:50:13,269 INFO  indexer.IndexingFilters - Adding
org.apache.nutch.indexer.anchor.AnchorIndexingFilter
2015-01-23 09:50:13,269 INFO  indexer.IndexingFilters - Adding
org.apache.nutch.indexer.metadata.MetadataIndexer
2015-01-23 09:50:13,347 INFO  indexer.IndexingFilters - Adding
org.apache.nutch.indexer.basic.BasicIndexingFilter
2015-01-23 09:50:13,347 INFO  anchor.AnchorIndexingFilter - Anchor
deduplication is: off
2015-01-23 09:50:13,347 INFO  indexer.IndexingFilters - Adding
org.apache.nutch.indexer.anchor.AnchorIndexingFilter
2015-01-23 09:50:13,347 INFO  indexer.IndexingFilters - Adding
org.apache.nutch.indexer.metadata.MetadataIndexer
2015-01-23 09:50:13,424 INFO  indexer.IndexingFilters - Adding
org.apache.nutch.indexer.basic.BasicIndexingFilter
2015-01-23 09:50:13,424 INFO  anchor.AnchorIndexingFilter - Anchor
deduplication is: off
2015-01-23 09:50:13,424 INFO  indexer.IndexingFilters - Adding
org.apache.nutch.indexer.anchor.AnchorIndexingFilter
2015-01-23 09:50:13,425 INFO  indexer.IndexingFilters - Adding
org.apache.nutch.indexer.metadata.MetadataIndexer
2015-01-23 09:50:13,502 INFO  indexer.IndexingFilters - Adding
org.apache.nutch.indexer.basic.BasicIndexingFilter
2015-01-23 09:50:13,502 INFO  anchor.AnchorIndexingFilter - Anchor
deduplication is: off
2015-01-23 09:50:13,502 INFO  indexer.IndexingFilters - Adding
org.apache.nutch.indexer.anchor.AnchorIndexingFilter
2015-01-23 09:50:13,502 INFO  indexer.IndexingFilters - Adding
org.apache.nutch.indexer.metadata.MetadataIndexer
2015-01-23 09:50:13,580 INFO  indexer.IndexingFilters - Adding
org.apache.nutch.indexer.basic.BasicIndexingFilter
2015-01-23 09:50:13,580 INFO  anchor.AnchorIndexingFilter - Anchor
deduplication is: off
2015-01-23 09:50:13,580 INFO  indexer.IndexingFilters - Adding
org.apache.nutch.indexer.anchor.AnchorIndexingFilter
2015-01-23 09:50:13,580 INFO  indexer.IndexingFilters - Adding
org.apache.nutch.indexer.metadata.MetadataIndexer
2015-01-23 09:50:13,599 INFO  indexer.IndexingFilters - Adding
org.apache.nutch.indexer.basic.BasicIndexingFilter
2015-01-23 09:50:13,599 INFO  anchor.AnchorIndexingFilter - Anchor
deduplication is: off
2015-01-23 09:50:13,599 INFO  indexer.IndexingFilters - Adding
org.apache.nutch.indexer.anchor.AnchorIndexingFilter
2015-01-23 09:50:13,599 INFO  indexer.IndexingFilters - Adding
org.apache.nutch.indexer.metadata.MetadataIndexer
2015-01-23 09:50:13,600 INFO  indexer.IndexWriters - Adding
org.apache.nutch.indexwriter.solr.SolrIndexWriter
2015-01-23 09:50:13,612 INFO  solr.SolrMappingReader - source: content dest:
content
2015-01-23 09:50:13,612 INFO  solr.SolrMappingReader - source: title dest:
title
2015-01-23 09:50:13,612 INFO  solr.SolrMappingReader - source: host dest:
host
2015-01-23 09:50:13,612 INFO  solr.SolrMappingReader - source: segment dest:
segment
2015-01-23 09:50:13,612 INFO  solr.SolrMappingReader - source: boost dest:
boost
2015-01-23 09:50:13,612 INFO  solr.SolrMappingReader - source: digest dest:
digest
2015-01-23 09:50:13,612 INFO  solr.SolrMappingReader - source: tstamp dest:
tstamp
2015-01-23 09:50:13,612 INFO  solr.SolrMappingReader - source: url dest: id
2015-01-23 09:50:13,612 INFO  solr.SolrMappingReader - source:
metatag.tagids dest: tagids
2015-01-23 09:50:13,612 INFO  solr.SolrMappingReader - source:
metatag.system_type dest: system_type
2015-01-23 09:50:13,612 INFO  solr.SolrMappingReader - source:
metatag.published_date dest: published_date
2015-01-23 09:50:13,612 INFO  solr.SolrMappingReader - source:
metatag.published_by dest: published_by
2015-01-23 09:50:13,612 INFO  solr.SolrMappingReader - source:
metatag.begin_date dest: begin_date
2015-01-23 09:50:13,612 INFO  solr.SolrMappingReader - source:
metatag.end_date dest: end_date
2015-01-23 09:50:13,612 INFO  solr.SolrMappingReader - source: metatag.city
dest: city
2015-01-23 09:50:13,612 INFO  solr.SolrMappingReader - source:
metatag.state_name dest: state_name
2015-01-23 09:50:13,612 INFO  solr.SolrMappingReader - source:
metatag.state_code dest: state_code
2015-01-23 09:50:13,612 INFO  solr.SolrMappingReader - source: url dest: url
2015-01-23 09:50:14,091 INFO  solr.SolrMappingReader - source: content dest:
content
2015-01-23 09:50:14,092 INFO  solr.SolrMappingReader - source: title dest:
title
2015-01-23 09:50:14,092 INFO  solr.SolrMappingReader - source: host dest:
host
2015-01-23 09:50:14,092 INFO  solr.SolrMappingReader - source: segment dest:
segment
2015-01-23 09:50:14,092 INFO  solr.SolrMappingReader - source: boost dest:
boost
2015-01-23 09:50:14,092 INFO  solr.SolrMappingReader - source: digest dest:
digest
2015-01-23 09:50:14,092 INFO  solr.SolrMappingReader - source: tstamp dest:
tstamp
2015-01-23 09:50:14,092 INFO  solr.SolrMappingReader - source: url dest: id
2015-01-23 09:50:14,092 INFO  solr.SolrMappingReader - source:
metatag.tagids dest: tagids
2015-01-23 09:50:14,092 INFO  solr.SolrMappingReader - source:
metatag.system_type dest: system_type
2015-01-23 09:50:14,092 INFO  solr.SolrMappingReader - source:
metatag.published_date dest: published_date
2015-01-23 09:50:14,092 INFO  solr.SolrMappingReader - source:
metatag.published_by dest: published_by
2015-01-23 09:50:14,092 INFO  solr.SolrMappingReader - source:
metatag.begin_date dest: begin_date
2015-01-23 09:50:14,093 INFO  solr.SolrMappingReader - source:
metatag.end_date dest: end_date
2015-01-23 09:50:14,093 INFO  solr.SolrMappingReader - source: metatag.city
dest: city
2015-01-23 09:50:14,093 INFO  solr.SolrMappingReader - source:
metatag.state_name dest: state_name
2015-01-23 09:50:14,093 INFO  solr.SolrMappingReader - source:
metatag.state_code dest: state_code
2015-01-23 09:50:14,093 INFO  solr.SolrMappingReader - source: url dest: url
2015-01-23 09:50:14,174 INFO  indexer.IndexingJob - Indexer: finished at
2015-01-23 09:50:14, elapsed: 00:00:01
2015-01-23 09:50:14,177 INFO  solr.SolrDeleteDuplicates -
SolrDeleteDuplicates: starting at 2015-01-23 09:50:14
2015-01-23 09:50:14,177 INFO  solr.SolrDeleteDuplicates -
SolrDeleteDuplicates: Solr url: http://10.0.51.35:8080/solr/core0/
2015-01-23 09:50:14,712 WARN  mapred.FileOutputCommitter - Output path is
null in cleanup
2015-01-23 09:50:14,713 WARN  mapred.LocalJobRunner - job_local85452124_0011
java.lang.Exception: java.lang.NullPointerException
	at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:354)
Caused by: java.lang.NullPointerException
	at org.apache.hadoop.io.Text.encode(Text.java:388)
	at org.apache.hadoop.io.Text.set(Text.java:178)
	at
org.apache.nutch.indexer.solr.SolrDeleteDuplicates$SolrInputFormat$1.next(SolrDeleteDuplicates.java:270)
	at
org.apache.nutch.indexer.solr.SolrDeleteDuplicates$SolrInputFormat$1.next(SolrDeleteDuplicates.java:241)
	at
org.apache.hadoop.mapred.MapTask$TrackedRecordReader.moveToNext(MapTask.java:230)
	at
org.apache.hadoop.mapred.MapTask$TrackedRecordReader.next(MapTask.java:210)
	at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:48)
	at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:430)
	at org.apache.hadoop.mapred.MapTask.run(MapTask.java:366)
	at
org.apache.hadoop.mapred.LocalJobRunner$Job$MapTaskRunnable.run(LocalJobRunner.java:223)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
	at java.util.concurrent.FutureTask$Sync.innerRun(FutureTask.java:334)
	at java.util.concurrent.FutureTask.run(FutureTask.java:166)
	at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1146)
	at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
	at java.lang.Thread.run(Thread.java:701)



--
View this message in context: http://lucene.472066.n3.nabble.com/Nutch-1-7-indexing-throws-exception-java-lang-Exception-java-lang-NullPointerException-tp4181531.html
Sent from the Nutch - User mailing list archive at Nabble.com.