You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@nutch.apache.org by "Arthur.hk.chan@gmail.com" <ar...@gmail.com> on 2014/12/03 00:03:00 UTC

org.apache.solr.common.SolrException, unknown field 'host'

Hi,

I am new to Nutch and Solr, please help!!

I am using Nutch-1.9, Solr 4.10.2 and Hadoop 2.4.1
I always get org.apache.solr.common.SolrException: unknown field 'host'. What could be wrong?

The schema.xml has <field name="host" type="string" stored="false" indexed="true"/>
I have already copied the [nutch] schema.xml over the [solr] schema.xml and restarted Solr,
but during the indexing phase Nutch still returns the "unknown field 'host'" error.

Below are my settings. What would be wrong? 

Regards
Arthur




input_url/seed.txt 
http://nutch.apache.org/



conf/solrindex-mapping.xml
<?xml version="1.0" encoding="UTF-8"?>
<mapping>
	<fields>
		<field dest="content" source="content"/>
		<field dest="title" source="title"/>
		<field dest="host" source="host"/>
		<field dest="segment" source="segment"/>
		<field dest="boost" source="boost"/>
		<field dest="digest" source="digest"/>
		<field dest="tstamp" source="tstamp"/>
                <field dest="subject" source="subject"/>
                <field dest="description" source="description"/>
                <field dest="comments" source="comments"/>
                <field dest="author" source="author"/>
                <field dest="keywords" source="keywords"/>
                <field dest="category" source="category"/> 
                <field dest="lastModified" source="lastModified"/>
	</fields>
	<uniqueKey>id</uniqueKey>
</mapping>



conf/regex-urlfilter.txt
# The default url filter.
# Better for whole-internet crawling.
# skip file: ftp: and mailto: urls
-^(file|ftp|mailto):
# skip image and other suffixes we can't yet parse
# for a more extensive coverage use the urlfilter-suffix plugin
-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$
# skip URLs containing certain characters as probable queries, etc.
-[?*!@=]
# skip URLs with slash-delimited segment that repeats 3+ times, to break loops
-.*(/[^/]+)/[^/]+\1/[^/]+\1/
# accept anything else
#+.
+^http://([a-z0-9]*\.)*nutch.apache.org/



conf/nutch-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
 <property>
  <name>http.agent.name</name>
  <value>MyBot</value>
 </property>
 <property>
  <name>http.robots.agents</name>
  <value>MyBot,*</value>
 </property>
 <property>
  <name>fetcher.store.content</name>
  <value>true</value>
 </property>
 <property>
  <name>fetcher.max.crawl.delay</name>
  <value>-1</value>
 </property>
  <property>
  <name>plugin.includes</name>
  <value>protocol-http|urlfilter-regex|parse-(html|tika|metatags)|index-(basic|anchor|metadata)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|indexer-solr|urlnormalizer-(pass|regex|basic)</value>
 </property>
 <property>
  <name>mapred.temp.dir</name>
  <value>/tmp</value>
 </property>
<property>
<name>metatags.names</name>
<value>metatag.keywords;metatag.description</value>
</property>
<property>
  <name>index.parse.md</name>
  <value>metatag.description,metatag.keywords</value>
</property>
</configuration>



solr/collection1/conf/schema.xml
<?xml version="1.0" encoding="UTF-8" ?>
<schema name="nutch" version="1.5">
    <types>
        <fieldType name="string" class="solr.StrField" sortMissingLast="true"
            omitNorms="true"/> 
        <fieldType name="long" class="solr.TrieLongField" precisionStep="0"
            omitNorms="true" positionIncrementGap="0"/>
        <fieldType name="float" class="solr.TrieFloatField" precisionStep="0"
            omitNorms="true" positionIncrementGap="0"/>
        <fieldType name="date" class="solr.TrieDateField" precisionStep="0"
            omitNorms="true" positionIncrementGap="0"/>

        <fieldType name="text" class="solr.TextField"
            positionIncrementGap="100">
            <analyzer>
                <tokenizer class="solr.WhitespaceTokenizerFactory"/>
                <filter class="solr.StopFilterFactory"
                    ignoreCase="true" words="stopwords.txt"/>
                <filter class="solr.WordDelimiterFilterFactory"
                    generateWordParts="1" generateNumberParts="1"
                    catenateWords="1" catenateNumbers="1" catenateAll="0"
                    splitOnCaseChange="1"/>
                <filter class="solr.LowerCaseFilterFactory"/>
               <!--  <filter class="solr.EnglishPorterFilterFactory"
                    protected="protwords.txt"/> -->
                <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
            </analyzer>
        </fieldType>
        <fieldType name="url" class="solr.TextField"
            positionIncrementGap="100">
            <analyzer>
                <tokenizer class="solr.StandardTokenizerFactory"/>
                <filter class="solr.LowerCaseFilterFactory"/>
                <filter class="solr.WordDelimiterFilterFactory"
                    generateWordParts="1" generateNumberParts="1"/>
            </analyzer>
        </fieldType>
    </types>
    <fields>

      <field name="_root_" type="string" indexed="true" stored="false"/>
        <field name="id" type="string" stored="true" indexed="true" required="true"/>

        <!-- core fields -->
        <field name="_version_" type="long" indexed="true" stored="true"/>
        <field name="host" type="string" stored="false" indexed="true"/>   
        <field name="digest" type="string" stored="true" indexed="false"/>   
        <field name="segment" type="string" stored="true" indexed="false"/>   
        <field name="boost" type="float" stored="true" indexed="false"/>   
        <field name="tstamp" type="date" stored="true" indexed="false"/>  

        <field name="url" type="text" indexed="true" stored="true" required="true"/> 
        <field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/>
        <field name="last_modified" type="date" indexed="true" stored="true"/>
        <field name="links" type="string" indexed="true" stored="true" multiValued="true"/>

        <!-- fields for the metatags plugin -->
        <field name="title" type="text" indexed="true" stored="true" multiValued="true"/>
        <field name="subject" type="text" indexed="true" stored="true"/>
        <field name="description" type="text" stored="true" indexed="true"/>
        <field name="comments" type="text" indexed="true" stored="true"/>
        <field name="author" type="text" indexed="true" stored="true"/>
        <field name="keywords" type="text" stored="true" indexed="true"/>
        <field name="category" type="text" indexed="true" stored="true"/>
        <field name="resourcename" type="text" indexed="true" stored="true"/>

        <!-- fields for index-basic plugin -->
        <field name="content" type="text" indexed="true" stored="true" multiValued="true"/> 
        <field name="title" type="text" stored="true" indexed="true"/>
        <field name="cache" type="string" stored="true" indexed="false"/>

        <!-- fields for index-anchor plugin -->
        <field name="anchor" type="string" stored="true" indexed="true"
            multiValued="true"/>

        <!-- fields for index-more plugin -->
        <field name="type" type="string" stored="true" indexed="true"
            multiValued="true"/>
        <field name="contentLength" type="long" stored="true"
            indexed="false"/>
        <field name="lastModified" type="date" stored="true"
            indexed="false"/>
        <field name="date" type="date" stored="true" indexed="true"/>

        <!-- fields for languageidentifier plugin -->
        <field name="lang" type="string" stored="true" indexed="true"/>

        <!-- fields for subcollection plugin -->
        <field name="subcollection" type="string" stored="true"
            indexed="true" multiValued="true"/>

        <!-- fields for feed plugin (tag is also used by microformats-reltag)-->
        <field name="author" type="string" stored="true" indexed="true"/>
        <field name="tag" type="string" stored="true" indexed="true" multiValued="true"/>
        <field name="feed" type="string" stored="true" indexed="true"/>
        <field name="publishedDate" type="date" stored="true"
            indexed="true"/>
        <field name="updatedDate" type="date" stored="true"
            indexed="true"/>

        <!-- fields for creativecommons plugin -->
        <field name="cc" type="string" stored="true" indexed="true"
            multiValued="true"/>
            
        <!-- fields for tld plugin -->    
        <field name="tld" type="string" stored="false" indexed="false"/>
    </fields>
    <!--<uniqueKey>id</uniqueKey> -->
    <uniqueKey>url</uniqueKey>
    <defaultSearchField>content</defaultSearchField>
    <solrQueryParser defaultOperator="OR"/>
</schema>





Nutch log:
2014-12-03 06:53:12,513 INFO  crawl.LinkDb - LinkDb: starting at 2014-12-03 06:53:12
2014-12-03 06:53:12,513 INFO  crawl.LinkDb - LinkDb: linkdb: output_url/linkdb
2014-12-03 06:53:12,513 INFO  crawl.LinkDb - LinkDb: URL normalize: true
2014-12-03 06:53:12,513 INFO  crawl.LinkDb - LinkDb: URL filter: true
2014-12-03 06:53:12,513 INFO  crawl.LinkDb - LinkDb: internal links will be ignored.
2014-12-03 06:53:12,513 INFO  crawl.LinkDb - LinkDb: adding segment: output_url/segments/20141203064933
2014-12-03 06:53:13,688 INFO  crawl.LinkDb - LinkDb: merging with existing linkdb: output_url/linkdb
2014-12-03 06:53:14,755 INFO  crawl.LinkDb - LinkDb: finished at 2014-12-03 06:53:14, elapsed: 00:00:02
2014-12-03 06:53:15,085 INFO  crawl.DeduplicationJob - DeduplicationJob: starting at 2014-12-03 06:53:15
2014-12-03 06:53:15,204 WARN  util.NativeCodeLoader - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2014-12-03 06:53:16,365 INFO  crawl.DeduplicationJob - Deduplication: 0 documents marked as duplicates
2014-12-03 06:53:16,365 INFO  crawl.DeduplicationJob - Deduplication: Updating status of duplicate urls into crawl db.
2014-12-03 06:53:17,466 INFO  crawl.DeduplicationJob - Deduplication finished at 2014-12-03 06:53:17, elapsed: 00:00:02
2014-12-03 06:53:17,819 INFO  indexer.IndexingJob - Indexer: starting at 2014-12-03 06:53:17
2014-12-03 06:53:17,859 INFO  indexer.IndexingJob - Indexer: deleting gone documents: false
2014-12-03 06:53:17,860 INFO  indexer.IndexingJob - Indexer: URL filtering: false
2014-12-03 06:53:17,860 INFO  indexer.IndexingJob - Indexer: URL normalizing: false
2014-12-03 06:53:17,969 INFO  indexer.IndexWriters - Adding org.apache.nutch.indexwriter.solr.SolrIndexWriter
2014-12-03 06:53:17,969 INFO  indexer.IndexingJob - Active IndexWriters :
SOLRIndexWriter
	solr.server.url : URL of the SOLR instance (mandatory)
	solr.commit.size : buffer size when sending to SOLR (default 1000)
	solr.mapping.file : name of the mapping file for fields (default solrindex-mapping.xml)
	solr.auth : use authentication (default false)
	solr.auth.username : use authentication (default false)
	solr.auth : username for authentication
	solr.auth.password : password for authentication
2014-12-03 06:53:17,971 INFO  indexer.IndexerMapReduce - IndexerMapReduce: crawldb: output_url/crawldb
2014-12-03 06:53:17,971 INFO  indexer.IndexerMapReduce - IndexerMapReduce: linkdb: output_url/linkdb
2014-12-03 06:53:17,971 INFO  indexer.IndexerMapReduce - IndexerMapReduces: adding segment: output_url/segments/20141203064933
2014-12-03 06:53:18,038 WARN  util.NativeCodeLoader - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2014-12-03 06:53:18,273 INFO  anchor.AnchorIndexingFilter - Anchor deduplication is: off
2014-12-03 06:53:18,657 INFO  indexer.IndexWriters - Adding org.apache.nutch.indexwriter.solr.SolrIndexWriter
2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: content dest: content
2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: title dest: title
2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: host dest: host
2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: segment dest: segment
2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: boost dest: boost
2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: digest dest: digest
2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: tstamp dest: tstamp
2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: subject dest: subject
2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: description dest: description
2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: comments dest: comments
2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: author dest: author
2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: keywords dest: keywords
2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: category dest: category
2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: lastModified dest: lastModified
2014-12-03 06:53:18,742 INFO  solr.SolrIndexWriter - Indexing 39 documents
2014-12-03 06:53:18,793 INFO  solr.SolrIndexWriter - Indexing 39 documents
2014-12-03 06:53:18,805 WARN  mapred.LocalJobRunner - job_local637755932_0001
org.apache.solr.common.SolrException: Bad Request
Bad Request
request: http://192.168.0.1:8983/solr/update?wt=javabin&version=2
	at org.apache.solr.client.solrj.impl.CommonsHttpSolrServer.request(CommonsHttpSolrServer.java:430)
	at org.apache.solr.client.solrj.impl.CommonsHttpSolrServer.request(CommonsHttpSolrServer.java:244)
	at org.apache.solr.client.solrj.request.AbstractUpdateRequest.process(AbstractUpdateRequest.java:105)
	at org.apache.nutch.indexwriter.solr.SolrIndexWriter.close(SolrIndexWriter.java:155)
	at org.apache.nutch.indexer.IndexWriters.close(IndexWriters.java:118)
	at org.apache.nutch.indexer.IndexerOutputFormat$1.close(IndexerOutputFormat.java:44)
	at org.apache.hadoop.mapred.ReduceTask$OldTrackingRecordWriter.close(ReduceTask.java:467)
	at org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:535)
	at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:421)
	at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:398)
2014-12-03 06:53:19,196 ERROR indexer.IndexingJob - Indexer: java.io.IOException: Job failed!
	at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:1357)
	at org.apache.nutch.indexer.IndexingJob.index(IndexingJob.java:114)
	at org.apache.nutch.indexer.IndexingJob.run(IndexingJob.java:176)
	at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
	at org.apache.nutch.indexer.IndexingJob.main(IndexingJob.java:186)




Solr log: logs/solr.log
INFO  - 2014-12-03 06:53:18.776; org.apache.solr.update.processor.LogUpdateProcessor; [collection1] webapp=/solr path=/update params={wt=javabin&version=2} {} 0 0
ERROR - 2014-12-03 06:53:18.777; org.apache.solr.common.SolrException; org.apache.solr.common.SolrException: ERROR: [doc=http://nutch.apache.org/apidocs/apidocs-1.1/allclasses-frame.html] unknown field 'host'
	at org.apache.solr.update.DocumentBuilder.toDocument(DocumentBuilder.java:185)
	at org.apache.solr.update.AddUpdateCommand.getLuceneDocument(AddUpdateCommand.java:78)
	at org.apache.solr.update.DirectUpdateHandler2.addDoc0(DirectUpdateHandler2.java:238)
	at org.apache.solr.update.DirectUpdateHandler2.addDoc(DirectUpdateHandler2.java:164)
	at org.apache.solr.update.processor.RunUpdateProcessor.processAdd(RunUpdateProcessorFactory.java:69)
	at org.apache.solr.update.processor.UpdateRequestProcessor.processAdd(UpdateRequestProcessor.java:51)
	at org.apache.solr.update.processor.DistributedUpdateProcessor.doLocalAdd(DistributedUpdateProcessor.java:926)
	at org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:1080)
	at org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:692)
	at org.apache.solr.update.processor.LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:100)
	at org.apache.solr.handler.loader.XMLLoader.processUpdate(XMLLoader.java:247)
	at org.apache.solr.handler.loader.XMLLoader.load(XMLLoader.java:174)
	at org.apache.solr.handler.UpdateRequestHandler$1.load(UpdateRequestHandler.java:99)
	at org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74)
	at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
	at org.apache.solr.core.SolrCore.execute(SolrCore.java:1967)
	at org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:777)
	at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:418)
	at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:207)
	at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1419)
	at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:455)
	at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
	at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:557)
	at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
	at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1075)
	at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:384)
	at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
	at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1009)
	at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
	at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
	at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
	at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
	at org.eclipse.jetty.server.Server.handle(Server.java:368)
	at org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:489)
	at org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
	at org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:953)
	at org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:1014)
	at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:953)
	at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
	at org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
	at org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
	at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
	at org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
	at java.lang.Thread.run(Thread.java:745)
INFO  - 2014-12-03 06:53:18.796; org.apache.solr.update.processor.LogUpdateProcessor; [collection1] webapp=/solr path=/update params={wt=javabin&version=2} {} 0 0
ERROR - 2014-12-03 06:53:18.797; org.apache.solr.common.SolrException; org.apache.solr.common.SolrException: ERROR: [doc=http://nutch.apache.org/apidocs/apidocs-1.1/allclasses-frame.html] unknown field 'host'
	at org.apache.solr.update.DocumentBuilder.toDocument(DocumentBuilder.java:185)
	at org.apache.solr.update.AddUpdateCommand.getLuceneDocument(AddUpdateCommand.java:78)
	at org.apache.solr.update.DirectUpdateHandler2.addDoc0(DirectUpdateHandler2.java:238)
	at org.apache.solr.update.DirectUpdateHandler2.addDoc(DirectUpdateHandler2.java:164)
	at org.apache.solr.update.processor.RunUpdateProcessor.processAdd(RunUpdateProcessorFactory.java:69)
	at org.apache.solr.update.processor.UpdateRequestProcessor.processAdd(UpdateRequestProcessor.java:51)
	at org.apache.solr.update.processor.DistributedUpdateProcessor.doLocalAdd(DistributedUpdateProcessor.java:926)
	at org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:1080)
	at org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:692)
	at org.apache.solr.update.processor.LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:100)
	at org.apache.solr.handler.loader.XMLLoader.processUpdate(XMLLoader.java:247)
	at org.apache.solr.handler.loader.XMLLoader.load(XMLLoader.java:174)
	at org.apache.solr.handler.UpdateRequestHandler$1.load(UpdateRequestHandler.java:99)
	at org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74)
	at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
	at org.apache.solr.core.SolrCore.execute(SolrCore.java:1967)
	at org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:777)
	at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:418)
	at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:207)
	at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1419)
	at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:455)
	at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
	at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:557)
	at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
	at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1075)
	at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:384)
	at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
	at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1009)
	at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
	at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
	at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
	at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
	at org.eclipse.jetty.server.Server.handle(Server.java:368)
	at org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:489)
	at org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
	at org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:953)
	at org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:1014)
	at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:953)
	at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
	at org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
	at org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
	at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
	at org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
	at java.lang.Thread.run(Thread.java:745)



Re: org.apache.solr.common.SolrException, unknown field 'host'

Posted by "Arthur.hk.chan@gmail.com" <ar...@gmail.com>.
Hi,


In my Solr's conf/schema.xml, I have added
>> <field name="host" type="string" stored="false" indexed="true"/>  



I have tried to query the field 'host':
curl http://localhost:8983/solr/collection1/schema/fields/host?wt=json
27420 [qtp1115092137-12] INFO  org.apache.solr.rest.schema.FieldResource  – [collection1] webapp=/solr path=/schema/fields/host params={wt=json} msg={Field 'host' not found.} status=404 QTime=1 
Dec 04, 2014 7:46:41 PM org.restlet.engine.log.LogFilter afterHandle
INFO: 2014-12-04	19:46:41	0:0:0:0:0:0:0:1	-	0:0:0:0:0:0:0:1	8983	GET	/solr/schema/fields/host	wt=json	404	-	0	1	http://localhost:8983	curl/7.19.7 (x86_64-redhat-linux-gnu) libcurl/7.19.7 NSS/3.16.1 Basic ECC zlib/1.2.3 libidn/1.18 libssh2/1.4.2	-
{
  "responseHeader":{
    "status":404,
    "QTime":1},
  "error":{
    "msg":"Field 'host' not found.",
    "code":404}}

Any idea why the field ‘host’ is not loaded by SOLR?

Regards
Arthur


On 3 Dec, 2014, at 8:33 pm, Arthur.hk.chan@gmail.com <ar...@gmail.com> wrote:

> can anyone help ?
> 
> On 3 Dec, 2014, at 7:03 am, Arthur.hk.chan@gmail.com <ar...@gmail.com> wrote:
> 
>> Hi,
>> 
>> I am new to Nutch and Solr, please help!!
>> 
>> I am using Nutch-1.9, Solr 4.10.2 and Hadoop 2.4.1
>> I always get  org.apache.solr.common.SolrException, unknown field ‘host’, what would be wrong?
>> 
>> The schema.xml has <field name="host" type="string" stored="false" indexed="true"/>
>> I have already copied [nutch] schema.xml to [solr] schema.xml, restarted solr, 
>> From the indexing phase, Nutch always returns "unknown field ‘host’” error,  
>> 
>> Below are my settings. What would be wrong? 
>> 
>> Regards
>> Arthur
>> 
>> 
>> 
>> 
>> input_url/seed.txt 
>> http://nutch.apache.org/
>> 
>> 
>> 
>> conf/solrindex-mapping.xml
>> <?xml version="1.0" encoding="UTF-8"?>
>> <mapping>
>> 	<fields>
>> 		<field dest="content" source="content"/>
>> 		<field dest="title" source="title"/>
>> 		<field dest="host" source="host"/>
>> 		<field dest="segment" source="segment"/>
>> 		<field dest="boost" source="boost"/>
>> 		<field dest="digest" source="digest"/>
>> 		<field dest="tstamp" source="tstamp"/>
>>                 <field dest="subject" source="subject"/>
>>                 <field dest="description" source="description"/>
>>                 <field dest="comments" source="comments"/>
>>                 <field dest="author" source="author"/>
>>                 <field dest="keywords" source="keywords"/>
>>                 <field dest="category" source="category"/> 
>>                 <field dest="lastModified" source="lastModified"/>
>> 	</fields>
>> 	<uniqueKey>id</uniqueKey>
>> </mapping>
>> 
>> 
>> 
>> conf/regex-urlfilter.txt
>> # The default url filter.
>> # Better for whole-internet crawling.
>> # skip file: ftp: and mailto: urls
>> -^(file|ftp|mailto):
>> # skip image and other suffixes we can't yet parse
>> # for a more extensive coverage use the urlfilter-suffix plugin
>> -\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$
>> # skip URLs containing certain characters as probable queries, etc.
>> -[?*!@=]
>> # skip URLs with slash-delimited segment that repeats 3+ times, to break loops
>> -.*(/[^/]+)/[^/]+\1/[^/]+\1/
>> # accept anything else
>> #+.
>> +^http://([a-z0-9]*\.)*nutch.apache.org/
>> 
>> 
>> 
>> conf/nutch-site.xml
>> <?xml version="1.0"?>
>> <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
>> <!-- Put site-specific property overrides in this file. -->
>> <configuration>
>>  <property>
>>   <name>http.agent.name</name>
>>   <value>MyBot</value>
>>  </property>
>>  <property>
>>   <name>http.robots.agents</name>
>>   <value>MyBot,*</value>
>>  </property>
>>  <property>
>>   <name>fetcher.store.content</name>
>>   <value>true</value>
>>  </property>
>>  <property>
>>   <name>fetcher.max.crawl.delay</name>
>>   <value>-1</value>
>>  </property>
>>   <property>
>>   <name>plugin.includes</name>
>>   <value>protocol-http|urlfilter-regex|parse-(html|tika|metatags)|index-(basic|anchor|metadata)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|indexer-solr|urlnormalizer-(pass|regex|basic)</value>
>>  </property>
>>  <property>
>>   <name>mapred.temp.dir</name>
>>   <value>/tmp</value>
>>  </property>
>> <property>
>> <name>metatags.names</name>
>> <value>metatag.keywords;metatag.description</value>
>> </property>
>> <property>
>>   <name>index.parse.md</name>
>>   <value>metatag.description,metatag.keywords</value>
>> </property>
>> </configuration>
>> 
>> 
>> 
>> solr/collection1/conf/schema.xml
>> <?xml version="1.0" encoding="UTF-8" ?>
>> <schema name="nutch" version="1.5">
>>     <types>
>>         <fieldType name="string" class="solr.StrField" sortMissingLast="true"
>>             omitNorms="true"/> 
>>         <fieldType name="long" class="solr.TrieLongField" precisionStep="0"
>>             omitNorms="true" positionIncrementGap="0"/>
>>         <fieldType name="float" class="solr.TrieFloatField" precisionStep="0"
>>             omitNorms="true" positionIncrementGap="0"/>
>>         <fieldType name="date" class="solr.TrieDateField" precisionStep="0"
>>             omitNorms="true" positionIncrementGap="0"/>
>> 
>>         <fieldType name="text" class="solr.TextField"
>>             positionIncrementGap="100">
>>             <analyzer>
>>                 <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>>                 <filter class="solr.StopFilterFactory"
>>                     ignoreCase="true" words="stopwords.txt"/>
>>                 <filter class="solr.WordDelimiterFilterFactory"
>>                     generateWordParts="1" generateNumberParts="1"
>>                     catenateWords="1" catenateNumbers="1" catenateAll="0"
>>                     splitOnCaseChange="1"/>
>>                 <filter class="solr.LowerCaseFilterFactory"/>
>>                <!--  <filter class="solr.EnglishPorterFilterFactory"
>>                     protected="protwords.txt"/> -->
>>                 <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
>>             </analyzer>
>>         </fieldType>
>>         <fieldType name="url" class="solr.TextField"
>>             positionIncrementGap="100">
>>             <analyzer>
>>                 <tokenizer class="solr.StandardTokenizerFactory"/>
>>                 <filter class="solr.LowerCaseFilterFactory"/>
>>                 <filter class="solr.WordDelimiterFilterFactory"
>>                     generateWordParts="1" generateNumberParts="1"/>
>>             </analyzer>
>>         </fieldType>
>>     </types>
>>     <fields>
>> 
>>       <field name="_root_" type="string" indexed="true" stored="false"/>
>>         <field name="id" type="string" stored="true" indexed="true" required="true"/>
>> 
>>         <!-- core fields -->
>>         <field name="_version_" type="long" indexed="true" stored="true"/>
>>         <field name="host" type="string" stored="false" indexed="true"/>   
>>         <field name="digest" type="string" stored="true" indexed="false"/>   
>>         <field name="segment" type="string" stored="true" indexed="false"/>   
>>         <field name="boost" type="float" stored="true" indexed="false"/>   
>>         <field name="tstamp" type="date" stored="true" indexed="false"/>  
>> 
>>         <field name="url" type="text" indexed="true" stored="true" required="true"/> 
>>         <field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/>
>>         <field name="last_modified" type="date" indexed="true" stored="true"/>
>>         <field name="links" type="string" indexed="true" stored="true" multiValued="true"/>
>> 
>>         <!-- fields for the metatags plugin -->
>>         <field name="title" type="text" indexed="true" stored="true" multiValued="true"/>
>>         <field name="subject" type="text" indexed="true" stored="true"/>
>>         <field name="description" type="text" stored="true" indexed="true"/>
>>         <field name="comments" type="text" indexed="true" stored="true"/>
>>         <field name="author" type="text" indexed="true" stored="true"/>
>>         <field name="keywords" type="text" stored="true" indexed="true"/>
>>         <field name="category" type="text" indexed="true" stored="true"/>
>>         <field name="resourcename" type="text" indexed="true" stored="true"/>
>> 
>>         <!-- fields for index-basic plugin -->
>>         <field name="content" type="text" indexed="true" stored="true" multiValued="true"/> 
>>         <field name="title" type="text" stored="true" indexed="true"/>
>>         <field name="cache" type="string" stored="true" indexed="false"/>
>> 
>>         <!-- fields for index-anchor plugin -->
>>         <field name="anchor" type="string" stored="true" indexed="true"
>>             multiValued="true"/>
>> 
>>         <!-- fields for index-more plugin -->
>>         <field name="type" type="string" stored="true" indexed="true"
>>             multiValued="true"/>
>>         <field name="contentLength" type="long" stored="true"
>>             indexed="false"/>
>>         <field name="lastModified" type="date" stored="true"
>>             indexed="false"/>
>>         <field name="date" type="date" stored="true" indexed="true"/>
>> 
>>         <!-- fields for languageidentifier plugin -->
>>         <field name="lang" type="string" stored="true" indexed="true"/>
>> 
>>         <!-- fields for subcollection plugin -->
>>         <field name="subcollection" type="string" stored="true"
>>             indexed="true" multiValued="true"/>
>> 
>>         <!-- fields for feed plugin (tag is also used by microformats-reltag)-->
>>         <field name="author" type="string" stored="true" indexed="true"/>
>>         <field name="tag" type="string" stored="true" indexed="true" multiValued="true"/>
>>         <field name="feed" type="string" stored="true" indexed="true"/>
>>         <field name="publishedDate" type="date" stored="true"
>>             indexed="true"/>
>>         <field name="updatedDate" type="date" stored="true"
>>             indexed="true"/>
>> 
>>         <!-- fields for creativecommons plugin -->
>>         <field name="cc" type="string" stored="true" indexed="true"
>>             multiValued="true"/>
>>             
>>         <!-- fields for tld plugin -->    
>>         <field name="tld" type="string" stored="false" indexed="false"/>
>>     </fields>
>>     <!--<uniqueKey>id</uniqueKey> -->
>>     <uniqueKey>url</uniqueKey>
>>     <defaultSearchField>content</defaultSearchField>
>>     <solrQueryParser defaultOperator="OR"/>
>> </schema>
>> 
>> 
>> 
>> 
>> 
>> Nutch log:
>> 2014-12-03 06:53:12,513 INFO  crawl.LinkDb - LinkDb: starting at 2014-12-03 06:53:12
>> 2014-12-03 06:53:12,513 INFO  crawl.LinkDb - LinkDb: linkdb: output_url/linkdb
>> 2014-12-03 06:53:12,513 INFO  crawl.LinkDb - LinkDb: URL normalize: true
>> 2014-12-03 06:53:12,513 INFO  crawl.LinkDb - LinkDb: URL filter: true
>> 2014-12-03 06:53:12,513 INFO  crawl.LinkDb - LinkDb: internal links will be ignored.
>> 2014-12-03 06:53:12,513 INFO  crawl.LinkDb - LinkDb: adding segment: output_url/segments/20141203064933
>> 2014-12-03 06:53:13,688 INFO  crawl.LinkDb - LinkDb: merging with existing linkdb: output_url/linkdb
>> 2014-12-03 06:53:14,755 INFO  crawl.LinkDb - LinkDb: finished at 2014-12-03 06:53:14, elapsed: 00:00:02
>> 2014-12-03 06:53:15,085 INFO  crawl.DeduplicationJob - DeduplicationJob: starting at 2014-12-03 06:53:15
>> 2014-12-03 06:53:15,204 WARN  util.NativeCodeLoader - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
>> 2014-12-03 06:53:16,365 INFO  crawl.DeduplicationJob - Deduplication: 0 documents marked as duplicates
>> 2014-12-03 06:53:16,365 INFO  crawl.DeduplicationJob - Deduplication: Updating status of duplicate urls into crawl db.
>> 2014-12-03 06:53:17,466 INFO  crawl.DeduplicationJob - Deduplication finished at 2014-12-03 06:53:17, elapsed: 00:00:02
>> 2014-12-03 06:53:17,819 INFO  indexer.IndexingJob - Indexer: starting at 2014-12-03 06:53:17
>> 2014-12-03 06:53:17,859 INFO  indexer.IndexingJob - Indexer: deleting gone documents: false
>> 2014-12-03 06:53:17,860 INFO  indexer.IndexingJob - Indexer: URL filtering: false
>> 2014-12-03 06:53:17,860 INFO  indexer.IndexingJob - Indexer: URL normalizing: false
>> 2014-12-03 06:53:17,969 INFO  indexer.IndexWriters - Adding org.apache.nutch.indexwriter.solr.SolrIndexWriter
>> 2014-12-03 06:53:17,969 INFO  indexer.IndexingJob - Active IndexWriters :
>> SOLRIndexWriter
>> 	solr.server.url : URL of the SOLR instance (mandatory)
>> 	solr.commit.size : buffer size when sending to SOLR (default 1000)
>> 	solr.mapping.file : name of the mapping file for fields (default solrindex-mapping.xml)
>> 	solr.auth : use authentication (default false)
>> 	solr.auth.username : use authentication (default false)
>> 	solr.auth : username for authentication
>> 	solr.auth.password : password for authentication
>> 2014-12-03 06:53:17,971 INFO  indexer.IndexerMapReduce - IndexerMapReduce: crawldb: output_url/crawldb
>> 2014-12-03 06:53:17,971 INFO  indexer.IndexerMapReduce - IndexerMapReduce: linkdb: output_url/linkdb
>> 2014-12-03 06:53:17,971 INFO  indexer.IndexerMapReduce - IndexerMapReduces: adding segment: output_url/segments/20141203064933
>> 2014-12-03 06:53:18,038 WARN  util.NativeCodeLoader - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
>> 2014-12-03 06:53:18,273 INFO  anchor.AnchorIndexingFilter - Anchor deduplication is: off
>> 2014-12-03 06:53:18,657 INFO  indexer.IndexWriters - Adding org.apache.nutch.indexwriter.solr.SolrIndexWriter
>> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: content dest: content
>> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: title dest: title
>> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: host dest: host
>> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: segment dest: segment
>> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: boost dest: boost
>> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: digest dest: digest
>> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: tstamp dest: tstamp
>> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: subject dest: subject
>> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: description dest: description
>> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: comments dest: comments
>> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: author dest: author
>> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: keywords dest: keywords
>> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: category dest: category
>> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: lastModified dest: lastModified
>> 2014-12-03 06:53:18,742 INFO  solr.SolrIndexWriter - Indexing 39 documents
>> 2014-12-03 06:53:18,793 INFO  solr.SolrIndexWriter - Indexing 39 documents
>> 2014-12-03 06:53:18,805 WARN  mapred.LocalJobRunner - job_local637755932_0001
>> org.apache.solr.common.SolrException: Bad Request
>> Bad Request
>> request: http://192.168.0.1:8983/solr/update?wt=javabin&version=2
>> 	at org.apache.solr.client.solrj.impl.CommonsHttpSolrServer.request(CommonsHttpSolrServer.java:430)
>> 	at org.apache.solr.client.solrj.impl.CommonsHttpSolrServer.request(CommonsHttpSolrServer.java:244)
>> 	at org.apache.solr.client.solrj.request.AbstractUpdateRequest.process(AbstractUpdateRequest.java:105)
>> 	at org.apache.nutch.indexwriter.solr.SolrIndexWriter.close(SolrIndexWriter.java:155)
>> 	at org.apache.nutch.indexer.IndexWriters.close(IndexWriters.java:118)
>> 	at org.apache.nutch.indexer.IndexerOutputFormat$1.close(IndexerOutputFormat.java:44)
>> 	at org.apache.hadoop.mapred.ReduceTask$OldTrackingRecordWriter.close(ReduceTask.java:467)
>> 	at org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:535)
>> 	at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:421)
>> 	at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:398)
>> 2014-12-03 06:53:19,196 ERROR indexer.IndexingJob - Indexer: java.io.IOException: Job failed!
>> 	at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:1357)
>> 	at org.apache.nutch.indexer.IndexingJob.index(IndexingJob.java:114)
>> 	at org.apache.nutch.indexer.IndexingJob.run(IndexingJob.java:176)
>> 	at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
>> 	at org.apache.nutch.indexer.IndexingJob.main(IndexingJob.java:186)
>> 
>> 
>> 
>> 
>> Solr log: logs/solr.log
>> INFO  - 2014-12-03 06:53:18.776; org.apache.solr.update.processor.LogUpdateProcessor; [collection1] webapp=/solr path=/update params={wt=javabin&version=2} {} 0 0
>> ERROR - 2014-12-03 06:53:18.777; org.apache.solr.common.SolrException; org.apache.solr.common.SolrException: ERROR: [doc=http://nutch.apache.org/apidocs/apidocs-1.1/allclasses-frame.html] unknown field 'host'
>> 	at org.apache.solr.update.DocumentBuilder.toDocument(DocumentBuilder.java:185)
>> 	at org.apache.solr.update.AddUpdateCommand.getLuceneDocument(AddUpdateCommand.java:78)
>> 	at org.apache.solr.update.DirectUpdateHandler2.addDoc0(DirectUpdateHandler2.java:238)
>> 	at org.apache.solr.update.DirectUpdateHandler2.addDoc(DirectUpdateHandler2.java:164)
>> 	at org.apache.solr.update.processor.RunUpdateProcessor.processAdd(RunUpdateProcessorFactory.java:69)
>> 	at org.apache.solr.update.processor.UpdateRequestProcessor.processAdd(UpdateRequestProcessor.java:51)
>> 	at org.apache.solr.update.processor.DistributedUpdateProcessor.doLocalAdd(DistributedUpdateProcessor.java:926)
>> 	at org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:1080)
>> 	at org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:692)
>> 	at org.apache.solr.update.processor.LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:100)
>> 	at org.apache.solr.handler.loader.XMLLoader.processUpdate(XMLLoader.java:247)
>> 	at org.apache.solr.handler.loader.XMLLoader.load(XMLLoader.java:174)
>> 	at org.apache.solr.handler.UpdateRequestHandler$1.load(UpdateRequestHandler.java:99)
>> 	at org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74)
>> 	at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
>> 	at org.apache.solr.core.SolrCore.execute(SolrCore.java:1967)
>> 	at org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:777)
>> 	at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:418)
>> 	at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:207)
>> 	at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1419)
>> 	at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:455)
>> 	at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
>> 	at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:557)
>> 	at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
>> 	at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1075)
>> 	at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:384)
>> 	at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
>> 	at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1009)
>> 	at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
>> 	at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
>> 	at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
>> 	at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
>> 	at org.eclipse.jetty.server.Server.handle(Server.java:368)
>> 	at org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:489)
>> 	at org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
>> 	at org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:953)
>> 	at org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:1014)
>> 	at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:953)
>> 	at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
>> 	at org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
>> 	at org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
>> 	at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
>> 	at org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
>> 	at java.lang.Thread.run(Thread.java:745)
>> INFO  - 2014-12-03 06:53:18.796; org.apache.solr.update.processor.LogUpdateProcessor; [collection1] webapp=/solr path=/update params={wt=javabin&version=2} {} 0 0
>> ERROR - 2014-12-03 06:53:18.797; org.apache.solr.common.SolrException; org.apache.solr.common.SolrException: ERROR: [doc=http://nutch.apache.org/apidocs/apidocs-1.1/allclasses-frame.html] unknown field 'host'
>> 	at org.apache.solr.update.DocumentBuilder.toDocument(DocumentBuilder.java:185)
>> 	at org.apache.solr.update.AddUpdateCommand.getLuceneDocument(AddUpdateCommand.java:78)
>> 	at org.apache.solr.update.DirectUpdateHandler2.addDoc0(DirectUpdateHandler2.java:238)
>> 	at org.apache.solr.update.DirectUpdateHandler2.addDoc(DirectUpdateHandler2.java:164)
>> 	at org.apache.solr.update.processor.RunUpdateProcessor.processAdd(RunUpdateProcessorFactory.java:69)
>> 	at org.apache.solr.update.processor.UpdateRequestProcessor.processAdd(UpdateRequestProcessor.java:51)
>> 	at org.apache.solr.update.processor.DistributedUpdateProcessor.doLocalAdd(DistributedUpdateProcessor.java:926)
>> 	at org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:1080)
>> 	at org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:692)
>> 	at org.apache.solr.update.processor.LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:100)
>> 	at org.apache.solr.handler.loader.XMLLoader.processUpdate(XMLLoader.java:247)
>> 	at org.apache.solr.handler.loader.XMLLoader.load(XMLLoader.java:174)
>> 	at org.apache.solr.handler.UpdateRequestHandler$1.load(UpdateRequestHandler.java:99)
>> 	at org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74)
>> 	at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
>> 	at org.apache.solr.core.SolrCore.execute(SolrCore.java:1967)
>> 	at org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:777)
>> 	at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:418)
>> 	at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:207)
>> 	at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1419)
>> 	at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:455)
>> 	at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
>> 	at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:557)
>> 	at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
>> 	at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1075)
>> 	at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:384)
>> 	at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
>> 	at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1009)
>> 	at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
>> 	at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
>> 	at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
>> 	at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
>> 	at org.eclipse.jetty.server.Server.handle(Server.java:368)
>> 	at org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:489)
>> 	at org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
>> 	at org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:953)
>> 	at org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:1014)
>> 	at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:953)
>> 	at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
>> 	at org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
>> 	at org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
>> 	at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
>> 	at org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
>> 	at java.lang.Thread.run(Thread.java:745)
>> 
>> 
> 


Re: org.apache.solr.common.SolrException, unknown field 'host'

Posted by "Arthur.hk.chan@gmail.com" <ar...@gmail.com>.
Can anyone help?

On 3 Dec, 2014, at 7:03 am, Arthur.hk.chan@gmail.com <ar...@gmail.com> wrote:

> Hi,
> 
> I am new to Nutch and Solr, please help!!
> 
> I am using Nutch-1.9, Solr 4.10.2 and Hadoop 2.4.1
> I always get org.apache.solr.common.SolrException: unknown field ‘host’. What could be wrong?
> 
> The schema.xml has <field name="host" type="string" stored="false" indexed="true"/>
> I have already copied the [nutch] schema.xml over the [solr] schema.xml and restarted Solr.
> During the indexing phase, Nutch always returns an “unknown field ‘host’” error.
> 
> Below are my settings. What could be wrong?
> 
> Regards
> Arthur
> 
> 
> 
> 
> input_url/seed.txt 
> http://nutch.apache.org/
> 
> 
> 
> conf/solrindex-mapping.xml
> <?xml version="1.0" encoding="UTF-8"?>
> <mapping>
> 	<fields>
> 		<field dest="content" source="content"/>
> 		<field dest="title" source="title"/>
> 		<field dest="host" source="host"/>
> 		<field dest="segment" source="segment"/>
> 		<field dest="boost" source="boost"/>
> 		<field dest="digest" source="digest"/>
> 		<field dest="tstamp" source="tstamp"/>
>                 <field dest="subject" source="subject"/>
>                 <field dest="description" source="description"/>
>                 <field dest="comments" source="comments"/>
>                 <field dest="author" source="author"/>
>                 <field dest="keywords" source="keywords"/>
>                 <field dest="category" source="category"/> 
>                 <field dest="lastModified" source="lastModified"/>
> 	</fields>
> 	<uniqueKey>id</uniqueKey>
> </mapping>
> 
> 
> 
> conf/regex-urlfilter.txt
> # The default url filter.
> # Better for whole-internet crawling.
> # skip file: ftp: and mailto: urls
> -^(file|ftp|mailto):
> # skip image and other suffixes we can't yet parse
> # for a more extensive coverage use the urlfilter-suffix plugin
> -\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$
> # skip URLs containing certain characters as probable queries, etc.
> -[?*!@=]
> # skip URLs with slash-delimited segment that repeats 3+ times, to break loops
> -.*(/[^/]+)/[^/]+\1/[^/]+\1/
> # accept anything else
> #+.
> +^http://([a-z0-9]*\.)*nutch.apache.org/
> 
> 
> 
> conf/nutch-site.xml
> <?xml version="1.0"?>
> <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
> <!-- Put site-specific property overrides in this file. -->
> <configuration>
>  <property>
>   <name>http.agent.name</name>
>   <value>MyBot</value>
>  </property>
>  <property>
>   <name>http.robots.agents</name>
>   <value>MyBot,*</value>
>  </property>
>  <property>
>   <name>fetcher.store.content</name>
>   <value>true</value>
>  </property>
>  <property>
>   <name>fetcher.max.crawl.delay</name>
>   <value>-1</value>
>  </property>
>   <property>
>   <name>plugin.includes</name>
>   <value>protocol-http|urlfilter-regex|parse-(html|tika|metatags)|index-(basic|anchor|metadata)|query-(basic|site|url)|response-(json|xml)|summary-basic|scoring-opic|indexer-solr|urlnormalizer-(pass|regex|basic)</value>
>  </property>
>  <property>
>   <name>mapred.temp.dir</name>
>   <value>/tmp</value>
>  </property>
> <property>
> <name>metatags.names</name>
> <value>metatag.keywords;metatag.description</value>
> </property>
> <property>
>   <name>index.parse.md</name>
>   <value>metatag.description,metatag.keywords</value>
> </property>
> </configuration>
> 
> 
> 
> solr/collection1/conf/schema.xml
> <?xml version="1.0" encoding="UTF-8" ?>
> <schema name="nutch" version="1.5">
>     <types>
>         <fieldType name="string" class="solr.StrField" sortMissingLast="true"
>             omitNorms="true"/> 
>         <fieldType name="long" class="solr.TrieLongField" precisionStep="0"
>             omitNorms="true" positionIncrementGap="0"/>
>         <fieldType name="float" class="solr.TrieFloatField" precisionStep="0"
>             omitNorms="true" positionIncrementGap="0"/>
>         <fieldType name="date" class="solr.TrieDateField" precisionStep="0"
>             omitNorms="true" positionIncrementGap="0"/>
> 
>         <fieldType name="text" class="solr.TextField"
>             positionIncrementGap="100">
>             <analyzer>
>                 <tokenizer class="solr.WhitespaceTokenizerFactory"/>
>                 <filter class="solr.StopFilterFactory"
>                     ignoreCase="true" words="stopwords.txt"/>
>                 <filter class="solr.WordDelimiterFilterFactory"
>                     generateWordParts="1" generateNumberParts="1"
>                     catenateWords="1" catenateNumbers="1" catenateAll="0"
>                     splitOnCaseChange="1"/>
>                 <filter class="solr.LowerCaseFilterFactory"/>
>                <!--  <filter class="solr.EnglishPorterFilterFactory"
>                     protected="protwords.txt"/> -->
>                 <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
>             </analyzer>
>         </fieldType>
>         <fieldType name="url" class="solr.TextField"
>             positionIncrementGap="100">
>             <analyzer>
>                 <tokenizer class="solr.StandardTokenizerFactory"/>
>                 <filter class="solr.LowerCaseFilterFactory"/>
>                 <filter class="solr.WordDelimiterFilterFactory"
>                     generateWordParts="1" generateNumberParts="1"/>
>             </analyzer>
>         </fieldType>
>     </types>
>     <fields>
> 
>       <field name="_root_" type="string" indexed="true" stored="false"/>
>         <field name="id" type="string" stored="true" indexed="true" required="true"/>
> 
>         <!-- core fields -->
>         <field name="_version_" type="long" indexed="true" stored="true"/>
>         <field name="host" type="string" stored="false" indexed="true"/>   
>         <field name="digest" type="string" stored="true" indexed="false"/>   
>         <field name="segment" type="string" stored="true" indexed="false"/>   
>         <field name="boost" type="float" stored="true" indexed="false"/>   
>         <field name="tstamp" type="date" stored="true" indexed="false"/>  
> 
>         <field name="url" type="text" indexed="true" stored="true" required="true"/> 
>         <field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/>
>         <field name="last_modified" type="date" indexed="true" stored="true"/>
>         <field name="links" type="string" indexed="true" stored="true" multiValued="true"/>
> 
>         <!-- fields for the metatags plugin -->
>         <field name="title" type="text" indexed="true" stored="true" multiValued="true"/>
>         <field name="subject" type="text" indexed="true" stored="true"/>
>         <field name="description" type="text" stored="true" indexed="true"/>
>         <field name="comments" type="text" indexed="true" stored="true"/>
>         <field name="author" type="text" indexed="true" stored="true"/>
>         <field name="keywords" type="text" stored="true" indexed="true"/>
>         <field name="category" type="text" indexed="true" stored="true"/>
>         <field name="resourcename" type="text" indexed="true" stored="true"/>
> 
>         <!-- fields for index-basic plugin -->
>         <field name="content" type="text" indexed="true" stored="true" multiValued="true"/> 
>         <field name="title" type="text" stored="true" indexed="true"/>
>         <field name="cache" type="string" stored="true" indexed="false"/>
> 
>         <!-- fields for index-anchor plugin -->
>         <field name="anchor" type="string" stored="true" indexed="true"
>             multiValued="true"/>
> 
>         <!-- fields for index-more plugin -->
>         <field name="type" type="string" stored="true" indexed="true"
>             multiValued="true"/>
>         <field name="contentLength" type="long" stored="true"
>             indexed="false"/>
>         <field name="lastModified" type="date" stored="true"
>             indexed="false"/>
>         <field name="date" type="date" stored="true" indexed="true"/>
> 
>         <!-- fields for languageidentifier plugin -->
>         <field name="lang" type="string" stored="true" indexed="true"/>
> 
>         <!-- fields for subcollection plugin -->
>         <field name="subcollection" type="string" stored="true"
>             indexed="true" multiValued="true"/>
> 
>         <!-- fields for feed plugin (tag is also used by microformats-reltag)-->
>         <field name="author" type="string" stored="true" indexed="true"/>
>         <field name="tag" type="string" stored="true" indexed="true" multiValued="true"/>
>         <field name="feed" type="string" stored="true" indexed="true"/>
>         <field name="publishedDate" type="date" stored="true"
>             indexed="true"/>
>         <field name="updatedDate" type="date" stored="true"
>             indexed="true"/>
> 
>         <!-- fields for creativecommons plugin -->
>         <field name="cc" type="string" stored="true" indexed="true"
>             multiValued="true"/>
>             
>         <!-- fields for tld plugin -->    
>         <field name="tld" type="string" stored="false" indexed="false"/>
>     </fields>
>     <!--<uniqueKey>id</uniqueKey> -->
>     <uniqueKey>url</uniqueKey>
>     <defaultSearchField>content</defaultSearchField>
>     <solrQueryParser defaultOperator="OR"/>
> </schema>
> 
> 
> 
> 
> 
> Nutch log:
> 2014-12-03 06:53:12,513 INFO  crawl.LinkDb - LinkDb: starting at 2014-12-03 06:53:12
> 2014-12-03 06:53:12,513 INFO  crawl.LinkDb - LinkDb: linkdb: output_url/linkdb
> 2014-12-03 06:53:12,513 INFO  crawl.LinkDb - LinkDb: URL normalize: true
> 2014-12-03 06:53:12,513 INFO  crawl.LinkDb - LinkDb: URL filter: true
> 2014-12-03 06:53:12,513 INFO  crawl.LinkDb - LinkDb: internal links will be ignored.
> 2014-12-03 06:53:12,513 INFO  crawl.LinkDb - LinkDb: adding segment: output_url/segments/20141203064933
> 2014-12-03 06:53:13,688 INFO  crawl.LinkDb - LinkDb: merging with existing linkdb: output_url/linkdb
> 2014-12-03 06:53:14,755 INFO  crawl.LinkDb - LinkDb: finished at 2014-12-03 06:53:14, elapsed: 00:00:02
> 2014-12-03 06:53:15,085 INFO  crawl.DeduplicationJob - DeduplicationJob: starting at 2014-12-03 06:53:15
> 2014-12-03 06:53:15,204 WARN  util.NativeCodeLoader - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
> 2014-12-03 06:53:16,365 INFO  crawl.DeduplicationJob - Deduplication: 0 documents marked as duplicates
> 2014-12-03 06:53:16,365 INFO  crawl.DeduplicationJob - Deduplication: Updating status of duplicate urls into crawl db.
> 2014-12-03 06:53:17,466 INFO  crawl.DeduplicationJob - Deduplication finished at 2014-12-03 06:53:17, elapsed: 00:00:02
> 2014-12-03 06:53:17,819 INFO  indexer.IndexingJob - Indexer: starting at 2014-12-03 06:53:17
> 2014-12-03 06:53:17,859 INFO  indexer.IndexingJob - Indexer: deleting gone documents: false
> 2014-12-03 06:53:17,860 INFO  indexer.IndexingJob - Indexer: URL filtering: false
> 2014-12-03 06:53:17,860 INFO  indexer.IndexingJob - Indexer: URL normalizing: false
> 2014-12-03 06:53:17,969 INFO  indexer.IndexWriters - Adding org.apache.nutch.indexwriter.solr.SolrIndexWriter
> 2014-12-03 06:53:17,969 INFO  indexer.IndexingJob - Active IndexWriters :
> SOLRIndexWriter
> 	solr.server.url : URL of the SOLR instance (mandatory)
> 	solr.commit.size : buffer size when sending to SOLR (default 1000)
> 	solr.mapping.file : name of the mapping file for fields (default solrindex-mapping.xml)
> 	solr.auth : use authentication (default false)
> 	solr.auth.username : use authentication (default false)
> 	solr.auth : username for authentication
> 	solr.auth.password : password for authentication
> 2014-12-03 06:53:17,971 INFO  indexer.IndexerMapReduce - IndexerMapReduce: crawldb: output_url/crawldb
> 2014-12-03 06:53:17,971 INFO  indexer.IndexerMapReduce - IndexerMapReduce: linkdb: output_url/linkdb
> 2014-12-03 06:53:17,971 INFO  indexer.IndexerMapReduce - IndexerMapReduces: adding segment: output_url/segments/20141203064933
> 2014-12-03 06:53:18,038 WARN  util.NativeCodeLoader - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
> 2014-12-03 06:53:18,273 INFO  anchor.AnchorIndexingFilter - Anchor deduplication is: off
> 2014-12-03 06:53:18,657 INFO  indexer.IndexWriters - Adding org.apache.nutch.indexwriter.solr.SolrIndexWriter
> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: content dest: content
> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: title dest: title
> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: host dest: host
> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: segment dest: segment
> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: boost dest: boost
> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: digest dest: digest
> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: tstamp dest: tstamp
> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: subject dest: subject
> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: description dest: description
> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: comments dest: comments
> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: author dest: author
> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: keywords dest: keywords
> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: category dest: category
> 2014-12-03 06:53:18,671 INFO  solr.SolrMappingReader - source: lastModified dest: lastModified
> 2014-12-03 06:53:18,742 INFO  solr.SolrIndexWriter - Indexing 39 documents
> 2014-12-03 06:53:18,793 INFO  solr.SolrIndexWriter - Indexing 39 documents
> 2014-12-03 06:53:18,805 WARN  mapred.LocalJobRunner - job_local637755932_0001
> org.apache.solr.common.SolrException: Bad Request
> Bad Request
> request: http://192.168.0.1:8983/solr/update?wt=javabin&version=2
> 	at org.apache.solr.client.solrj.impl.CommonsHttpSolrServer.request(CommonsHttpSolrServer.java:430)
> 	at org.apache.solr.client.solrj.impl.CommonsHttpSolrServer.request(CommonsHttpSolrServer.java:244)
> 	at org.apache.solr.client.solrj.request.AbstractUpdateRequest.process(AbstractUpdateRequest.java:105)
> 	at org.apache.nutch.indexwriter.solr.SolrIndexWriter.close(SolrIndexWriter.java:155)
> 	at org.apache.nutch.indexer.IndexWriters.close(IndexWriters.java:118)
> 	at org.apache.nutch.indexer.IndexerOutputFormat$1.close(IndexerOutputFormat.java:44)
> 	at org.apache.hadoop.mapred.ReduceTask$OldTrackingRecordWriter.close(ReduceTask.java:467)
> 	at org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:535)
> 	at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:421)
> 	at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:398)
> 2014-12-03 06:53:19,196 ERROR indexer.IndexingJob - Indexer: java.io.IOException: Job failed!
> 	at org.apache.hadoop.mapred.JobClient.runJob(JobClient.java:1357)
> 	at org.apache.nutch.indexer.IndexingJob.index(IndexingJob.java:114)
> 	at org.apache.nutch.indexer.IndexingJob.run(IndexingJob.java:176)
> 	at org.apache.hadoop.util.ToolRunner.run(ToolRunner.java:65)
> 	at org.apache.nutch.indexer.IndexingJob.main(IndexingJob.java:186)
> 
> 
> 
> 
> Solr log: logs/solr.log
> INFO  - 2014-12-03 06:53:18.776; org.apache.solr.update.processor.LogUpdateProcessor; [collection1] webapp=/solr path=/update params={wt=javabin&version=2} {} 0 0
> ERROR - 2014-12-03 06:53:18.777; org.apache.solr.common.SolrException; org.apache.solr.common.SolrException: ERROR: [doc=http://nutch.apache.org/apidocs/apidocs-1.1/allclasses-frame.html] unknown field 'host'
> 	at org.apache.solr.update.DocumentBuilder.toDocument(DocumentBuilder.java:185)
> 	at org.apache.solr.update.AddUpdateCommand.getLuceneDocument(AddUpdateCommand.java:78)
> 	at org.apache.solr.update.DirectUpdateHandler2.addDoc0(DirectUpdateHandler2.java:238)
> 	at org.apache.solr.update.DirectUpdateHandler2.addDoc(DirectUpdateHandler2.java:164)
> 	at org.apache.solr.update.processor.RunUpdateProcessor.processAdd(RunUpdateProcessorFactory.java:69)
> 	at org.apache.solr.update.processor.UpdateRequestProcessor.processAdd(UpdateRequestProcessor.java:51)
> 	at org.apache.solr.update.processor.DistributedUpdateProcessor.doLocalAdd(DistributedUpdateProcessor.java:926)
> 	at org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:1080)
> 	at org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:692)
> 	at org.apache.solr.update.processor.LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:100)
> 	at org.apache.solr.handler.loader.XMLLoader.processUpdate(XMLLoader.java:247)
> 	at org.apache.solr.handler.loader.XMLLoader.load(XMLLoader.java:174)
> 	at org.apache.solr.handler.UpdateRequestHandler$1.load(UpdateRequestHandler.java:99)
> 	at org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74)
> 	at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
> 	at org.apache.solr.core.SolrCore.execute(SolrCore.java:1967)
> 	at org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:777)
> 	at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:418)
> 	at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:207)
> 	at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1419)
> 	at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:455)
> 	at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
> 	at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:557)
> 	at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
> 	at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1075)
> 	at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:384)
> 	at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
> 	at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1009)
> 	at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
> 	at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
> 	at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
> 	at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
> 	at org.eclipse.jetty.server.Server.handle(Server.java:368)
> 	at org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:489)
> 	at org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
> 	at org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:953)
> 	at org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:1014)
> 	at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:953)
> 	at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
> 	at org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
> 	at org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
> 	at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
> 	at org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
> 	at java.lang.Thread.run(Thread.java:745)
> INFO  - 2014-12-03 06:53:18.796; org.apache.solr.update.processor.LogUpdateProcessor; [collection1] webapp=/solr path=/update params={wt=javabin&version=2} {} 0 0
> ERROR - 2014-12-03 06:53:18.797; org.apache.solr.common.SolrException; org.apache.solr.common.SolrException: ERROR: [doc=http://nutch.apache.org/apidocs/apidocs-1.1/allclasses-frame.html] unknown field 'host'
> 	at org.apache.solr.update.DocumentBuilder.toDocument(DocumentBuilder.java:185)
> 	at org.apache.solr.update.AddUpdateCommand.getLuceneDocument(AddUpdateCommand.java:78)
> 	at org.apache.solr.update.DirectUpdateHandler2.addDoc0(DirectUpdateHandler2.java:238)
> 	at org.apache.solr.update.DirectUpdateHandler2.addDoc(DirectUpdateHandler2.java:164)
> 	at org.apache.solr.update.processor.RunUpdateProcessor.processAdd(RunUpdateProcessorFactory.java:69)
> 	at org.apache.solr.update.processor.UpdateRequestProcessor.processAdd(UpdateRequestProcessor.java:51)
> 	at org.apache.solr.update.processor.DistributedUpdateProcessor.doLocalAdd(DistributedUpdateProcessor.java:926)
> 	at org.apache.solr.update.processor.DistributedUpdateProcessor.versionAdd(DistributedUpdateProcessor.java:1080)
> 	at org.apache.solr.update.processor.DistributedUpdateProcessor.processAdd(DistributedUpdateProcessor.java:692)
> 	at org.apache.solr.update.processor.LogUpdateProcessor.processAdd(LogUpdateProcessorFactory.java:100)
> 	at org.apache.solr.handler.loader.XMLLoader.processUpdate(XMLLoader.java:247)
> 	at org.apache.solr.handler.loader.XMLLoader.load(XMLLoader.java:174)
> 	at org.apache.solr.handler.UpdateRequestHandler$1.load(UpdateRequestHandler.java:99)
> 	at org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74)
> 	at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:135)
> 	at org.apache.solr.core.SolrCore.execute(SolrCore.java:1967)
> 	at org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:777)
> 	at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:418)
> 	at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:207)
> 	at org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1419)
> 	at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:455)
> 	at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:137)
> 	at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:557)
> 	at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:231)
> 	at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1075)
> 	at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:384)
> 	at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:193)
> 	at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1009)
> 	at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:135)
> 	at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:255)
> 	at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:154)
> 	at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:116)
> 	at org.eclipse.jetty.server.Server.handle(Server.java:368)
> 	at org.eclipse.jetty.server.AbstractHttpConnection.handleRequest(AbstractHttpConnection.java:489)
> 	at org.eclipse.jetty.server.BlockingHttpConnection.handleRequest(BlockingHttpConnection.java:53)
> 	at org.eclipse.jetty.server.AbstractHttpConnection.content(AbstractHttpConnection.java:953)
> 	at org.eclipse.jetty.server.AbstractHttpConnection$RequestHandler.content(AbstractHttpConnection.java:1014)
> 	at org.eclipse.jetty.http.HttpParser.parseNext(HttpParser.java:953)
> 	at org.eclipse.jetty.http.HttpParser.parseAvailable(HttpParser.java:240)
> 	at org.eclipse.jetty.server.BlockingHttpConnection.handle(BlockingHttpConnection.java:72)
> 	at org.eclipse.jetty.server.bio.SocketConnector$ConnectorEndPoint.run(SocketConnector.java:264)
> 	at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:608)
> 	at org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:543)
> 	at java.lang.Thread.run(Thread.java:745)
> 
>