You are viewing a plain text version of this content. The canonical link for it is here.
Posted to user@nutch.apache.org by Cam Bazz <ca...@gmail.com> on 2011/07/08 08:29:17 UTC
solr indexing error
Hello,
I finally finished crawling yesterday at midnight, and wanted to
index to Solr.
So I used the command:
bin/nutch solrindex http://localhost:8983/solr /home/crawl/crawldb
/home/crawl/linkdb -dir /home/crawl/segments
and the next morning, unfortunately, the indexing had failed. I guess this
is Solr-specific. I also think that one of my crawled pages
contains illegal characters or something.
Here is the log excerpt from hadoop.log:
Any ideas / help / recommendations greatly appreciated.
Best Regards.
org.apache.solr.common.SolrException: [was class
java.io.CharConversionException] Invalid UTF-8 character 0xfffe at
char #1387083, byte #1464613) java.lang.RuntimeException: [was class
java.io.CharConversionException] Invalid UTF-8 character 0xfffe at
char #1387083, byte #1464613) at
com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:18)
at com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731)
at com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3657)
at com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809)
at org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:287)
at org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:146)
at org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77)
at org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:67)
at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:129)
at org.apache.solr.core.SolrCore.execute(SolrCore.java:1368) at
org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:356)
at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:252)
at org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1212)
at org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:399)
at org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:216)
at org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182)
at org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:766)
at org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:450)
at org.mortbay.jetty.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:230)
at org.mortbay.jetty.handler.HandlerCollection.handle(HandlerCollection.java:114)
at org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)
at org.mortbay.jetty.Server.handle(Server.java:326) at
org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542)
at
[was class java.io.CharConversionException] Invalid UTF-8 character
0xfffe at char #1387083, byte #1464613) java.lang.RuntimeException:
[was class java.io.CharConversionException] Invalid UTF-8 character
0xfffe at char #1387083, byte #1464613) at
com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:18)
at com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731)
at com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3657)
at com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809)
at org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:287)
at org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:146)
at org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77)
at org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:67)
at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:129)
at org.apache.solr.core.SolrCore.execute(SolrCore.java:1368) at
org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:356)
at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:252)
at org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1212)
at org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:399)
at org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:216)
at org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182)
at org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:766)
at org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:450)
at org.mortbay.jetty.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:230)
at org.mortbay.jetty.handler.HandlerCollection.handle(HandlerCollection.java:114)
at org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)
at org.mortbay.jetty.Server.handle(Server.java:326) at
org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542)
at
request: http://localhost:8983/solr/update?wt=javabin&version=2
at org.apache.solr.client.solrj.impl.CommonsHttpSolrServer.request(CommonsHttpSolrServer.java:436)
at org.apache.solr.client.solrj.impl.CommonsHttpSolrServer.request(CommonsHttpSolrServer.java:245)
at org.apache.solr.client.solrj.request.AbstractUpdateRequest.process(AbstractUpdateRequest.java:105)
at org.apache.solr.client.solrj.SolrServer.add(SolrServer.java:49)
at org.apache.nutch.indexer.solr.SolrWriter.write(SolrWriter.java:71)
at org.apache.nutch.indexer.IndexerOutputFormat$1.write(IndexerOutputFormat.java:54)
at org.apache.nutch.indexer.IndexerOutputFormat$1.write(IndexerOutputFormat.java:44)
at org.apache.hadoop.mapred.ReduceTask$3.collect(ReduceTask.java:440)
at org.apache.nutch.indexer.IndexerMapReduce.reduce(IndexerMapReduce.java:159)
at org.apache.nutch.indexer.IndexerMapReduce.reduce(IndexerMapReduce.java:50)
at org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:463)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:411)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:216)
2011-07-08 00:50:53,508 ERROR solr.SolrIndexer - java.io.IOException:
Job failed!
Re: solr indexing error
Posted by Markus Jelsma <ma...@openindex.io>.
The patch applies to both. You can compile and use it safely on 1.3.
> Hello Markus,
>
> Is this patch for nutch 1.3 or some development version (1.4.4)
>
> Best Regards,
> C.B.
>
> On Fri, Jul 8, 2011 at 11:33 AM, Markus Jelsma
>
> <ma...@openindex.io> wrote:
> > You're the third in a row with this error. Did we introduce something
> > strange in 1.3? Anyhow, here's a patch to work around the problem:
> >
> > https://issues.apache.org/jira/browse/NUTCH-1016
> >
> >> Hello,
> >>
> >> Finally I have finished crawling yesterday at midnight, and wanted to
> >> index to solr.
> >>
> >> So I have used the command:
> >>
> >> bin/nutch solrindex http://localhost:8983/solr /home/crawl/crawldb
> >> /home/crawl/linkdb -dir /home/crawl/segments
> >>
> >> and next morning unfortunately the indexing was failed. I guess this
> >> is solr specific. And I also think that one of my crawled page
> >> contains illegal characters or something.
> >>
> >> Here is the log excerpt from hadoop.log:
> >>
> >> Any ideas / help / recomendation greatly appreciated.
> >> Best Regards.
> >>
> >> org.apache.solr.common.SolrException: [was class
> >> java.io.CharConversionException] Invalid UTF-8 character 0xfffe at
> >> char #1387083, byte #1464613) java.lang.RuntimeException: [was class
> >> java.io.CharConversionException] Invalid UTF-8 character 0xfffe at
> >> char #1387083, byte #1464613) at
> >> com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java
> >> :18 ) at
> >> com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731) at
> >> com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java
> >> :3 657) at
> >> com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809) at
> >> org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:287) at
> >> org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:146) at
> >> org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77)
> >> at
> >> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(Conte
> >> nt StreamHandlerBase.java:67) at
> >> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerB
> >> as e.java:129) at
> >> org.apache.solr.core.SolrCore.execute(SolrCore.java:1368) at
> >> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.ja
> >> va
> >>
> >> :356) at
> >>
> >> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.j
> >> av a:252) at
> >> org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHan
> >> dl er.java:1212) at
> >> org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:399)
> >> at
> >> org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:2
> >> 16 ) at
> >> org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182)
> >> at
> >> org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:766)
> >> at org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:450)
> >> at
> >> org.mortbay.jetty.handler.ContextHandlerCollection.handle(ContextHandler
> >> Co llection.java:230) at
> >> org.mortbay.jetty.handler.HandlerCollection.handle(HandlerCollection.jav
> >> a: 114) at
> >> org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)
> >> at org.mortbay.jetty.Server.handle(Server.java:326) at
> >> org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542)
> >> at
> >>
> >> [was class java.io.CharConversionException] Invalid UTF-8 character
> >> 0xfffe at char #1387083, byte #1464613) java.lang.RuntimeException:
> >> [was class java.io.CharConversionException] Invalid UTF-8 character
> >> 0xfffe at char #1387083, byte #1464613) at
> >> com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java
> >> :18 ) at
> >> com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731) at
> >> com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java
> >> :3 657) at
> >> com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809) at
> >> org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:287) at
> >> org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:146) at
> >> org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77)
> >> at
> >> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(Conte
> >> nt StreamHandlerBase.java:67) at
> >> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerB
> >> as e.java:129) at
> >> org.apache.solr.core.SolrCore.execute(SolrCore.java:1368) at
> >> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.ja
> >> va
> >>
> >> :356) at
> >>
> >> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.j
> >> av a:252) at
> >> org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHan
> >> dl er.java:1212) at
> >> org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:399)
> >> at
> >> org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:2
> >> 16 ) at
> >> org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182)
> >> at
> >> org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:766)
> >> at org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:450)
> >> at
> >> org.mortbay.jetty.handler.ContextHandlerCollection.handle(ContextHandler
> >> Co llection.java:230) at
> >> org.mortbay.jetty.handler.HandlerCollection.handle(HandlerCollection.jav
> >> a: 114) at
> >> org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)
> >> at org.mortbay.jetty.Server.handle(Server.java:326) at
> >> org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542)
> >> at
> >>
> >> request: http://localhost:8983/solr/update?wt=javabin&version=2
> >> at
> >> org.apache.solr.client.solrj.impl.CommonsHttpSolrServer.request(CommonsH
> >> tt pSolrServer.java:436) at
> >> org.apache.solr.client.solrj.impl.CommonsHttpSolrServer.request(CommonsH
> >> tt pSolrServer.java:245) at
> >> org.apache.solr.client.solrj.request.AbstractUpdateRequest.process(Abstr
> >> ac tUpdateRequest.java:105) at
> >> org.apache.solr.client.solrj.SolrServer.add(SolrServer.java:49) at
> >> org.apache.nutch.indexer.solr.SolrWriter.write(SolrWriter.java:71) at
> >> org.apache.nutch.indexer.IndexerOutputFormat$1.write(IndexerOutputFormat
> >> .j ava:54) at
> >> org.apache.nutch.indexer.IndexerOutputFormat$1.write(IndexerOutputFormat
> >> .j ava:44) at
> >> org.apache.hadoop.mapred.ReduceTask$3.collect(ReduceTask.java:440) at
> >> org.apache.nutch.indexer.IndexerMapReduce.reduce(IndexerMapReduce.java:1
> >> 59 ) at
> >> org.apache.nutch.indexer.IndexerMapReduce.reduce(IndexerMapReduce.java:5
> >> 0) at
> >> org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:463)
> >> at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:411) at
> >> org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:216
> >> ) 2011-07-08 00:50:53,508 ERROR solr.SolrIndexer - java.io.IOException:
> >> Job failed!
Re: solr indexing error
Posted by Cam Bazz <ca...@gmail.com>.
Hello Markus,
Is this patch for Nutch 1.3 or some development version (1.4.4)?
Best Regards,
C.B.
On Fri, Jul 8, 2011 at 11:33 AM, Markus Jelsma
<ma...@openindex.io> wrote:
> You're the third in a row with this error. Did we introduce something strange
> in 1.3? Anyhow, here's a patch to work around the problem:
>
> https://issues.apache.org/jira/browse/NUTCH-1016
>
>> Hello,
>>
>> Finally I have finished crawling yesterday at midnight, and wanted to
>> index to solr.
>>
>> So I have used the command:
>>
>> bin/nutch solrindex http://localhost:8983/solr /home/crawl/crawldb
>> /home/crawl/linkdb -dir /home/crawl/segments
>>
>> and next morning unfortunately the indexing was failed. I guess this
>> is solr specific. And I also think that one of my crawled page
>> contains illegal characters or something.
>>
>> Here is the log excerpt from hadoop.log:
>>
>> Any ideas / help / recomendation greatly appreciated.
>> Best Regards.
>>
>> org.apache.solr.common.SolrException: [was class
>> java.io.CharConversionException] Invalid UTF-8 character 0xfffe at
>> char #1387083, byte #1464613) java.lang.RuntimeException: [was class
>> java.io.CharConversionException] Invalid UTF-8 character 0xfffe at
>> char #1387083, byte #1464613) at
>> com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:18
>> ) at com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731)
>> at
>> com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3
>> 657) at
>> com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809) at
>> org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:287) at
>> org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:146) at
>> org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77)
>> at
>> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(Content
>> StreamHandlerBase.java:67) at
>> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBas
>> e.java:129) at org.apache.solr.core.SolrCore.execute(SolrCore.java:1368)
>> at
>> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java
>> :356) at
>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.jav
>> a:252) at
>> org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandl
>> er.java:1212) at
>> org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:399)
>> at
>> org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:216
>> ) at
>> org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182)
>> at
>> org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:766)
>> at org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:450)
>> at
>> org.mortbay.jetty.handler.ContextHandlerCollection.handle(ContextHandlerCo
>> llection.java:230) at
>> org.mortbay.jetty.handler.HandlerCollection.handle(HandlerCollection.java:
>> 114) at
>> org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)
>> at org.mortbay.jetty.Server.handle(Server.java:326) at
>> org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542)
>> at
>>
>> [was class java.io.CharConversionException] Invalid UTF-8 character
>> 0xfffe at char #1387083, byte #1464613) java.lang.RuntimeException:
>> [was class java.io.CharConversionException] Invalid UTF-8 character
>> 0xfffe at char #1387083, byte #1464613) at
>> com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:18
>> ) at com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731)
>> at
>> com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3
>> 657) at
>> com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809) at
>> org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:287) at
>> org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:146) at
>> org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77)
>> at
>> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(Content
>> StreamHandlerBase.java:67) at
>> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBas
>> e.java:129) at org.apache.solr.core.SolrCore.execute(SolrCore.java:1368)
>> at
>> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java
>> :356) at
>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.jav
>> a:252) at
>> org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandl
>> er.java:1212) at
>> org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:399)
>> at
>> org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:216
>> ) at
>> org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182)
>> at
>> org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:766)
>> at org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:450)
>> at
>> org.mortbay.jetty.handler.ContextHandlerCollection.handle(ContextHandlerCo
>> llection.java:230) at
>> org.mortbay.jetty.handler.HandlerCollection.handle(HandlerCollection.java:
>> 114) at
>> org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)
>> at org.mortbay.jetty.Server.handle(Server.java:326) at
>> org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542)
>> at
>>
>> request: http://localhost:8983/solr/update?wt=javabin&version=2
>> at
>> org.apache.solr.client.solrj.impl.CommonsHttpSolrServer.request(CommonsHtt
>> pSolrServer.java:436) at
>> org.apache.solr.client.solrj.impl.CommonsHttpSolrServer.request(CommonsHtt
>> pSolrServer.java:245) at
>> org.apache.solr.client.solrj.request.AbstractUpdateRequest.process(Abstrac
>> tUpdateRequest.java:105) at
>> org.apache.solr.client.solrj.SolrServer.add(SolrServer.java:49) at
>> org.apache.nutch.indexer.solr.SolrWriter.write(SolrWriter.java:71) at
>> org.apache.nutch.indexer.IndexerOutputFormat$1.write(IndexerOutputFormat.j
>> ava:54) at
>> org.apache.nutch.indexer.IndexerOutputFormat$1.write(IndexerOutputFormat.j
>> ava:44) at
>> org.apache.hadoop.mapred.ReduceTask$3.collect(ReduceTask.java:440) at
>> org.apache.nutch.indexer.IndexerMapReduce.reduce(IndexerMapReduce.java:159
>> ) at
>> org.apache.nutch.indexer.IndexerMapReduce.reduce(IndexerMapReduce.java:50)
>> at org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:463)
>> at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:411) at
>> org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:216)
>> 2011-07-08 00:50:53,508 ERROR solr.SolrIndexer - java.io.IOException: Job
>> failed!
>
Re: solr indexing error
Posted by Markus Jelsma <ma...@openindex.io>.
You're the third in a row with this error. Did we introduce something strange
in 1.3? Anyhow, here's a patch to work around the problem:
https://issues.apache.org/jira/browse/NUTCH-1016
> Hello,
>
> Finally I have finished crawling yesterday at midnight, and wanted to
> index to solr.
>
> So I have used the command:
>
> bin/nutch solrindex http://localhost:8983/solr /home/crawl/crawldb
> /home/crawl/linkdb -dir /home/crawl/segments
>
> and next morning unfortunately the indexing was failed. I guess this
> is solr specific. And I also think that one of my crawled page
> contains illegal characters or something.
>
> Here is the log excerpt from hadoop.log:
>
> Any ideas / help / recomendation greatly appreciated.
> Best Regards.
>
> org.apache.solr.common.SolrException: [was class
> java.io.CharConversionException] Invalid UTF-8 character 0xfffe at
> char #1387083, byte #1464613) java.lang.RuntimeException: [was class
> java.io.CharConversionException] Invalid UTF-8 character 0xfffe at
> char #1387083, byte #1464613) at
> com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:18
> ) at com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731)
> at
> com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3
> 657) at
> com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809) at
> org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:287) at
> org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:146) at
> org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77)
> at
> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(Content
> StreamHandlerBase.java:67) at
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBas
> e.java:129) at org.apache.solr.core.SolrCore.execute(SolrCore.java:1368)
> at
> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java
> :356) at
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.jav
> a:252) at
> org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandl
> er.java:1212) at
> org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:399)
> at
> org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:216
> ) at
> org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182)
> at
> org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:766)
> at org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:450)
> at
> org.mortbay.jetty.handler.ContextHandlerCollection.handle(ContextHandlerCo
> llection.java:230) at
> org.mortbay.jetty.handler.HandlerCollection.handle(HandlerCollection.java:
> 114) at
> org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)
> at org.mortbay.jetty.Server.handle(Server.java:326) at
> org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542)
> at
>
> [was class java.io.CharConversionException] Invalid UTF-8 character
> 0xfffe at char #1387083, byte #1464613) java.lang.RuntimeException:
> [was class java.io.CharConversionException] Invalid UTF-8 character
> 0xfffe at char #1387083, byte #1464613) at
> com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:18
> ) at com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731)
> at
> com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3
> 657) at
> com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809) at
> org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:287) at
> org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:146) at
> org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77)
> at
> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(Content
> StreamHandlerBase.java:67) at
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBas
> e.java:129) at org.apache.solr.core.SolrCore.execute(SolrCore.java:1368)
> at
> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java
> :356) at
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.jav
> a:252) at
> org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandl
> er.java:1212) at
> org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:399)
> at
> org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:216
> ) at
> org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182)
> at
> org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:766)
> at org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:450)
> at
> org.mortbay.jetty.handler.ContextHandlerCollection.handle(ContextHandlerCo
> llection.java:230) at
> org.mortbay.jetty.handler.HandlerCollection.handle(HandlerCollection.java:
> 114) at
> org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)
> at org.mortbay.jetty.Server.handle(Server.java:326) at
> org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542)
> at
>
> request: http://localhost:8983/solr/update?wt=javabin&version=2
> at
> org.apache.solr.client.solrj.impl.CommonsHttpSolrServer.request(CommonsHtt
> pSolrServer.java:436) at
> org.apache.solr.client.solrj.impl.CommonsHttpSolrServer.request(CommonsHtt
> pSolrServer.java:245) at
> org.apache.solr.client.solrj.request.AbstractUpdateRequest.process(Abstrac
> tUpdateRequest.java:105) at
> org.apache.solr.client.solrj.SolrServer.add(SolrServer.java:49) at
> org.apache.nutch.indexer.solr.SolrWriter.write(SolrWriter.java:71) at
> org.apache.nutch.indexer.IndexerOutputFormat$1.write(IndexerOutputFormat.j
> ava:54) at
> org.apache.nutch.indexer.IndexerOutputFormat$1.write(IndexerOutputFormat.j
> ava:44) at
> org.apache.hadoop.mapred.ReduceTask$3.collect(ReduceTask.java:440) at
> org.apache.nutch.indexer.IndexerMapReduce.reduce(IndexerMapReduce.java:159
> ) at
> org.apache.nutch.indexer.IndexerMapReduce.reduce(IndexerMapReduce.java:50)
> at org.apache.hadoop.mapred.ReduceTask.runOldReducer(ReduceTask.java:463)
> at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:411) at
> org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:216)
> 2011-07-08 00:50:53,508 ERROR solr.SolrIndexer - java.io.IOException: Job
> failed!