You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by Markus Jelsma <ma...@openindex.io> on 2011/06/27 12:40:16 UTC

Solr 3.1 indexing error Invalid UTF-8 character 0xffff

Hi,

I came across the indexing error below. It happened in a huge batch update 
from Nutch with SolrJ 3.1. Since the crawl was huge it is very hard to trace 
the error back to a specific document. So I'll try my luck here: has anyone seen
this before with SolrJ 3.1? Is there anything else on the Nutch side I should
have taken care of?

Thanks!


Jun 27, 2011 10:24:28 AM org.apache.solr.core.SolrCore execute
INFO: [] webapp=/solr path=/update params={wt=javabin&version=2} status=500 QTime=423 
Jun 27, 2011 10:24:28 AM org.apache.solr.common.SolrException log
SEVERE: java.lang.RuntimeException: [was class java.io.CharConversionException] Invalid UTF-8 character 0xffff at char #1142033, byte #1155068)
        at com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:18)
        at com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731)
        at com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3657)
        at com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809)
        at org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:287)
        at org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:146)
        at org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77)
        at org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:67)
        at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:129)
        at org.apache.solr.core.SolrCore.execute(SolrCore.java:1368)
        at org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:356)
        at orJun 27, 2011 10:24:28 AM org.apache.solr.core.SolrCore execute
INFO: [] webapp=/solr path=/update params={wt=javabin&version=2} status=500 QTime=423 
Jun 27, 2011 10:24:28 AM org.apache.solr.common.SolrException log
SEVERE: java.lang.RuntimeException: [was class java.io.CharConversionException] Invalid UTF-8 character 0xffff at char #1142033, byte #1155068)
        at com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:18)
        at com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731)
        at com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3657)
        at com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809)
        at org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:287)
        at org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:146)
        at org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77)
        at org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:67)
        at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:129)
        at org.apache.solr.core.SolrCore.execute(SolrCore.java:1368)
        at org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:356)
        at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:252)
        at org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1212)
        at org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:399)
        at org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:216)
        at org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182)
        at org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:766)
        at org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:450)
        at org.mortbay.jetty.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:230)
        at org.mortbay.jetty.handler.HandlerCollection.handle(HandlerCollection.java:114)
        at org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)
        at org.mortbay.jetty.Server.handle(Server.java:326)
        at org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542)
        at org.mortbay.jetty.HttpConnection$RequestHandler.content(HttpConnection.java:945)
        at org.mortbay.jetty.HttpParser.parseNext(HttpParser.java:843)
        at org.mortbay.jetty.HttpParser.parseAvailable(HttpParser.java:218)
        at org.mortbay.jetty.HttpConnection.handle(HttpConnection.java:404)
        at org.mortbay.jetty.bio.SocketConnector$Connection.run(SocketConnector.java:228)
        at org.mortbay.thread.QueuedThreadPool$PoolThread.run(QueuedThreadPool.java:582)
Caused by: java.io.CharConversionException: Invalid UTF-8 character 0xffff at char #1142033, byte #1155068)
        at com.ctc.wstx.io.UTF8Reader.reportInvalid(UTF8Reader.java:335)
        at com.ctc.wstx.io.UTF8Reader.read(UTF8Reader.java:249)
        at com.ctc.wstx.io.MergedReader.read(MergedReader.java:101)
        at com.ctc.wstx.io.ReaderSource.readInto(ReaderSource.java:84)
        at com.ctc.wstx.io.BranchingReaderSource.readInto(BranchingReaderSource.java:57)
        at com.ctc.wstx.sr.StreamScanner.loadMore(StreamScanner.java:992)
        at com.ctc.wstx.sr.BasicStreamReader.readTextSecondary(BasicStreamReader.java:4628)
        at com.ctc.wstx.sr.BasicStreamReader.readCoalescedText(BasicStreamReader.java:4126)
        at com.ctc.wstx.sr.BasicStreamReader.finishToken(BasicStreamReader.java:3701)
        at com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3649)
        ... 26 moreg.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:252)
        at org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1212)
        at org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:399)
        at org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:216)
        at org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182)
        at org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:766)
        at org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:450)
        at org.mortbay.jetty.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:230)
        at org.mortbay.jetty.handler.HandlerCollection.handle(HandlerCollection.java:114)
        at org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)
        at org.mortbay.jetty.Server.handle(Server.java:326)
        at org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542)
        at org.mortbay.jetty.HttpConnection$RequestHandler.content(HttpConnection.java:945)
        at org.mortbay.jetty.HttpParser.parseNext(HttpParser.java:843)
        at org.mortbay.jetty.HttpParser.parseAvailable(HttpParser.java:218)
        at org.mortbay.jetty.HttpConnection.handle(HttpConnection.java:404)
        at org.mortbay.jetty.bio.SocketConnector$Connection.run(SocketConnector.java:228)
        at org.mortbay.thread.QueuedThreadPool$PoolThread.run(QueuedThreadPool.java:582)
Caused by: java.io.CharConversionException: Invalid UTF-8 character 0xffff at char #1142033, byte #1155068)
        at com.ctc.wstx.io.UTF8Reader.reportInvalid(UTF8Reader.java:335)
        at com.ctc.wstx.io.UTF8Reader.read(UTF8Reader.java:249)
        at com.ctc.wstx.io.MergedReader.read(MergedReader.java:101)
        at com.ctc.wstx.io.ReaderSource.readInto(ReaderSource.java:84)
        at com.ctc.wstx.io.BranchingReaderSource.readInto(BranchingReaderSource.java:57)
        at com.ctc.wstx.sr.StreamScanner.loadMore(StreamScanner.java:992)
        at com.ctc.wstx.sr.BasicStreamReader.readTextSecondary(BasicStreamReader.java:4628)
        at com.ctc.wstx.sr.BasicStreamReader.readCoalescedText(BasicStreamReader.java:4126)
        at com.ctc.wstx.sr.BasicStreamReader.finishToken(BasicStreamReader.java:3701)
        at com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3649)
        ... 26 more

Re: Solr 3.1 indexing error Invalid UTF-8 character 0xffff

Posted by Mike Sokolov <so...@ifactory.com>.
OK - re-reading your message it seems maybe that is what you were trying 
to say too, Robert.  FWIW I agree with you that XML is rigid, sometimes 
for purely arbitrary reasons.  But nobody has really helped Markus here 
- unfortunately, there is no easy way out of this mess.  What I do to 
handle issues like this is to wrap the stream I'm handing to the parser 
in some kind of cleanup stream that handles a few yucky issues.  You 
could, eg, just strip out invalid XML characters.  Maybe Nutch should be 
doing this, or at least handling the error better?

-Mike

On 06/27/2011 09:19 AM, Mike Sokolov wrote:
> Actually - you are both wrong!
>
> It is true that 0xffff is a valid UTF8 character, and not a valid UTF8 
> byte sequence.
>
> But the parser is reporting (or trying to) that 0xffff is an invalid 
> XML character.
>
> And Robert - if the wording offends you, you might want to send a note 
> to Tatu (http://jira.codehaus.org/) suggesting that he alter the 
> wording of the error message :)
>
> -Mike
>
> On 06/27/2011 09:01 AM, Bernd Fehling wrote:
>>
>>
>> Am 27.06.2011 14:48, schrieb Robert Muir:
>>> On Mon, Jun 27, 2011 at 8:47 AM, Bernd Fehling
>>> <be...@uni-bielefeld.de>  wrote:
>>>
>>>>
>>>> correct!!!
>>>>
>>>
>>> but what i said, is totally different than what you said.
>>>
>>> you are still wrong.
>>
>> http://www.unicode.org/faq//utf_bom.html
>>
>> see Q: What is a UTF?
>>

Re: Solr 3.1 indexing error Invalid UTF-8 character 0xffff

Posted by Mike Sokolov <so...@ifactory.com>.
Actually - you are both wrong!

It is true that 0xffff is a valid UTF8 character, and not a valid UTF8 
byte sequence.

But the parser is reporting (or trying to) that 0xffff is an invalid XML 
character.

And Robert - if the wording offends you, you might want to send a note 
to Tatu (http://jira.codehaus.org/) suggesting that he alter the wording 
of the error message :)

-Mike

On 06/27/2011 09:01 AM, Bernd Fehling wrote:
>
>
> Am 27.06.2011 14:48, schrieb Robert Muir:
>> On Mon, Jun 27, 2011 at 8:47 AM, Bernd Fehling
>> <be...@uni-bielefeld.de>  wrote:
>>
>>>
>>> correct!!!
>>>
>>
>> but what i said, is totally different than what you said.
>>
>> you are still wrong.
>
> http://www.unicode.org/faq//utf_bom.html
>
> see Q: What is a UTF?
>

Re: Solr 3.1 indexing error Invalid UTF-8 character 0xffff

Posted by Bernd Fehling <be...@uni-bielefeld.de>.

Am 27.06.2011 14:48, schrieb Robert Muir:
> On Mon, Jun 27, 2011 at 8:47 AM, Bernd Fehling
> <be...@uni-bielefeld.de>  wrote:
>
>>
>> correct!!!
>>
>
> but what i said, is totally different than what you said.
>
> you are still wrong.

http://www.unicode.org/faq//utf_bom.html

see Q: What is a UTF?


Re: Solr 3.1 indexing error Invalid UTF-8 character 0xffff

Posted by Robert Muir <rc...@gmail.com>.
On Mon, Jun 27, 2011 at 8:47 AM, Bernd Fehling
<be...@uni-bielefeld.de> wrote:

>
> correct!!!
>

but what i said, is totally different than what you said.

you are still wrong.

Re: Solr 3.1 indexing error Invalid UTF-8 character 0xffff

Posted by Bernd Fehling <be...@uni-bielefeld.de>.
Am 27.06.2011 14:35, schrieb Robert Muir:
> On Mon, Jun 27, 2011 at 8:30 AM, Bernd Fehling
> <be...@uni-bielefeld.de>  wrote:
>
>> Unicode U+FFFF ist UTF-8 byte sequence "ef bf bf" that is right.
>>
>> But I was saying that UTF-8 0xffff (which is byte sequence "ff ff") is
>> illegal
>> and that's what the java.io.CharConversionException is complaining about.
>> "Invalid UTF-8 character 0xffff".
>>
>> Don't mix up Unicode with UTF-8.
>>
>> Sorry, but think are wrong ;-)
>>
>
> Hi, there is no such thing as "UTF-8 0xffff", nor is there any such
> thing as "utf-8 character", despite what this xml parser might say.
>
> This is just a stupid XML parser, like other stupid things about XML,
> it says 'illegal this' or 'illegal that' for arbitrary sets of unicode
> (such as control characters).
>
> You can tell the XML parser is totally broken, when it uses the phrase
> 'utf-8 character'. this term does not exist.

correct!!!

Re: Solr 3.1 indexing error Invalid UTF-8 character 0xffff

Posted by Robert Muir <rc...@gmail.com>.
On Mon, Jun 27, 2011 at 8:30 AM, Bernd Fehling
<be...@uni-bielefeld.de> wrote:

> Unicode U+FFFF ist UTF-8 byte sequence "ef bf bf" that is right.
>
> But I was saying that UTF-8 0xffff (which is byte sequence "ff ff") is
> illegal
> and that's what the java.io.CharConversionException is complaining about.
> "Invalid UTF-8 character 0xffff".
>
> Don't mix up Unicode with UTF-8.
>
> Sorry, but think are wrong ;-)
>

Hi, there is no such thing as "UTF-8 0xffff", nor is there any such
thing as "utf-8 character", despite what this xml parser might say.

This is just a stupid XML parser, like other stupid things about XML,
it says 'illegal this' or 'illegal that' for arbitrary sets of unicode
(such as control characters).

You can tell the XML parser is totally broken, when it uses the phrase
'utf-8 character'. this term does not exist.

Re: Solr 3.1 indexing error Invalid UTF-8 character 0xffff

Posted by Bernd Fehling <be...@uni-bielefeld.de>.
Am 27.06.2011 14:02, schrieb Robert Muir:
> On Mon, Jun 27, 2011 at 7:11 AM, Bernd Fehling
> <be...@uni-bielefeld.de>  wrote:
>
>>
>> So there is no UTF-8 0xffff. It is illegal.
>>
>
> you are wrong: it is legally encoded as a three byte sequence: ef bf bf

Unicode U+FFFF is the UTF-8 byte sequence "ef bf bf", that is right.

But I was saying that UTF-8 0xffff (which is byte sequence "ff ff") is illegal
and that's what the java.io.CharConversionException is complaining about.
"Invalid UTF-8 character 0xffff".

Don't mix up Unicode with UTF-8.

Sorry, but I think you are wrong ;-)

Re: Solr 3.1 indexing error Invalid UTF-8 character 0xffff

Posted by Robert Muir <rc...@gmail.com>.
On Mon, Jun 27, 2011 at 7:11 AM, Bernd Fehling
<be...@uni-bielefeld.de> wrote:

>
> So there is no UTF-8 0xffff. It is illegal.
>

you are wrong: it is legally encoded as a three byte sequence: ef bf bf

Re: Solr 3.1 indexing error Invalid UTF-8 character 0xffff

Posted by Bernd Fehling <be...@uni-bielefeld.de>.
I suggest avoiding illegal UTF-8 characters by pre-filtering your
content stream before loading.

Unicode   UTF-8(hex)
U+07FF    df bf
U+0800    e0 a0 80

So there is no UTF-8 0xffff. It is illegal.

Regards


Am 27.06.2011 12:40, schrieb Markus Jelsma:
> Hi,
>
> I came across the indexing error below. It happened in a huge batch update
> from Nutch with SolrJ 3.1. Since the crawl was huge it is very hard to trace
> the error back to a specific document. So i try my luck here: anyone seen this
> before with SolrJ 3.1? Anything else on the Nutch part i should have taken
> care off?
>
> Thanks!
>
>
> Jun 27, 2011 10:24:28 AM org.apache.solr.core.SolrCore execute
> INFO: [] webapp=/solr path=/update params={wt=javabin&version=2} status=500 QTime=423
> Jun 27, 2011 10:24:28 AM org.apache.solr.common.SolrException log
> SEVERE: java.lang.RuntimeException: [was class java.io.CharConversionException] Invalid UTF-8 character 0xffff at char #1142033, byte #1155068)
>          at com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:18)
>          at com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731)
>          at com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3657)
>          at com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809)
>          at org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:287)
>          at org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:146)
>          at org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77)
>          at org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:67)
>          at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:129)
>          at org.apache.solr.core.SolrCore.execute(SolrCore.java:1368)
>          at org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:356)
>          at orJun 27, 2011 10:24:28 AM org.apache.solr.core.SolrCore execute
> INFO: [] webapp=/solr path=/update params={wt=javabin&version=2} status=500 QTime=423
> Jun 27, 2011 10:24:28 AM org.apache.solr.common.SolrException log
> SEVERE: java.lang.RuntimeException: [was class java.io.CharConversionException] Invalid UTF-8 character 0xffff at char #1142033, byte #1155068)
>          at com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:18)
>          at com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731)
>          at com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3657)
>          at com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809)
>          at org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:287)
>          at org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:146)
>          at org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77)
>          at org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:67)
>          at org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:129)
>          at org.apache.solr.core.SolrCore.execute(SolrCore.java:1368)
>          at org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java:356)
>          at org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:252)
>          at org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1212)
>          at org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:399)
>          at org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:216)
>          at org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182)
>          at org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:766)
>          at org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:450)
>          at org.mortbay.jetty.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:230)
>          at org.mortbay.jetty.handler.HandlerCollection.handle(HandlerCollection.java:114)
>          at org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)
>          at org.mortbay.jetty.Server.handle(Server.java:326)
>          at org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542)
>          at org.mortbay.jetty.HttpConnection$RequestHandler.content(HttpConnection.java:945)
>          at org.mortbay.jetty.HttpParser.parseNext(HttpParser.java:843)
>          at org.mortbay.jetty.HttpParser.parseAvailable(HttpParser.java:218)
>          at org.mortbay.jetty.HttpConnection.handle(HttpConnection.java:404)
>          at org.mortbay.jetty.bio.SocketConnector$Connection.run(SocketConnector.java:228)
>          at org.mortbay.thread.QueuedThreadPool$PoolThread.run(QueuedThreadPool.java:582)
> Caused by: java.io.CharConversionException: Invalid UTF-8 character 0xffff at char #1142033, byte #1155068)
>          at com.ctc.wstx.io.UTF8Reader.reportInvalid(UTF8Reader.java:335)
>          at com.ctc.wstx.io.UTF8Reader.read(UTF8Reader.java:249)
>          at com.ctc.wstx.io.MergedReader.read(MergedReader.java:101)
>          at com.ctc.wstx.io.ReaderSource.readInto(ReaderSource.java:84)
>          at com.ctc.wstx.io.BranchingReaderSource.readInto(BranchingReaderSource.java:57)
>          at com.ctc.wstx.sr.StreamScanner.loadMore(StreamScanner.java:992)
>          at com.ctc.wstx.sr.BasicStreamReader.readTextSecondary(BasicStreamReader.java:4628)
>          at com.ctc.wstx.sr.BasicStreamReader.readCoalescedText(BasicStreamReader.java:4126)
>          at com.ctc.wstx.sr.BasicStreamReader.finishToken(BasicStreamReader.java:3701)
>          at com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3649)
>          ... 26 moreg.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:252)
>          at org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1212)
>          at org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:399)
>          at org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:216)
>          at org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182)
>          at org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:766)
>          at org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:450)
>          at org.mortbay.jetty.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:230)
>          at org.mortbay.jetty.handler.HandlerCollection.handle(HandlerCollection.java:114)
>          at org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)
>          at org.mortbay.jetty.Server.handle(Server.java:326)
>          at org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542)
>          at org.mortbay.jetty.HttpConnection$RequestHandler.content(HttpConnection.java:945)
>          at org.mortbay.jetty.HttpParser.parseNext(HttpParser.java:843)
>          at org.mortbay.jetty.HttpParser.parseAvailable(HttpParser.java:218)
>          at org.mortbay.jetty.HttpConnection.handle(HttpConnection.java:404)
>          at org.mortbay.jetty.bio.SocketConnector$Connection.run(SocketConnector.java:228)
>          at org.mortbay.thread.QueuedThreadPool$PoolThread.run(QueuedThreadPool.java:582)
> Caused by: java.io.CharConversionException: Invalid UTF-8 character 0xffff at char #1142033, byte #1155068)
>          at com.ctc.wstx.io.UTF8Reader.reportInvalid(UTF8Reader.java:335)
>          at com.ctc.wstx.io.UTF8Reader.read(UTF8Reader.java:249)
>          at com.ctc.wstx.io.MergedReader.read(MergedReader.java:101)
>          at com.ctc.wstx.io.ReaderSource.readInto(ReaderSource.java:84)
>          at com.ctc.wstx.io.BranchingReaderSource.readInto(BranchingReaderSource.java:57)
>          at com.ctc.wstx.sr.StreamScanner.loadMore(StreamScanner.java:992)
>          at com.ctc.wstx.sr.BasicStreamReader.readTextSecondary(BasicStreamReader.java:4628)
>          at com.ctc.wstx.sr.BasicStreamReader.readCoalescedText(BasicStreamReader.java:4126)
>          at com.ctc.wstx.sr.BasicStreamReader.finishToken(BasicStreamReader.java:3701)
>          at com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3649)
>          ... 26 more
>

-- 
*************************************************************
Bernd Fehling                Universitätsbibliothek Bielefeld
Dipl.-Inform. (FH)                        Universitätsstr. 25
Tel. +49 521 106-4060                   Fax. +49 521 106-4052
bernd.fehling@uni-bielefeld.de                33615 Bielefeld

BASE - Bielefeld Academic Search Engine - www.base-search.net
*************************************************************

Re: Solr 3.1 indexing error Invalid UTF-8 character 0xffff

Posted by ramires <uy...@beriltech.com>.
Hi,

It's the same error I mentioned here:
http://lucene.472066.n3.nabble.com/strange-utf-8-problem-td3094473.html
Also, if you use Solr 1.4.1 there is no problem like that.


--
View this message in context: http://lucene.472066.n3.nabble.com/Solr-3-1-indexing-error-Invalid-UTF-8-character-0xffff-tp3113191p3113864.html
Sent from the Solr - User mailing list archive at Nabble.com.

Re: Solr 3.1 indexing error Invalid UTF-8 character 0xffff

Posted by Mike Sokolov <so...@ifactory.com>.
I don't think this is a BOM - that would be 0xfeff.  Anyway the problem 
we usually see w/processing XML with BOMs is in UTF8 (which really 
doesn't need a BOM since it's a byte stream anyway), in which if you 
transform the stream (bytes) into a reader (chars) before the xml parser 
can see it, the parser treats the BOM as white space.  But in that case 
you typically get a more specific error about invalid characters in the 
XML prolog, not just a random invalid character error.

-Mike

On 06/27/2011 10:33 AM, lee carroll wrote:
> Hi Markus
>
> I've seen similar issue before (but not with solr) when processing files as xml.
> In our case the problem was due to processing a utf16 file with a byte
> order mark. This presents itself as
> 0xffff to the xml parser which is not used by utf8 (the bom unicode
> would be represented as efbfbf in utf8) This caused the utf8
> aware parser to choke.
>
> I don't want to get involved in any unicode / utf war as I'm confused
> enough as it stands but
> could you check for utf16 files before processing ?
>
> lee c
>
> On 27 June 2011 14:26, Thomas Fischer<fi...@aon.at>  wrote:
>    
>> Hello,
>>
>> Am 27.06.2011 um 12:40 schrieb Markus Jelsma:
>>
>>      
>>> Hi,
>>>
>>> I came across the indexing error below. It happened in a huge batch update
>>> from Nutch with SolrJ 3.1. Since the crawl was huge it is very hard to trace
>>> the error back to a specific document. So i try my luck here: anyone seen this
>>> before with SolrJ 3.1? Anything else on the Nutch part i should have taken
>>> care off?
>>>
>>> Thanks!
>>>
>>>
>>> Jun 27, 2011 10:24:28 AM org.apache.solr.core.SolrCore execute
>>> INFO: [] webapp=/solr path=/update params={wt=javabin&version=2} status=500 QTime=423
>>> Jun 27, 2011 10:24:28 AM org.apache.solr.common.SolrException log
>>> SEVERE: java.lang.RuntimeException: [was class java.io.CharConversionException] Invalid UTF-8 character 0xffff at char #1142033, byte #1155068)
>>>        at com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:18)
>>>        
>> and loads of other rubbish and
>>
>>      
>>>        ... 26 more
>>>        
>>
>> I see this as a problem of solr error-reporting. This is not only obnoxiously "loud" (white on grey with oversized fonts), but less useful than it should be.
>> Instead of telling the user where the error occurred (i.e. while reading which file, which column at which line) it unravels the stack. This is useless if the program just choked on some unexpected input, like a typo in a schema of config file or an invalid character in a file to be indexed.
>> I don't know if this is due to the Tomcat, the logging system of solr itself, but it is annoying.
>>
>> And yes, I've seen something like this before and found the error not by inspecting solr but by opening the suspected files with an appropriate browser (e.g. Firefox) which tells me exactly where something goes wrong.
>>
>> All the best
>> Thomas
>>
>>
>>      

Re: Solr 3.1 indexing error Invalid UTF-8 character 0xffff

Posted by Markus Jelsma <ma...@openindex.io>.

On Monday 27 June 2011 16:33:16 lee carroll wrote:
> Hi Markus
> 
> I've seen similar issue before (but not with solr) when processing files as
> xml. In our case the problem was due to processing a utf16 file with a
> byte order mark. This presents itself as
> 0xffff to the xml parser which is not used by utf8 (the bom unicode
> would be represented as efbfbf in utf8) This caused the utf8
> aware parser to choke.
> 
> I don't want to get involved in any unicode / utf war as I'm confused
> enough as it stands but
> could you check for utf16 files before processing ?

Some files may be UTF-16 but i cannot confirm it right now. On the other hand, 
Nutch should have no trouble processing UTF-16.

> 
> lee c
> 
> On 27 June 2011 14:26, Thomas Fischer <fi...@aon.at> wrote:
> > Hello,
> > 
> > Am 27.06.2011 um 12:40 schrieb Markus Jelsma:
> >> Hi,
> >> 
> >> I came across the indexing error below. It happened in a huge batch
> >> update from Nutch with SolrJ 3.1. Since the crawl was huge it is very
> >> hard to trace the error back to a specific document. So i try my luck
> >> here: anyone seen this before with SolrJ 3.1? Anything else on the
> >> Nutch part i should have taken care off?
> >> 
> >> Thanks!
> >> 
> >> 
> >> Jun 27, 2011 10:24:28 AM org.apache.solr.core.SolrCore execute
> >> INFO: [] webapp=/solr path=/update params={wt=javabin&version=2}
> >> status=500 QTime=423 Jun 27, 2011 10:24:28 AM
> >> org.apache.solr.common.SolrException log SEVERE:
> >> java.lang.RuntimeException: [was class java.io.CharConversionException]
> >> Invalid UTF-8 character 0xffff at char #1142033, byte #1155068) at
> >> com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.jav
> >> a:18)
> > 
> > and loads of other rubbish and
> > 
> >>       ... 26 more
> > 
> > I see this as a problem of solr error-reporting. This is not only
> > obnoxiously "loud" (white on grey with oversized fonts), but less useful
> > than it should be. Instead of telling the user where the error occurred
> > (i.e. while reading which file, which column at which line) it unravels
> > the stack. This is useless if the program just choked on some unexpected
> > input, like a typo in a schema of config file or an invalid character in
> > a file to be indexed. I don't know if this is due to the Tomcat, the
> > logging system of solr itself, but it is annoying.
> > 
> > And yes, I've seen something like this before and found the error not by
> > inspecting solr but by opening the suspected files with an appropriate
> > browser (e.g. Firefox) which tells me exactly where something goes
> > wrong.
> > 
> > All the best
> > Thomas

-- 
Markus Jelsma - CTO - Openindex
http://www.linkedin.com/in/markus17
050-8536620 / 06-50258350

Re: Solr 3.1 indexing error Invalid UTF-8 character 0xffff

Posted by lee carroll <le...@googlemail.com>.
Hi Markus

I've seen similar issue before (but not with solr) when processing files as xml.
In our case the problem was due to processing a utf16 file with a byte
order mark. This presents itself as
0xffff to the xml parser which is not used by utf8 (the bom unicode
would be represented as efbfbf in utf8) This caused the utf8
aware parser to choke.

I don't want to get involved in any unicode / utf war as I'm confused
enough as it stands but
could you check for utf16 files before processing ?

lee c

On 27 June 2011 14:26, Thomas Fischer <fi...@aon.at> wrote:
> Hello,
>
> Am 27.06.2011 um 12:40 schrieb Markus Jelsma:
>
>> Hi,
>>
>> I came across the indexing error below. It happened in a huge batch update
>> from Nutch with SolrJ 3.1. Since the crawl was huge it is very hard to trace
>> the error back to a specific document. So i try my luck here: anyone seen this
>> before with SolrJ 3.1? Anything else on the Nutch part i should have taken
>> care off?
>>
>> Thanks!
>>
>>
>> Jun 27, 2011 10:24:28 AM org.apache.solr.core.SolrCore execute
>> INFO: [] webapp=/solr path=/update params={wt=javabin&version=2} status=500 QTime=423
>> Jun 27, 2011 10:24:28 AM org.apache.solr.common.SolrException log
>> SEVERE: java.lang.RuntimeException: [was class java.io.CharConversionException] Invalid UTF-8 character 0xffff at char #1142033, byte #1155068)
>>       at com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:18)
>
> and loads of other rubbish and
>
>>       ... 26 more
>
>
> I see this as a problem of solr error-reporting. This is not only obnoxiously "loud" (white on grey with oversized fonts), but less useful than it should be.
> Instead of telling the user where the error occurred (i.e. while reading which file, which column at which line) it unravels the stack. This is useless if the program just choked on some unexpected input, like a typo in a schema of config file or an invalid character in a file to be indexed.
> I don't know if this is due to the Tomcat, the logging system of solr itself, but it is annoying.
>
> And yes, I've seen something like this before and found the error not by inspecting solr but by opening the suspected files with an appropriate browser (e.g. Firefox) which tells me exactly where something goes wrong.
>
> All the best
> Thomas
>
>

Re: Solr 3.1 indexing error Invalid UTF-8 character 0xffff

Posted by Thomas Fischer <fi...@aon.at>.
Hello,

Am 27.06.2011 um 12:40 schrieb Markus Jelsma:

> Hi,
> 
> I came across the indexing error below. It happened in a huge batch update 
> from Nutch with SolrJ 3.1. Since the crawl was huge it is very hard to trace 
> the error back to a specific document. So i try my luck here: anyone seen this 
> before with SolrJ 3.1? Anything else on the Nutch part i should have taken 
> care off?
> 
> Thanks!
> 
> 
> Jun 27, 2011 10:24:28 AM org.apache.solr.core.SolrCore execute
> INFO: [] webapp=/solr path=/update params={wt=javabin&version=2} status=500 QTime=423 
> Jun 27, 2011 10:24:28 AM org.apache.solr.common.SolrException log
> SEVERE: java.lang.RuntimeException: [was class java.io.CharConversionException] Invalid UTF-8 character 0xffff at char #1142033, byte #1155068)
>       at com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:18)

and loads of other rubbish and 

>       ... 26 more


I see this as a problem with Solr's error reporting. It is not only obnoxiously "loud" (white on grey with oversized fonts), but also less useful than it should be.
Instead of telling the user where the error occurred (i.e. which file was being read, and at which line and column), it unwinds the stack. This is useless if the program just choked on some unexpected input, like a typo in a schema or config file or an invalid character in a file to be indexed.
I don't know whether this is due to Tomcat, the logging system, or Solr itself, but it is annoying.

And yes, I've seen something like this before and found the error not by inspecting Solr but by opening the suspect files with an appropriate browser (e.g. Firefox), which tells me exactly where something goes wrong.

All the best
Thomas


Re: Solr 3.1 indexing error Invalid UTF-8 character 0xffff

Posted by Markus Jelsma <ma...@openindex.io>.
Of course, I can't print the system bell and stuff like that in XML. I'll
improve the method to get rid of non-printable control characters as well.

On Monday 27 June 2011 18:16:08 Mike Sokolov wrote:
> Markus - if you want to make sure not to offend XML parsers, you should
> strip all characters not in this list:
> 
> http://en.wikipedia.org/wiki/XML#Valid_characters
> 
> You'll see that article talks about XML 1.1, which accepts a wider range
> of characters than XML 1.0, and I believe the Woodstox parser used in
> Solr adheres to that convention.  But note the restriction about control
> characters needing to be encoded - I'm not sure, but it might also be
> best to strip out chars < 32 except for \r, \n and \t.  You definitely
> need to remove \0 also...
> 
> On 06/27/2011 11:59 AM, Markus Jelsma wrote:
> > Of course it doesn't work like this: use AND instead of OR!
> > 
> > On Monday 27 June 2011 17:50:01 Markus Jelsma wrote:
> >> Hi all, thanks for your comments. I seem to have fixed it by now by
> >> simply stripping away all non-character codepoints [1] by iterating
> >> over the individual chars and checking them against:
> >> 
> >> if (ch % 0x10000 != 0xffff || ch % 0x10000 != 0xfffe || (ch <= 0xfdd0 &&
> >> ch >= 0xfdef)) { pass; }
> >> 
> >> Comments?
> >> 
> >> [1]: http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
> >> 
> >> On Monday 27 June 2011 12:40:16 Markus Jelsma wrote:
> >>> Hi,
> >>> 
> >>> I came across the indexing error below. It happened in a huge batch
> >>> update from Nutch with SolrJ 3.1. Since the crawl was huge it is very
> >>> hard to trace the error back to a specific document. So i try my luck
> >>> here: anyone seen this before with SolrJ 3.1? Anything else on the
> >>> Nutch part i should have taken care off?
> >>> 
> >>> Thanks!
> >>> 
> >>> 
> >>> Jun 27, 2011 10:24:28 AM org.apache.solr.core.SolrCore execute
> >>> INFO: [] webapp=/solr path=/update params={wt=javabin&version=2}
> >>> status=500 QTime=423 Jun 27, 2011 10:24:28 AM
> >>> org.apache.solr.common.SolrException log
> >>> SEVERE: java.lang.RuntimeException: [was class
> >>> java.io.CharConversionException] Invalid UTF-8 character 0xffff at char
> >>> #1142033, byte #1155068) at
> >>> com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.jav
> >>> a: 1 8) at
> >>> com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731) at
> >>> com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.jav
> >>> a: 3 657) at
> >>> com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809)
> >>> at org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:287) at
> >>> org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:146) at
> >>> org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77) at
> >>> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(Cont
> >>> en t StreamHandlerBase.java:67) at
> >>> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandler
> >>> Ba s e.java:129) at
> >>> org.apache.solr.core.SolrCore.execute(SolrCore.java:1368) at
> >>> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.j
> >>> a va
> >>> 
> >>> :356) at orJun 27, 2011 10:24:28 AM org.apache.solr.core.SolrCore
> >>> :execute
> >>> 
> >>> INFO: [] webapp=/solr path=/update params={wt=javabin&version=2}
> >>> status=500 QTime=423 Jun 27, 2011 10:24:28 AM
> >>> org.apache.solr.common.SolrException log
> >>> SEVERE: java.lang.RuntimeException: [was class
> >>> java.io.CharConversionException] Invalid UTF-8 character 0xffff at char
> >>> #1142033, byte #1155068) at
> >>> com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.jav
> >>> a: 1 8) at
> >>> com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731) at
> >>> com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.jav
> >>> a: 3 657) at
> >>> com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809)
> >>> at org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:287) at
> >>> org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:146) at
> >>> org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77) at
> >>> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(Cont
> >>> en t StreamHandlerBase.java:67) at
> >>> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandler
> >>> Ba s e.java:129) at
> >>> org.apache.solr.core.SolrCore.execute(SolrCore.java:1368) at
> >>> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.j
> >>> a va
> >>> 
> >>> :356) at
> >>> 
> >>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.
> >>> ja v a:252) at
> >>> org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHa
> >>> nd l er.java:1212) at
> >>> org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:399
> >>> ) at
> >>> org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:
> >>> 21 6 ) at
> >>> org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182
> >>> ) at
> >>> org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:766
> >>> ) at
> >>> org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:450)
> >>> at
> >>> org.mortbay.jetty.handler.ContextHandlerCollection.handle(ContextHandle
> >>> rC o llection.java:230) at
> >>> org.mortbay.jetty.handler.HandlerCollection.handle(HandlerCollection.ja
> >>> va
> >>> 
> >>> : 114) at
> >>> 
> >>> org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152
> >>> ) at org.mortbay.jetty.Server.handle(Server.java:326)
> >>> 
> >>>          at
> >>> 
> >>> org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542)
> >>> at
> >>> org.mortbay.jetty.HttpConnection$RequestHandler.content(HttpConnection.
> >>> j av a:945) at
> >>> org.mortbay.jetty.HttpParser.parseNext(HttpParser.java:843) at
> >>> org.mortbay.jetty.HttpParser.parseAvailable(HttpParser.java:218) at
> >>> org.mortbay.jetty.HttpConnection.handle(HttpConnection.java:404) at
> >>> org.mortbay.jetty.bio.SocketConnector$Connection.run(SocketConnector.j
> >>> av a: 228) at
> >>> org.mortbay.thread.QueuedThreadPool$PoolThread.run(QueuedThreadPool.jav
> >>> a: 5 82) Caused by: java.io.CharConversionException: Invalid UTF-8
> >>> character 0xffff at char #1142033, byte #1155068) at
> >>> com.ctc.wstx.io.UTF8Reader.reportInvalid(UTF8Reader.java:335) at
> >>> com.ctc.wstx.io.UTF8Reader.read(UTF8Reader.java:249)
> >>> 
> >>>          at com.ctc.wstx.io.MergedReader.read(MergedReader.java:101)
> >>>          at com.ctc.wstx.io.ReaderSource.readInto(ReaderSource.java:84)
> >>>          at
> >>> 
> >>> com.ctc.wstx.io.BranchingReaderSource.readInto(BranchingReaderSource.ja
> >>> va
> >>> 
> >>> : 57) at com.ctc.wstx.sr.StreamScanner.loadMore(StreamScanner.java:992)
> >>> 
> >>> at
> >>> com.ctc.wstx.sr.BasicStreamReader.readTextSecondary(BasicStreamReader.j
> >>> a va
> >>> 
> >>> :4628) at
> >>> 
> >>> com.ctc.wstx.sr.BasicStreamReader.readCoalescedText(BasicStreamReader.j
> >>> av a
> >>> 
> >>> :4126) at
> >>> 
> >>> com.ctc.wstx.sr.BasicStreamReader.finishToken(BasicStreamReader.java:37
> >>> 01 ) at
> >>> com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.jav
> >>> a: 3 649) ... 26
> >>> moreg.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilte
> >>> r. j ava:252) at
> >>> org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHa
> >>> nd l er.java:1212) at
> >>> org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:399
> >>> ) at
> >>> org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:
> >>> 21 6 ) at
> >>> org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182
> >>> ) at
> >>> org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:766
> >>> ) at
> >>> org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:450)
> >>> at
> >>> org.mortbay.jetty.handler.ContextHandlerCollection.handle(ContextHandle
> >>> rC o llection.java:230) at
> >>> org.mortbay.jetty.handler.HandlerCollection.handle(HandlerCollection.ja
> >>> va
> >>> 
> >>> : 114) at
> >>> 
> >>> org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152
> >>> ) at org.mortbay.jetty.Server.handle(Server.java:326)
> >>> 
> >>>          at
> >>> 
> >>> org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542)
> >>> at
> >>> org.mortbay.jetty.HttpConnection$RequestHandler.content(HttpConnection.
> >>> j av a:945) at
> >>> org.mortbay.jetty.HttpParser.parseNext(HttpParser.java:843) at
> >>> org.mortbay.jetty.HttpParser.parseAvailable(HttpParser.java:218) at
> >>> org.mortbay.jetty.HttpConnection.handle(HttpConnection.java:404) at
> >>> org.mortbay.jetty.bio.SocketConnector$Connection.run(SocketConnector.j
> >>> av a: 228) at
> >>> org.mortbay.thread.QueuedThreadPool$PoolThread.run(QueuedThreadPool.jav
> >>> a: 5 82) Caused by: java.io.CharConversionException: Invalid UTF-8
> >>> character 0xffff at char #1142033, byte #1155068) at
> >>> com.ctc.wstx.io.UTF8Reader.reportInvalid(UTF8Reader.java:335) at
> >>> com.ctc.wstx.io.UTF8Reader.read(UTF8Reader.java:249)
> >>> 
> >>>          at com.ctc.wstx.io.MergedReader.read(MergedReader.java:101)
> >>>          at com.ctc.wstx.io.ReaderSource.readInto(ReaderSource.java:84)
> >>>          at
> >>> 
> >>> com.ctc.wstx.io.BranchingReaderSource.readInto(BranchingReaderSource.ja
> >>> va
> >>> 
> >>> : 57) at com.ctc.wstx.sr.StreamScanner.loadMore(StreamScanner.java:992)
> >>> 
> >>> at
> >>> com.ctc.wstx.sr.BasicStreamReader.readTextSecondary(BasicStreamReader.j
> >>> a va
> >>> 
> >>> :4628) at
> >>> 
> >>> com.ctc.wstx.sr.BasicStreamReader.readCoalescedText(BasicStreamReader.j
> >>> av a
> >>> 
> >>> :4126) at
> >>> 
> >>> com.ctc.wstx.sr.BasicStreamReader.finishToken(BasicStreamReader.java:37
> >>> 01 ) at
> >>> com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.jav
> >>> a: 3 649) ... 26 more

-- 
Markus Jelsma - CTO - Openindex
http://www.linkedin.com/in/markus17
050-8536620 / 06-50258350

Re: Solr 3.1 indexing error Invalid UTF-8 character 0xffff

Posted by Mike Sokolov <so...@ifactory.com>.
Markus - if you want to make sure not to offend XML parsers, you should 
strip all characters not in this list:

http://en.wikipedia.org/wiki/XML#Valid_characters

You'll see that article talks about XML 1.1, which accepts a wider range 
of characters than XML 1.0, and I believe the Woodstox parser used in 
Solr adheres to that convention.  But note the restriction about control 
characters needing to be encoded - I'm not sure, but it might also be 
best to strip out chars < 32 except for \r, \n and \t.  You definitely 
need to remove \0 also...

On 06/27/2011 11:59 AM, Markus Jelsma wrote:
> Of course it doesn't work like this: use AND instead of OR!
>
> On Monday 27 June 2011 17:50:01 Markus Jelsma wrote:
>    
>> Hi all, thanks for your comments. I seem to have fixed it by now by simply
>> stripping away all non-character codepoints [1] by iterating over the
>> individual chars and checking them against:
>>
>> if (ch % 0x10000 != 0xffff || ch % 0x10000 != 0xfffe || (ch <= 0xfdd0 &&
>> ch >= 0xfdef)) { pass; }
>>
>> Comments?
>>
>> [1]: http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
>>
>> On Monday 27 June 2011 12:40:16 Markus Jelsma wrote:
>>      
>>> Hi,
>>>
>>> I came across the indexing error below. It happened in a huge batch
>>> update from Nutch with SolrJ 3.1. Since the crawl was huge it is very
>>> hard to trace the error back to a specific document. So i try my luck
>>> here: anyone seen this before with SolrJ 3.1? Anything else on the Nutch
>>> part i should have taken care off?
>>>
>>> Thanks!
>>>
>>>
>>> Jun 27, 2011 10:24:28 AM org.apache.solr.core.SolrCore execute
>>> INFO: [] webapp=/solr path=/update params={wt=javabin&version=2}
>>> status=500 QTime=423 Jun 27, 2011 10:24:28 AM
>>> org.apache.solr.common.SolrException log
>>> SEVERE: java.lang.RuntimeException: [was class
>>> java.io.CharConversionException] Invalid UTF-8 character 0xffff at char
>>> #1142033, byte #1155068) at
>>> com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:
>>> 1 8) at
>>> com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731) at
>>> com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:
>>> 3 657) at
>>> com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809) at
>>> org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:287) at
>>> org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:146) at
>>> org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77) at
>>> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(Conten
>>> t StreamHandlerBase.java:67) at
>>> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBa
>>> s e.java:129) at
>>> org.apache.solr.core.SolrCore.execute(SolrCore.java:1368) at
>>> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.ja
>>> va
>>>
>>> :356) at orJun 27, 2011 10:24:28 AM org.apache.solr.core.SolrCore execute
>>>
>>> INFO: [] webapp=/solr path=/update params={wt=javabin&version=2}
>>> status=500 QTime=423 Jun 27, 2011 10:24:28 AM
>>> org.apache.solr.common.SolrException log
>>> SEVERE: java.lang.RuntimeException: [was class
>>> java.io.CharConversionException] Invalid UTF-8 character 0xffff at char
>>> #1142033, byte #1155068) at
>>> com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:
>>> 1 8) at
>>> com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731) at
>>> com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:
>>> 3 657) at
>>> com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809) at
>>> org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:287) at
>>> org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:146) at
>>> org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77) at
>>> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(Conten
>>> t StreamHandlerBase.java:67) at
>>> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBa
>>> s e.java:129) at
>>> org.apache.solr.core.SolrCore.execute(SolrCore.java:1368) at
>>> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.ja
>>> va
>>>
>>> :356) at
>>>
>>> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.ja
>>> v a:252) at
>>> org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHand
>>> l er.java:1212) at
>>> org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:399)
>>> at
>>> org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:21
>>> 6 ) at
>>> org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182)
>>> at
>>> org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:766)
>>> at org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:450)
>>> at
>>> org.mortbay.jetty.handler.ContextHandlerCollection.handle(ContextHandlerC
>>> o llection.java:230) at
>>> org.mortbay.jetty.handler.HandlerCollection.handle(HandlerCollection.java
>>> : 114) at
>>> org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)
>>> at org.mortbay.jetty.Server.handle(Server.java:326)
>>>
>>>          at
>>>
>>> org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542)
>>> at
>>> org.mortbay.jetty.HttpConnection$RequestHandler.content(HttpConnection.j
>>> av a:945) at org.mortbay.jetty.HttpParser.parseNext(HttpParser.java:843)
>>> at org.mortbay.jetty.HttpParser.parseAvailable(HttpParser.java:218) at
>>> org.mortbay.jetty.HttpConnection.handle(HttpConnection.java:404) at
>>> org.mortbay.jetty.bio.SocketConnector$Connection.run(SocketConnector.jav
>>> a: 228) at
>>> org.mortbay.thread.QueuedThreadPool$PoolThread.run(QueuedThreadPool.java:
>>> 5 82) Caused by: java.io.CharConversionException: Invalid UTF-8 character
>>> 0xffff at char #1142033, byte #1155068) at
>>> com.ctc.wstx.io.UTF8Reader.reportInvalid(UTF8Reader.java:335) at
>>> com.ctc.wstx.io.UTF8Reader.read(UTF8Reader.java:249)
>>>
>>>          at com.ctc.wstx.io.MergedReader.read(MergedReader.java:101)
>>>          at com.ctc.wstx.io.ReaderSource.readInto(ReaderSource.java:84)
>>>          at
>>>
>>> com.ctc.wstx.io.BranchingReaderSource.readInto(BranchingReaderSource.java
>>> : 57) at com.ctc.wstx.sr.StreamScanner.loadMore(StreamScanner.java:992)
>>> at
>>> com.ctc.wstx.sr.BasicStreamReader.readTextSecondary(BasicStreamReader.ja
>>> va
>>>
>>> :4628) at
>>>
>>> com.ctc.wstx.sr.BasicStreamReader.readCoalescedText(BasicStreamReader.jav
>>> a
>>>
>>> :4126) at
>>>
>>> com.ctc.wstx.sr.BasicStreamReader.finishToken(BasicStreamReader.java:3701
>>> ) at
>>> com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:
>>> 3 649) ... 26
>>> moreg.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.
>>> j ava:252) at
>>> org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHand
>>> l er.java:1212) at
>>> org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:399)
>>> at
>>> org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:21
>>> 6 ) at
>>> org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182)
>>> at
>>> org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:766)
>>> at org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:450)
>>> at
>>> org.mortbay.jetty.handler.ContextHandlerCollection.handle(ContextHandlerC
>>> o llection.java:230) at
>>> org.mortbay.jetty.handler.HandlerCollection.handle(HandlerCollection.java
>>> : 114) at
>>> org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)
>>> at org.mortbay.jetty.Server.handle(Server.java:326)
>>>
>>>          at
>>>
>>> org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542)
>>> at
>>> org.mortbay.jetty.HttpConnection$RequestHandler.content(HttpConnection.j
>>> av a:945) at org.mortbay.jetty.HttpParser.parseNext(HttpParser.java:843)
>>> at org.mortbay.jetty.HttpParser.parseAvailable(HttpParser.java:218) at
>>> org.mortbay.jetty.HttpConnection.handle(HttpConnection.java:404) at
>>> org.mortbay.jetty.bio.SocketConnector$Connection.run(SocketConnector.jav
>>> a: 228) at
>>> org.mortbay.thread.QueuedThreadPool$PoolThread.run(QueuedThreadPool.java:
>>> 5 82) Caused by: java.io.CharConversionException: Invalid UTF-8 character
>>> 0xffff at char #1142033, byte #1155068) at
>>> com.ctc.wstx.io.UTF8Reader.reportInvalid(UTF8Reader.java:335) at
>>> com.ctc.wstx.io.UTF8Reader.read(UTF8Reader.java:249)
>>>
>>>          at com.ctc.wstx.io.MergedReader.read(MergedReader.java:101)
>>>          at com.ctc.wstx.io.ReaderSource.readInto(ReaderSource.java:84)
>>>          at
>>>
>>> com.ctc.wstx.io.BranchingReaderSource.readInto(BranchingReaderSource.java
>>> : 57) at com.ctc.wstx.sr.StreamScanner.loadMore(StreamScanner.java:992)
>>> at
>>> com.ctc.wstx.sr.BasicStreamReader.readTextSecondary(BasicStreamReader.ja
>>> va
>>>
>>> :4628) at
>>>
>>> com.ctc.wstx.sr.BasicStreamReader.readCoalescedText(BasicStreamReader.jav
>>> a
>>>
>>> :4126) at
>>>
>>> com.ctc.wstx.sr.BasicStreamReader.finishToken(BasicStreamReader.java:3701
>>> ) at
>>> com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:
>>> 3 649) ... 26 more
>>>        
>    

Re: Solr 3.1 indexing error Invalid UTF-8 character 0xffff

Posted by Markus Jelsma <ma...@openindex.io>.
Of course it doesn't work like this: use AND instead of OR!

On Monday 27 June 2011 17:50:01 Markus Jelsma wrote:
> Hi all, thanks for your comments. I seem to have fixed it by now by simply
> stripping away all non-character codepoints [1] by iterating over the
> individual chars and checking them against:
> 
> if (ch % 0x10000 != 0xffff || ch % 0x10000 != 0xfffe || (ch <= 0xfdd0 && ch
> >= 0xfdef)) { pass; }
> 
> Comments?
> 
> [1]: http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]
> 
> On Monday 27 June 2011 12:40:16 Markus Jelsma wrote:
> > Hi,
> > 
> > I came across the indexing error below. It happened in a huge batch
> > update from Nutch with SolrJ 3.1. Since the crawl was huge it is very
> > hard to trace the error back to a specific document. So i try my luck
> > here: anyone seen this before with SolrJ 3.1? Anything else on the Nutch
> > part i should have taken care off?
> > 
> > Thanks!
> > 
> > 
> > Jun 27, 2011 10:24:28 AM org.apache.solr.core.SolrCore execute
> > INFO: [] webapp=/solr path=/update params={wt=javabin&version=2}
> > status=500 QTime=423 Jun 27, 2011 10:24:28 AM
> > org.apache.solr.common.SolrException log
> > SEVERE: java.lang.RuntimeException: [was class
> > java.io.CharConversionException] Invalid UTF-8 character 0xffff at char
> > #1142033, byte #1155068) at
> > com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:
> > 1 8) at
> > com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731) at
> > com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:
> > 3 657) at
> > com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809) at
> > org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:287) at
> > org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:146) at
> > org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77) at
> > org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(Conten
> > t StreamHandlerBase.java:67) at
> > org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBa
> > s e.java:129) at
> > org.apache.solr.core.SolrCore.execute(SolrCore.java:1368) at
> > org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.ja
> > va
> > 
> > :356) at orJun 27, 2011 10:24:28 AM org.apache.solr.core.SolrCore execute
> > 
> > INFO: [] webapp=/solr path=/update params={wt=javabin&version=2}
> > status=500 QTime=423 Jun 27, 2011 10:24:28 AM
> > org.apache.solr.common.SolrException log
> > SEVERE: java.lang.RuntimeException: [was class
> > java.io.CharConversionException] Invalid UTF-8 character 0xffff at char
> > #1142033, byte #1155068) at
> > com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:
> > 1 8) at
> > com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731) at
> > com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:
> > 3 657) at
> > com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809) at
> > org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:287) at
> > org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:146) at
> > org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77) at
> > org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(Conten
> > t StreamHandlerBase.java:67) at
> > org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBa
> > s e.java:129) at
> > org.apache.solr.core.SolrCore.execute(SolrCore.java:1368) at
> > org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.ja
> > va
> > 
> > :356) at
> > 
> > org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.ja
> > v a:252) at
> > org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHand
> > l er.java:1212) at
> > org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:399)
> > at
> > org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:21
> > 6 ) at
> > org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182)
> > at
> > org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:766)
> > at org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:450)
> > at
> > org.mortbay.jetty.handler.ContextHandlerCollection.handle(ContextHandlerC
> > o llection.java:230) at
> > org.mortbay.jetty.handler.HandlerCollection.handle(HandlerCollection.java
> > : 114) at
> > org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)
> > at org.mortbay.jetty.Server.handle(Server.java:326)
> > 
> >         at
> > 
> > org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542)
> > at
> > org.mortbay.jetty.HttpConnection$RequestHandler.content(HttpConnection.j
> > av a:945) at org.mortbay.jetty.HttpParser.parseNext(HttpParser.java:843)
> > at org.mortbay.jetty.HttpParser.parseAvailable(HttpParser.java:218) at
> > org.mortbay.jetty.HttpConnection.handle(HttpConnection.java:404) at
> > org.mortbay.jetty.bio.SocketConnector$Connection.run(SocketConnector.jav
> > a: 228) at
> > org.mortbay.thread.QueuedThreadPool$PoolThread.run(QueuedThreadPool.java:
> > 5 82) Caused by: java.io.CharConversionException: Invalid UTF-8 character
> > 0xffff at char #1142033, byte #1155068) at
> > com.ctc.wstx.io.UTF8Reader.reportInvalid(UTF8Reader.java:335) at
> > com.ctc.wstx.io.UTF8Reader.read(UTF8Reader.java:249)
> > 
> >         at com.ctc.wstx.io.MergedReader.read(MergedReader.java:101)
> >         at com.ctc.wstx.io.ReaderSource.readInto(ReaderSource.java:84)
> >         at
> > 
> > com.ctc.wstx.io.BranchingReaderSource.readInto(BranchingReaderSource.java
> > : 57) at com.ctc.wstx.sr.StreamScanner.loadMore(StreamScanner.java:992)
> > at
> > com.ctc.wstx.sr.BasicStreamReader.readTextSecondary(BasicStreamReader.ja
> > va
> > 
> > :4628) at
> > 
> > com.ctc.wstx.sr.BasicStreamReader.readCoalescedText(BasicStreamReader.jav
> > a
> > 
> > :4126) at
> > 
> > com.ctc.wstx.sr.BasicStreamReader.finishToken(BasicStreamReader.java:3701
> > ) at
> > com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:
> > 3 649) ... 26
> > moreg.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.
> > j ava:252) at
> > org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHand
> > l er.java:1212) at
> > org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:399)
> > at
> > org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:21
> > 6 ) at
> > org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182)
> > at
> > org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:766)
> > at org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:450)
> > at
> > org.mortbay.jetty.handler.ContextHandlerCollection.handle(ContextHandlerC
> > o llection.java:230) at
> > org.mortbay.jetty.handler.HandlerCollection.handle(HandlerCollection.java
> > : 114) at
> > org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)
> > at org.mortbay.jetty.Server.handle(Server.java:326)
> > 
> >         at
> > 
> > org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542)
> > at
> > org.mortbay.jetty.HttpConnection$RequestHandler.content(HttpConnection.j
> > av a:945) at org.mortbay.jetty.HttpParser.parseNext(HttpParser.java:843)
> > at org.mortbay.jetty.HttpParser.parseAvailable(HttpParser.java:218) at
> > org.mortbay.jetty.HttpConnection.handle(HttpConnection.java:404) at
> > org.mortbay.jetty.bio.SocketConnector$Connection.run(SocketConnector.jav
> > a: 228) at
> > org.mortbay.thread.QueuedThreadPool$PoolThread.run(QueuedThreadPool.java:
> > 5 82) Caused by: java.io.CharConversionException: Invalid UTF-8 character
> > 0xffff at char #1142033, byte #1155068) at
> > com.ctc.wstx.io.UTF8Reader.reportInvalid(UTF8Reader.java:335) at
> > com.ctc.wstx.io.UTF8Reader.read(UTF8Reader.java:249)
> > 
> >         at com.ctc.wstx.io.MergedReader.read(MergedReader.java:101)
> >         at com.ctc.wstx.io.ReaderSource.readInto(ReaderSource.java:84)
> >         at
> > 
> > com.ctc.wstx.io.BranchingReaderSource.readInto(BranchingReaderSource.java
> > : 57) at com.ctc.wstx.sr.StreamScanner.loadMore(StreamScanner.java:992)
> > at
> > com.ctc.wstx.sr.BasicStreamReader.readTextSecondary(BasicStreamReader.ja
> > va
> > 
> > :4628) at
> > 
> > com.ctc.wstx.sr.BasicStreamReader.readCoalescedText(BasicStreamReader.jav
> > a
> > 
> > :4126) at
> > 
> > com.ctc.wstx.sr.BasicStreamReader.finishToken(BasicStreamReader.java:3701
> > ) at
> > com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:
> > 3 649) ... 26 more

-- 
Markus Jelsma - CTO - Openindex
http://www.linkedin.com/in/markus17
050-8536620 / 06-50258350

Re: Solr 3.1 indexing error Invalid UTF-8 character 0xffff

Posted by Markus Jelsma <ma...@openindex.io>.
Hi all, thanks for your comments. I seem to have fixed it now by simply
stripping away all noncharacter code points [1]: I iterate over the
individual chars and check each of them against:

if (ch % 0x10000 != 0xffff || ch % 0x10000 != 0xfffe || (ch <= 0xfdd0 && ch >= 
0xfdef)) { pass; }

Comments? 

[1]: http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:Noncharacter_Code_Point=True:]

On Monday 27 June 2011 12:40:16 Markus Jelsma wrote:
> Hi,
> 
> I came across the indexing error below. It happened in a huge batch update
> from Nutch with SolrJ 3.1. Since the crawl was huge it is very hard to
> trace the error back to a specific document. So i try my luck here: anyone
> seen this before with SolrJ 3.1? Anything else on the Nutch part i should
> have taken care off?
> 
> Thanks!
> 
> 
> Jun 27, 2011 10:24:28 AM org.apache.solr.core.SolrCore execute
> INFO: [] webapp=/solr path=/update params={wt=javabin&version=2} status=500
> QTime=423 Jun 27, 2011 10:24:28 AM org.apache.solr.common.SolrException
> log
> SEVERE: java.lang.RuntimeException: [was class
> java.io.CharConversionException] Invalid UTF-8 character 0xffff at char
> #1142033, byte #1155068) at
> com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:1
> 8) at com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731)
> at
> com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3
> 657) at
> com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809) at
> org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:287) at
> org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:146) at
> org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77) at
> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(Content
> StreamHandlerBase.java:67) at
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBas
> e.java:129) at org.apache.solr.core.SolrCore.execute(SolrCore.java:1368) at
> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java
> :356) at orJun 27, 2011 10:24:28 AM org.apache.solr.core.SolrCore execute
> INFO: [] webapp=/solr path=/update params={wt=javabin&version=2}
> status=500 QTime=423 Jun 27, 2011 10:24:28 AM
> org.apache.solr.common.SolrException log
> SEVERE: java.lang.RuntimeException: [was class
> java.io.CharConversionException] Invalid UTF-8 character 0xffff at char
> #1142033, byte #1155068) at
> com.ctc.wstx.util.ExceptionUtil.throwRuntimeException(ExceptionUtil.java:1
> 8) at com.ctc.wstx.sr.StreamScanner.throwLazyError(StreamScanner.java:731)
> at
> com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3
> 657) at
> com.ctc.wstx.sr.BasicStreamReader.getText(BasicStreamReader.java:809) at
> org.apache.solr.handler.XMLLoader.readDoc(XMLLoader.java:287) at
> org.apache.solr.handler.XMLLoader.processUpdate(XMLLoader.java:146) at
> org.apache.solr.handler.XMLLoader.load(XMLLoader.java:77) at
> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(Content
> StreamHandlerBase.java:67) at
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBas
> e.java:129) at org.apache.solr.core.SolrCore.execute(SolrCore.java:1368) at
> org.apache.solr.servlet.SolrDispatchFilter.execute(SolrDispatchFilter.java
> :356) at
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.jav
> a:252) at
> org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandl
> er.java:1212) at
> org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:399)
> at
> org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:216
> ) at
> org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182)
> at
> org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:766)
> at org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:450)
> at
> org.mortbay.jetty.handler.ContextHandlerCollection.handle(ContextHandlerCo
> llection.java:230) at
> org.mortbay.jetty.handler.HandlerCollection.handle(HandlerCollection.java:
> 114) at
> org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)
> at org.mortbay.jetty.Server.handle(Server.java:326)
>         at
> org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542) at
> org.mortbay.jetty.HttpConnection$RequestHandler.content(HttpConnection.jav
> a:945) at org.mortbay.jetty.HttpParser.parseNext(HttpParser.java:843) at
> org.mortbay.jetty.HttpParser.parseAvailable(HttpParser.java:218) at
> org.mortbay.jetty.HttpConnection.handle(HttpConnection.java:404) at
> org.mortbay.jetty.bio.SocketConnector$Connection.run(SocketConnector.java:
> 228) at
> org.mortbay.thread.QueuedThreadPool$PoolThread.run(QueuedThreadPool.java:5
> 82) Caused by: java.io.CharConversionException: Invalid UTF-8 character
> 0xffff at char #1142033, byte #1155068) at
> com.ctc.wstx.io.UTF8Reader.reportInvalid(UTF8Reader.java:335) at
> com.ctc.wstx.io.UTF8Reader.read(UTF8Reader.java:249)
>         at com.ctc.wstx.io.MergedReader.read(MergedReader.java:101)
>         at com.ctc.wstx.io.ReaderSource.readInto(ReaderSource.java:84)
>         at
> com.ctc.wstx.io.BranchingReaderSource.readInto(BranchingReaderSource.java:
> 57) at com.ctc.wstx.sr.StreamScanner.loadMore(StreamScanner.java:992) at
> com.ctc.wstx.sr.BasicStreamReader.readTextSecondary(BasicStreamReader.java
> :4628) at
> com.ctc.wstx.sr.BasicStreamReader.readCoalescedText(BasicStreamReader.java
> :4126) at
> com.ctc.wstx.sr.BasicStreamReader.finishToken(BasicStreamReader.java:3701)
> at
> com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3
> 649) ... 26
> moreg.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.j
> ava:252) at
> org.mortbay.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandl
> er.java:1212) at
> org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:399)
> at
> org.mortbay.jetty.security.SecurityHandler.handle(SecurityHandler.java:216
> ) at
> org.mortbay.jetty.servlet.SessionHandler.handle(SessionHandler.java:182)
> at
> org.mortbay.jetty.handler.ContextHandler.handle(ContextHandler.java:766)
> at org.mortbay.jetty.webapp.WebAppContext.handle(WebAppContext.java:450)
> at
> org.mortbay.jetty.handler.ContextHandlerCollection.handle(ContextHandlerCo
> llection.java:230) at
> org.mortbay.jetty.handler.HandlerCollection.handle(HandlerCollection.java:
> 114) at
> org.mortbay.jetty.handler.HandlerWrapper.handle(HandlerWrapper.java:152)
> at org.mortbay.jetty.Server.handle(Server.java:326)
>         at
> org.mortbay.jetty.HttpConnection.handleRequest(HttpConnection.java:542) at
> org.mortbay.jetty.HttpConnection$RequestHandler.content(HttpConnection.jav
> a:945) at org.mortbay.jetty.HttpParser.parseNext(HttpParser.java:843) at
> org.mortbay.jetty.HttpParser.parseAvailable(HttpParser.java:218) at
> org.mortbay.jetty.HttpConnection.handle(HttpConnection.java:404) at
> org.mortbay.jetty.bio.SocketConnector$Connection.run(SocketConnector.java:
> 228) at
> org.mortbay.thread.QueuedThreadPool$PoolThread.run(QueuedThreadPool.java:5
> 82) Caused by: java.io.CharConversionException: Invalid UTF-8 character
> 0xffff at char #1142033, byte #1155068) at
> com.ctc.wstx.io.UTF8Reader.reportInvalid(UTF8Reader.java:335) at
> com.ctc.wstx.io.UTF8Reader.read(UTF8Reader.java:249)
>         at com.ctc.wstx.io.MergedReader.read(MergedReader.java:101)
>         at com.ctc.wstx.io.ReaderSource.readInto(ReaderSource.java:84)
>         at
> com.ctc.wstx.io.BranchingReaderSource.readInto(BranchingReaderSource.java:
> 57) at com.ctc.wstx.sr.StreamScanner.loadMore(StreamScanner.java:992) at
> com.ctc.wstx.sr.BasicStreamReader.readTextSecondary(BasicStreamReader.java
> :4628) at
> com.ctc.wstx.sr.BasicStreamReader.readCoalescedText(BasicStreamReader.java
> :4126) at
> com.ctc.wstx.sr.BasicStreamReader.finishToken(BasicStreamReader.java:3701)
> at
> com.ctc.wstx.sr.BasicStreamReader.safeFinishToken(BasicStreamReader.java:3
> 649) ... 26 more

-- 
Markus Jelsma - CTO - Openindex
http://www.linkedin.com/in/markus17
050-8536620 / 06-50258350