You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by Brian Whitman <br...@variogr.am> on 2007/05/07 17:05:22 UTC

UTF-8 problem with Resin

Using resin 3.0.23 with a trunk solr war I am having a problem adding  
documents with utf-8 characters, including the utf8-example in  
exampledocs.

The document simply doesn't get added to Solr. Flat ascii documents  
work fine as does all non-update stuff.

To reproduce:
install resin 3 and set up solr according to the wiki for resin.
./post.sh utf8-example.xml

I also have a real world document that doesn't work (from our nutch  
crawls):
wget http://variogr.am/badfile.txt
./post.sh badfile.txt

I get this in my resin logs.

[10:53:10.834] java.io.CharConversionException: illegal utf8 encoding  
at 0xc3, a
[10:53:10.834]  at com.caucho.vfs.i18n.UTF8Reader.read 
(UTF8Reader.java:97)
[10:53:10.834]  at com.caucho.vfs.i18n.UTF8Reader.read 
(UTF8Reader.java:178)
[10:53:10.834]  at com.caucho.vfs.ReadStream.read(ReadStream.java:499)
[10:53:10.834]  at com.caucho.vfs.BufferedReaderAdapter.read 
(BufferedReaderAdapter.java:64)
[10:53:10.834]  at org.xmlpull.mxp1.MXParser.fillBuf(MXParser.java:2972)
[10:53:10.834]  at org.xmlpull.mxp1.MXParser.more(MXParser.java:3026)
[10:53:10.834]  at org.xmlpull.mxp1.MXParser.parseProlog 
(MXParser.java:1410)
[10:53:10.834]  at org.xmlpull.mxp1.MXParser.nextImpl(MXParser.java: 
1395)
[10:53:10.834]  at org.xmlpull.mxp1.MXParser.next(MXParser.java:1093)
[10:53:10.834]  at org.xmlpull.mxp1.MXParser.nextTag(MXParser.java:1078)
[10:53:10.834]  at  
org.apache.solr.handler.XmlUpdateRequestHandler.update 
(XmlUpdateRequestHandler.java:111)
[10:53:10.834]  at  
org.apache.solr.handler.XmlUpdateRequestHandler.handleRequestBody 
(XmlUpdateRequestHandler.java:84)
[10:53:10.834]  at  
org.apache.solr.handler.RequestHandlerBase.handleRequest 
(RequestHandlerBase.java:77)
[10:53:10.834]  at org.apache.solr.core.SolrCore.execute 
(SolrCore.java:671)
[10:53:10.834]  at org.apache.solr.servlet.SolrDispatchFilter.execute 
(SolrDispatchFilter.java:188)
[10:53:10.834]  at org.apache.solr.servlet.SolrDispatchFilter.doFilter 
(SolrDispatchFilter.java:156)
[10:53:10.834]  at  
com.caucho.server.dispatch.FilterFilterChain.doFilter 
(FilterFilterChain.java:70)
[10:53:10.834]  at com.caucho.server.webapp.WebAppFilterChain.doFilter 
(WebAppFilterChain.java:173)
[10:53:10.834]  at  
com.caucho.server.dispatch.ServletInvocation.service 
(ServletInvocation.java:229)
[10:53:10.834]  at com.caucho.server.http.HttpRequest.handleRequest 
(HttpRequest.java:274)
[10:53:10.834]  at com.caucho.server.port.TcpConnection.run 
(TcpConnection.java:511)
[10:53:10.834]  at com.caucho.util.ThreadPool.runTasks 
(ThreadPool.java:520)
[10:53:10.834]  at com.caucho.util.ThreadPool.run(ThreadPool.java:442)
[10:53:10.834]  at java.lang.Thread.run(Thread.java:619)
[10:53:10.834]
[10:53:10.835] /update  0 2
[10:53:10.836] java.io.CharConversionException: illegal utf8 encoding  
at 0xc3, a
[10:53:10.836]  at com.caucho.vfs.i18n.UTF8Reader.read 
(UTF8Reader.java:97)
[10:53:10.836]  at com.caucho.vfs.i18n.UTF8Reader.read 
(UTF8Reader.java:178)
[10:53:10.836]  at com.caucho.vfs.ReadStream.read(ReadStream.java:499)
[10:53:10.836]  at com.caucho.vfs.BufferedReaderAdapter.read 
(BufferedReaderAdapter.java:64)
[10:53:10.836]  at org.xmlpull.mxp1.MXParser.fillBuf(MXParser.java:2972)
[10:53:10.836]  at org.xmlpull.mxp1.MXParser.more(MXParser.java:3026)
[10:53:10.836]  at org.xmlpull.mxp1.MXParser.parseProlog 
(MXParser.java:1410)
[10:53:10.836]  at org.xmlpull.mxp1.MXParser.nextImpl(MXParser.java: 
1395)
[10:53:10.836]  at org.xmlpull.mxp1.MXParser.next(MXParser.java:1093)
[10:53:10.836]  at org.xmlpull.mxp1.MXParser.nextTag(MXParser.java:1078)
[10:53:10.836]  at  
org.apache.solr.handler.XmlUpdateRequestHandler.update 
(XmlUpdateRequestHandler.java:111)
[10:53:10.836]  at  
org.apache.solr.handler.XmlUpdateRequestHandler.handleRequestBody 
(XmlUpdateRequestHandler.java:84)
[10:53:10.836]  at  
org.apache.solr.handler.RequestHandlerBase.handleRequest 
(RequestHandlerBase.java:77)
[10:53:10.836]  at org.apache.solr.core.SolrCore.execute 
(SolrCore.java:671)
[10:53:10.836]  at org.apache.solr.servlet.SolrDispatchFilter.execute 
(SolrDispatchFilter.java:188)
[10:53:10.836]  at org.apache.solr.servlet.SolrDispatchFilter.doFilter 
(SolrDispatchFilter.java:156)
[10:53:10.836]  at  
com.caucho.server.dispatch.FilterFilterChain.doFilter 
(FilterFilterChain.java:70)
[10:53:10.836]  at com.caucho.server.webapp.WebAppFilterChain.doFilter 
(WebAppFilterChain.java:173)
[10:53:10.836]  at  
com.caucho.server.dispatch.ServletInvocation.service 
(ServletInvocation.java:229)
[10:53:10.836]  at com.caucho.server.http.HttpRequest.handleRequest 
(HttpRequest.java:274)
[10:53:10.836]  at com.caucho.server.port.TcpConnection.run 
(TcpConnection.java:511)
[10:53:10.836]  at com.caucho.util.ThreadPool.runTasks 
(ThreadPool.java:520)
[10:53:10.836]  at com.caucho.util.ThreadPool.run(ThreadPool.java:442)
[10:53:10.836]  at java.lang.Thread.run(Thread.java:619)
[10:53:10.836]
[10:53:10.837] [2] HTTP/1.1 500 illegal utf8 encoding at 0xc3, a
[10:53:10.837]
[10:53:10.837] java.io.CharConversionException: illegal utf8 encoding  
at 0xc3, a
[10:53:10.837]  at com.caucho.vfs.i18n.UTF8Reader.read 
(UTF8Reader.java:97)
[10:53:10.837]  at com.caucho.vfs.i18n.UTF8Reader.read 
(UTF8Reader.java:178)
[10:53:10.837]  at com.caucho.vfs.ReadStream.read(ReadStream.java:499)
[10:53:10.837]  at com.caucho.vfs.BufferedReaderAdapter.read 
(BufferedReaderAdapter.java:64)
[10:53:10.837]  at org.xmlpull.mxp1.MXParser.fillBuf(MXParser.java:2972)
[10:53:10.837]  at org.xmlpull.mxp1.MXParser.more(MXParser.java:3026)
[10:53:10.837]  at org.xmlpull.mxp1.MXParser.parseProlog 
(MXParser.java:1410)
[10:53:10.837]  at org.xmlpull.mxp1.MXParser.nextImpl(MXParser.java: 
1395)
[10:53:10.837]  at org.xmlpull.mxp1.MXParser.next(MXParser.java:1093)
[10:53:10.837]  at org.xmlpull.mxp1.MXParser.nextTag(MXParser.java:1078)
[10:53:10.837]  at  
org.apache.solr.handler.XmlUpdateRequestHandler.update 
(XmlUpdateRequestHandler.java:111)
[10:53:10.837]  at  
org.apache.solr.handler.XmlUpdateRequestHandler.handleRequestBody 
(XmlUpdateRequestHandler.java:84)
[10:53:10.837]  at  
org.apache.solr.handler.RequestHandlerBase.handleRequest 
(RequestHandlerBase.java:77)
[10:53:10.837]  at org.apache.solr.core.SolrCore.execute 
(SolrCore.java:671)
[10:53:10.837]  at org.apache.solr.servlet.SolrDispatchFilter.execute 
(SolrDispatchFilter.java:188)
[10:53:10.837]  at org.apache.solr.servlet.SolrDispatchFilter.doFilter 
(SolrDispatchFilter.java:156)
[10:53:10.837]  at  
com.caucho.server.dispatch.FilterFilterChain.doFilter 
(FilterFilterChain.java:70)
[10:53:10.837]  at com.caucho.server.webapp.WebAppFilterChain.doFilter 
(WebAppFilterChain.java:173)
[10:53:10.837]  at  
com.caucho.server.dispatch.ServletInvocation.service 
(ServletInvocation.java:229)
[10:53:10.837]  at com.caucho.server.http.HttpRequest.handleRequest 
(HttpRequest.java:274)
[10:53:10.837]  at com.caucho.server.port.TcpConnection.run 
(TcpConnection.java:511)
[10:53:10.837]  at com.caucho.util.ThreadPool.runTasks 
(ThreadPool.java:520)
[10:53:10.837]  at com.caucho.util.ThreadPool.run(ThreadPool.java:442)
[10:53:10.837]  at java.lang.Thread.run(Thread.java:619)
[10:53:10.837]



--
http://variogr.am/
brian.whitman@variogr.am




Re: UTF-8 problem with Resin

Posted by Ken Krugler <kk...@transpac.com>.
>>>I also have a real world document that doesn't work (from our nutch crawls):
>>>wget http://variogr.am/badfile.txt
>>>./post.sh badfile.txt
>>
>>
>>A solr rock star advised me to try SOLR-214, which fixes the 
>>problem. Perhaps he'll illuminate us as to the reasons! But for now 
>>be careful with Resin.
>>
>
>I don't know about this "rock star" business!
>
>Brian's setup worked running trunk from ~1 month ago...  the major 
>character encoding change since then is to use the servlet 
>container's getReader() rather than construct it from the stream.
>
>The javadocs are clear that the servlet container needs to handle 
>the conversion...  but if that is causing problems in the newest 
>resin and tomcat, maybe solr should take care of it.

From my experience with Resin, you definitely want to be as explicit 
as you can about the character encoding.

-- Ken
-- 
Ken Krugler
Krugle, Inc.
+1 530-210-6378
"Find Code, Find Answers"

Re: UTF-8 problem with Resin

Posted by Koji Sekiguchi <ko...@r.email.ne.jp>.
The problem is gone. Thank you very much for handling it.

Koji

Ryan McKinley wrote:
> sorry.  I tested with something that did not duplicate the problem.
>
> update and try rev 536048.
>
>
> Koji Sekiguchi wrote:
>> Ryan,
>>
>> Thank you for committing SOLR-214, but we are still facing the 
>> garbled characters problem
>> under Tomcat 5.5.23.
>>
>> I checked the patch, but unfortunately, ContentStreamBase.getReader() 
>> works correctly only
>> when using stream.* parameters. Without stream.* parameters, 
>> contentType is null and
>> ContentStreamBase.getReader() uses system default encoding.
>>
>> Could you check it please?
>>
>> Best regards,
>>
>> Koji
>>
>>
>>
>
>


Re: UTF-8 problem with Resin

Posted by Ryan McKinley <ry...@gmail.com>.
sorry.  I tested with something that did not duplicate the problem.

update and try rev 536048.


Koji Sekiguchi wrote:
> Ryan,
> 
> Thank you for committing SOLR-214, but we are still facing the garbled 
> characters problem
> under Tomcat 5.5.23.
> 
> I checked the patch, but unfortunately, ContentStreamBase.getReader() 
> works correctly only
> when using stream.* parameters. Without stream.* parameters, contentType 
> is null and
> ContentStreamBase.getReader() uses system default encoding.
> 
> Could you check it please?
> 
> Best regards,
> 
> Koji
> 
> 
> 


Re: UTF-8 problem with Resin

Posted by Koji Sekiguchi <ko...@r.email.ne.jp>.
Ryan,

Thank you for committing SOLR-214, but we are still facing the garbled 
characters problem
under Tomcat 5.5.23.

I checked the patch, but unfortunately, ContentStreamBase.getReader() 
works correctly only
when using stream.* parameters. Without stream.* parameters, contentType 
is null and
ContentStreamBase.getReader() uses system default encoding.

Could you check it please?

Best regards,

Koji



Re: UTF-8 problem with Resin

Posted by Ryan McKinley <ry...@gmail.com>.
>>
>> I also have a real world document that doesn't work (from our nutch 
>> crawls):
>> wget http://variogr.am/badfile.txt
>> ./post.sh badfile.txt
> 
> 
> A solr rock star advised me to try SOLR-214, which fixes the problem. 
> Perhaps he'll illuminate us as to the reasons! But for now be careful 
> with Resin.
> 
> 

I don't know about this "rock star" business!

Brian's setup worked running trunk from ~1 month ago...  the major 
character encoding change since then is to use the servlet container's 
getReader() rather than construct it from the stream.

The javadocs are clear that the servlet container needs to handle the 
conversion...  but if that is causing problems in the newest resin and 
tomcat, maybe solr should take care of it.



Re: UTF-8 problem with Resin

Posted by Brian Whitman <br...@variogr.am>.
On May 7, 2007, at 11:05 AM, Brian Whitman wrote:
> Using resin 3.0.23 with a trunk solr war I am having a problem  
> adding documents with utf-8 characters, including the utf8-example  
> in exampledocs.
>
> The document simply doesn't get added to Solr. Flat ascii documents  
> work fine as does all non-update stuff.
>
> To reproduce:
> install resin 3 and set up solr according to the wiki for resin.
> ./post.sh utf8-example.xml
>
> I also have a real world document that doesn't work (from our nutch  
> crawls):
> wget http://variogr.am/badfile.txt
> ./post.sh badfile.txt


A solr rock star advised me to try SOLR-214, which fixes the problem.  
Perhaps he'll illuminate us as to the reasons! But for now be careful  
with Resin.


-Brian







> I get this in my resin logs.
>
> [10:53:10.834] java.io.CharConversionException: illegal utf8  
> encoding at 0xc3, a
> [10:53:10.834]  at com.caucho.vfs.i18n.UTF8Reader.read 
> (UTF8Reader.java:97)
> [10:53:10.834]  at com.caucho.vfs.i18n.UTF8Reader.read 
> (UTF8Reader.java:178)
> [10:53:10.834]  at com.caucho.vfs.ReadStream.read(ReadStream.java:499)
> [10:53:10.834]  at com.caucho.vfs.BufferedReaderAdapter.read 
> (BufferedReaderAdapter.java:64)
> [10:53:10.834]  at org.xmlpull.mxp1.MXParser.fillBuf(MXParser.java: 
> 2972)
> [10:53:10.834]  at org.xmlpull.mxp1.MXParser.more(MXParser.java:3026)
> [10:53:10.834]  at org.xmlpull.mxp1.MXParser.parseProlog 
> (MXParser.java:1410)
> [10:53:10.834]  at org.xmlpull.mxp1.MXParser.nextImpl(MXParser.java: 
> 1395)
> [10:53:10.834]  at org.xmlpull.mxp1.MXParser.next(MXParser.java:1093)
> [10:53:10.834]  at org.xmlpull.mxp1.MXParser.nextTag(MXParser.java: 
> 1078)
> [10:53:10.834]  at  
> org.apache.solr.handler.XmlUpdateRequestHandler.update 
> (XmlUpdateRequestHandler.java:111)
> [10:53:10.834]  at  
> org.apache.solr.handler.XmlUpdateRequestHandler.handleRequestBody 
> (XmlUpdateRequestHandler.java:84)
> [10:53:10.834]  at  
> org.apache.solr.handler.RequestHandlerBase.handleRequest 
> (RequestHandlerBase.java:77)
> [10:53:10.834]  at org.apache.solr.core.SolrCore.execute 
> (SolrCore.java:671)
> [10:53:10.834]  at  
> org.apache.solr.servlet.SolrDispatchFilter.execute 
> (SolrDispatchFilter.java:188)
> [10:53:10.834]  at  
> org.apache.solr.servlet.SolrDispatchFilter.doFilter 
> (SolrDispatchFilter.java:156)
> [10:53:10.834]  at  
> com.caucho.server.dispatch.FilterFilterChain.doFilter 
> (FilterFilterChain.java:70)
> [10:53:10.834]  at  
> com.caucho.server.webapp.WebAppFilterChain.doFilter 
> (WebAppFilterChain.java:173)
> [10:53:10.834]  at  
> com.caucho.server.dispatch.ServletInvocation.service 
> (ServletInvocation.java:229)
> [10:53:10.834]  at com.caucho.server.http.HttpRequest.handleRequest 
> (HttpRequest.java:274)
> [10:53:10.834]  at com.caucho.server.port.TcpConnection.run 
> (TcpConnection.java:511)
> [10:53:10.834]  at com.caucho.util.ThreadPool.runTasks 
> (ThreadPool.java:520)
> [10:53:10.834]  at com.caucho.util.ThreadPool.run(ThreadPool.java:442)
> [10:53:10.834]  at java.lang.Thread.run(Thread.java:619)
> [10:53:10.834]
> [10:53:10.835] /update  0 2
> [10:53:10.836] java.io.CharConversionException: illegal utf8  
> encoding at 0xc3, a
> [10:53:10.836]  at com.caucho.vfs.i18n.UTF8Reader.read 
> (UTF8Reader.java:97)
> [10:53:10.836]  at com.caucho.vfs.i18n.UTF8Reader.read 
> (UTF8Reader.java:178)
> [10:53:10.836]  at com.caucho.vfs.ReadStream.read(ReadStream.java:499)
> [10:53:10.836]  at com.caucho.vfs.BufferedReaderAdapter.read 
> (BufferedReaderAdapter.java:64)
> [10:53:10.836]  at org.xmlpull.mxp1.MXParser.fillBuf(MXParser.java: 
> 2972)
> [10:53:10.836]  at org.xmlpull.mxp1.MXParser.more(MXParser.java:3026)
> [10:53:10.836]  at org.xmlpull.mxp1.MXParser.parseProlog 
> (MXParser.java:1410)
> [10:53:10.836]  at org.xmlpull.mxp1.MXParser.nextImpl(MXParser.java: 
> 1395)
> [10:53:10.836]  at org.xmlpull.mxp1.MXParser.next(MXParser.java:1093)
> [10:53:10.836]  at org.xmlpull.mxp1.MXParser.nextTag(MXParser.java: 
> 1078)
> [10:53:10.836]  at  
> org.apache.solr.handler.XmlUpdateRequestHandler.update 
> (XmlUpdateRequestHandler.java:111)
> [10:53:10.836]  at  
> org.apache.solr.handler.XmlUpdateRequestHandler.handleRequestBody 
> (XmlUpdateRequestHandler.java:84)
> [10:53:10.836]  at  
> org.apache.solr.handler.RequestHandlerBase.handleRequest 
> (RequestHandlerBase.java:77)
> [10:53:10.836]  at org.apache.solr.core.SolrCore.execute 
> (SolrCore.java:671)
> [10:53:10.836]  at  
> org.apache.solr.servlet.SolrDispatchFilter.execute 
> (SolrDispatchFilter.java:188)
> [10:53:10.836]  at  
> org.apache.solr.servlet.SolrDispatchFilter.doFilter 
> (SolrDispatchFilter.java:156)
> [10:53:10.836]  at  
> com.caucho.server.dispatch.FilterFilterChain.doFilter 
> (FilterFilterChain.java:70)
> [10:53:10.836]  at  
> com.caucho.server.webapp.WebAppFilterChain.doFilter 
> (WebAppFilterChain.java:173)
> [10:53:10.836]  at  
> com.caucho.server.dispatch.ServletInvocation.service 
> (ServletInvocation.java:229)
> [10:53:10.836]  at com.caucho.server.http.HttpRequest.handleRequest 
> (HttpRequest.java:274)
> [10:53:10.836]  at com.caucho.server.port.TcpConnection.run 
> (TcpConnection.java:511)
> [10:53:10.836]  at com.caucho.util.ThreadPool.runTasks 
> (ThreadPool.java:520)
> [10:53:10.836]  at com.caucho.util.ThreadPool.run(ThreadPool.java:442)
> [10:53:10.836]  at java.lang.Thread.run(Thread.java:619)
> [10:53:10.836]
> [10:53:10.837] [2] HTTP/1.1 500 illegal utf8 encoding at 0xc3, a
> [10:53:10.837]
> [10:53:10.837] java.io.CharConversionException: illegal utf8  
> encoding at 0xc3, a
> [10:53:10.837]  at com.caucho.vfs.i18n.UTF8Reader.read 
> (UTF8Reader.java:97)
> [10:53:10.837]  at com.caucho.vfs.i18n.UTF8Reader.read 
> (UTF8Reader.java:178)
> [10:53:10.837]  at com.caucho.vfs.ReadStream.read(ReadStream.java:499)
> [10:53:10.837]  at com.caucho.vfs.BufferedReaderAdapter.read 
> (BufferedReaderAdapter.java:64)
> [10:53:10.837]  at org.xmlpull.mxp1.MXParser.fillBuf(MXParser.java: 
> 2972)
> [10:53:10.837]  at org.xmlpull.mxp1.MXParser.more(MXParser.java:3026)
> [10:53:10.837]  at org.xmlpull.mxp1.MXParser.parseProlog 
> (MXParser.java:1410)
> [10:53:10.837]  at org.xmlpull.mxp1.MXParser.nextImpl(MXParser.java: 
> 1395)
> [10:53:10.837]  at org.xmlpull.mxp1.MXParser.next(MXParser.java:1093)
> [10:53:10.837]  at org.xmlpull.mxp1.MXParser.nextTag(MXParser.java: 
> 1078)
> [10:53:10.837]  at  
> org.apache.solr.handler.XmlUpdateRequestHandler.update 
> (XmlUpdateRequestHandler.java:111)
> [10:53:10.837]  at  
> org.apache.solr.handler.XmlUpdateRequestHandler.handleRequestBody 
> (XmlUpdateRequestHandler.java:84)
> [10:53:10.837]  at  
> org.apache.solr.handler.RequestHandlerBase.handleRequest 
> (RequestHandlerBase.java:77)
> [10:53:10.837]  at org.apache.solr.core.SolrCore.execute 
> (SolrCore.java:671)
> [10:53:10.837]  at  
> org.apache.solr.servlet.SolrDispatchFilter.execute 
> (SolrDispatchFilter.java:188)
> [10:53:10.837]  at  
> org.apache.solr.servlet.SolrDispatchFilter.doFilter 
> (SolrDispatchFilter.java:156)
> [10:53:10.837]  at  
> com.caucho.server.dispatch.FilterFilterChain.doFilter 
> (FilterFilterChain.java:70)
> [10:53:10.837]  at  
> com.caucho.server.webapp.WebAppFilterChain.doFilter 
> (WebAppFilterChain.java:173)
> [10:53:10.837]  at  
> com.caucho.server.dispatch.ServletInvocation.service 
> (ServletInvocation.java:229)
> [10:53:10.837]  at com.caucho.server.http.HttpRequest.handleRequest 
> (HttpRequest.java:274)
> [10:53:10.837]  at com.caucho.server.port.TcpConnection.run 
> (TcpConnection.java:511)
> [10:53:10.837]  at com.caucho.util.ThreadPool.runTasks 
> (ThreadPool.java:520)
> [10:53:10.837]  at com.caucho.util.ThreadPool.run(ThreadPool.java:442)
> [10:53:10.837]  at java.lang.Thread.run(Thread.java:619)
> [10:53:10.837]
>
>
>
> --
> http://variogr.am/
> brian.whitman@variogr.am
>
>
>