You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-user@lucene.apache.org by Brian Whitman <br...@variogr.am> on 2007/05/07 17:05:22 UTC
UTF-8 problem with Resin
Using resin 3.0.23 with a trunk solr war I am having a problem adding
documents with utf-8 characters, including the utf8-example in
exampledocs.
The document simply doesn't get added to Solr. Flat ascii documents
work fine as does all non-update stuff.
To reproduce:
install resin 3 and set up solr according to the wiki for resin.
./post.sh utf8-example.xml
I also have a real world document that doesn't work (from our nutch
crawls):
wget http://variogr.am/badfile.txt
./post.sh badfile.txt
I get this in my resin logs.
[10:53:10.834] java.io.CharConversionException: illegal utf8 encoding
at 0xc3, a
[10:53:10.834] at com.caucho.vfs.i18n.UTF8Reader.read
(UTF8Reader.java:97)
[10:53:10.834] at com.caucho.vfs.i18n.UTF8Reader.read
(UTF8Reader.java:178)
[10:53:10.834] at com.caucho.vfs.ReadStream.read(ReadStream.java:499)
[10:53:10.834] at com.caucho.vfs.BufferedReaderAdapter.read
(BufferedReaderAdapter.java:64)
[10:53:10.834] at org.xmlpull.mxp1.MXParser.fillBuf(MXParser.java:2972)
[10:53:10.834] at org.xmlpull.mxp1.MXParser.more(MXParser.java:3026)
[10:53:10.834] at org.xmlpull.mxp1.MXParser.parseProlog
(MXParser.java:1410)
[10:53:10.834] at org.xmlpull.mxp1.MXParser.nextImpl(MXParser.java:
1395)
[10:53:10.834] at org.xmlpull.mxp1.MXParser.next(MXParser.java:1093)
[10:53:10.834] at org.xmlpull.mxp1.MXParser.nextTag(MXParser.java:1078)
[10:53:10.834] at
org.apache.solr.handler.XmlUpdateRequestHandler.update
(XmlUpdateRequestHandler.java:111)
[10:53:10.834] at
org.apache.solr.handler.XmlUpdateRequestHandler.handleRequestBody
(XmlUpdateRequestHandler.java:84)
[10:53:10.834] at
org.apache.solr.handler.RequestHandlerBase.handleRequest
(RequestHandlerBase.java:77)
[10:53:10.834] at org.apache.solr.core.SolrCore.execute
(SolrCore.java:671)
[10:53:10.834] at org.apache.solr.servlet.SolrDispatchFilter.execute
(SolrDispatchFilter.java:188)
[10:53:10.834] at org.apache.solr.servlet.SolrDispatchFilter.doFilter
(SolrDispatchFilter.java:156)
[10:53:10.834] at
com.caucho.server.dispatch.FilterFilterChain.doFilter
(FilterFilterChain.java:70)
[10:53:10.834] at com.caucho.server.webapp.WebAppFilterChain.doFilter
(WebAppFilterChain.java:173)
[10:53:10.834] at
com.caucho.server.dispatch.ServletInvocation.service
(ServletInvocation.java:229)
[10:53:10.834] at com.caucho.server.http.HttpRequest.handleRequest
(HttpRequest.java:274)
[10:53:10.834] at com.caucho.server.port.TcpConnection.run
(TcpConnection.java:511)
[10:53:10.834] at com.caucho.util.ThreadPool.runTasks
(ThreadPool.java:520)
[10:53:10.834] at com.caucho.util.ThreadPool.run(ThreadPool.java:442)
[10:53:10.834] at java.lang.Thread.run(Thread.java:619)
[10:53:10.834]
[10:53:10.835] /update 0 2
[10:53:10.836] java.io.CharConversionException: illegal utf8 encoding
at 0xc3, a
[10:53:10.836] at com.caucho.vfs.i18n.UTF8Reader.read
(UTF8Reader.java:97)
[10:53:10.836] at com.caucho.vfs.i18n.UTF8Reader.read
(UTF8Reader.java:178)
[10:53:10.836] at com.caucho.vfs.ReadStream.read(ReadStream.java:499)
[10:53:10.836] at com.caucho.vfs.BufferedReaderAdapter.read
(BufferedReaderAdapter.java:64)
[10:53:10.836] at org.xmlpull.mxp1.MXParser.fillBuf(MXParser.java:2972)
[10:53:10.836] at org.xmlpull.mxp1.MXParser.more(MXParser.java:3026)
[10:53:10.836] at org.xmlpull.mxp1.MXParser.parseProlog
(MXParser.java:1410)
[10:53:10.836] at org.xmlpull.mxp1.MXParser.nextImpl(MXParser.java:
1395)
[10:53:10.836] at org.xmlpull.mxp1.MXParser.next(MXParser.java:1093)
[10:53:10.836] at org.xmlpull.mxp1.MXParser.nextTag(MXParser.java:1078)
[10:53:10.836] at
org.apache.solr.handler.XmlUpdateRequestHandler.update
(XmlUpdateRequestHandler.java:111)
[10:53:10.836] at
org.apache.solr.handler.XmlUpdateRequestHandler.handleRequestBody
(XmlUpdateRequestHandler.java:84)
[10:53:10.836] at
org.apache.solr.handler.RequestHandlerBase.handleRequest
(RequestHandlerBase.java:77)
[10:53:10.836] at org.apache.solr.core.SolrCore.execute
(SolrCore.java:671)
[10:53:10.836] at org.apache.solr.servlet.SolrDispatchFilter.execute
(SolrDispatchFilter.java:188)
[10:53:10.836] at org.apache.solr.servlet.SolrDispatchFilter.doFilter
(SolrDispatchFilter.java:156)
[10:53:10.836] at
com.caucho.server.dispatch.FilterFilterChain.doFilter
(FilterFilterChain.java:70)
[10:53:10.836] at com.caucho.server.webapp.WebAppFilterChain.doFilter
(WebAppFilterChain.java:173)
[10:53:10.836] at
com.caucho.server.dispatch.ServletInvocation.service
(ServletInvocation.java:229)
[10:53:10.836] at com.caucho.server.http.HttpRequest.handleRequest
(HttpRequest.java:274)
[10:53:10.836] at com.caucho.server.port.TcpConnection.run
(TcpConnection.java:511)
[10:53:10.836] at com.caucho.util.ThreadPool.runTasks
(ThreadPool.java:520)
[10:53:10.836] at com.caucho.util.ThreadPool.run(ThreadPool.java:442)
[10:53:10.836] at java.lang.Thread.run(Thread.java:619)
[10:53:10.836]
[10:53:10.837] [2] HTTP/1.1 500 illegal utf8 encoding at 0xc3, a
[10:53:10.837]
[10:53:10.837] java.io.CharConversionException: illegal utf8 encoding
at 0xc3, a
[10:53:10.837] at com.caucho.vfs.i18n.UTF8Reader.read
(UTF8Reader.java:97)
[10:53:10.837] at com.caucho.vfs.i18n.UTF8Reader.read
(UTF8Reader.java:178)
[10:53:10.837] at com.caucho.vfs.ReadStream.read(ReadStream.java:499)
[10:53:10.837] at com.caucho.vfs.BufferedReaderAdapter.read
(BufferedReaderAdapter.java:64)
[10:53:10.837] at org.xmlpull.mxp1.MXParser.fillBuf(MXParser.java:2972)
[10:53:10.837] at org.xmlpull.mxp1.MXParser.more(MXParser.java:3026)
[10:53:10.837] at org.xmlpull.mxp1.MXParser.parseProlog
(MXParser.java:1410)
[10:53:10.837] at org.xmlpull.mxp1.MXParser.nextImpl(MXParser.java:
1395)
[10:53:10.837] at org.xmlpull.mxp1.MXParser.next(MXParser.java:1093)
[10:53:10.837] at org.xmlpull.mxp1.MXParser.nextTag(MXParser.java:1078)
[10:53:10.837] at
org.apache.solr.handler.XmlUpdateRequestHandler.update
(XmlUpdateRequestHandler.java:111)
[10:53:10.837] at
org.apache.solr.handler.XmlUpdateRequestHandler.handleRequestBody
(XmlUpdateRequestHandler.java:84)
[10:53:10.837] at
org.apache.solr.handler.RequestHandlerBase.handleRequest
(RequestHandlerBase.java:77)
[10:53:10.837] at org.apache.solr.core.SolrCore.execute
(SolrCore.java:671)
[10:53:10.837] at org.apache.solr.servlet.SolrDispatchFilter.execute
(SolrDispatchFilter.java:188)
[10:53:10.837] at org.apache.solr.servlet.SolrDispatchFilter.doFilter
(SolrDispatchFilter.java:156)
[10:53:10.837] at
com.caucho.server.dispatch.FilterFilterChain.doFilter
(FilterFilterChain.java:70)
[10:53:10.837] at com.caucho.server.webapp.WebAppFilterChain.doFilter
(WebAppFilterChain.java:173)
[10:53:10.837] at
com.caucho.server.dispatch.ServletInvocation.service
(ServletInvocation.java:229)
[10:53:10.837] at com.caucho.server.http.HttpRequest.handleRequest
(HttpRequest.java:274)
[10:53:10.837] at com.caucho.server.port.TcpConnection.run
(TcpConnection.java:511)
[10:53:10.837] at com.caucho.util.ThreadPool.runTasks
(ThreadPool.java:520)
[10:53:10.837] at com.caucho.util.ThreadPool.run(ThreadPool.java:442)
[10:53:10.837] at java.lang.Thread.run(Thread.java:619)
[10:53:10.837]
--
http://variogr.am/
brian.whitman@variogr.am
Re: UTF-8 problem with Resin
Posted by Ken Krugler <kk...@transpac.com>.
>>>I also have a real world document that doesn't work (from our nutch crawls):
>>>wget http://variogr.am/badfile.txt
>>>./post.sh badfile.txt
>>
>>
>>A solr rock star advised me to try SOLR-214, which fixes the
>>problem. Perhaps he'll illuminate us as to the reasons! But for now
>>be careful with Resin.
>>
>
>I don't know about this "rock star" business!
>
>Brian's setup worked running trunk from ~1 month ago... the major
>character encoding change since then is to use the servlet
>container's getReader() rather than construct it from the stream.
>
>The javadocs are clear that the servlet container needs to handle
>the conversion... but if that is causing problems in the newest
>resin and tomcat, maybe solr should take care of it.
From my experience with Resin, you definitely want to be as explicit
as you can about the character encoding.
-- Ken
--
Ken Krugler
Krugle, Inc.
+1 530-210-6378
"Find Code, Find Answers"
Re: UTF-8 problem with Resin
Posted by Koji Sekiguchi <ko...@r.email.ne.jp>.
The problem is gone. Thank you very much for handling it.
Koji
Ryan McKinley wrote:
> sorry. I tested with something that did not duplicate the problem.
>
> update and try rev 536048.
>
>
> Koji Sekiguchi wrote:
>> Ryan,
>>
>> Thank you for committing SOLR-214, but we are still facing the
>> garbled characters problem
>> under Tomcat 5.5.23.
>>
>> I checked the patch, but unfortunately, ContentStreamBase.getReader()
>> works correctly
>> when using stream.* parameters. Without stream.* parameters,
>> contentType is null and
>> ContentStreamBase.getReader() uses system default encoding.
>>
>> Could you check it please?
>>
>> Best regards,
>>
>> Koji
>>
>>
>>
>
>
Re: UTF-8 problem with Resin
Posted by Ryan McKinley <ry...@gmail.com>.
sorry. I tested with something that did not duplicate the problem.
update and try rev 536048.
Koji Sekiguchi wrote:
> Ryan,
>
> Thank you for committing SOLR-214, but we are still facing the garbled
> characters problem
> under Tomcat 5.5.23.
>
> I checked the patch, but unfortunately, ContentStreamBase.getReader()
> works correctly
> when using stream.* parameters. Without stream.* parameters, contentType
> is null and
> ContentStreamBase.getReader() uses system default encoding.
>
> Could you check it please?
>
> Best regards,
>
> Koji
>
>
>
Re: UTF-8 problem with Resin
Posted by Koji Sekiguchi <ko...@r.email.ne.jp>.
Ryan,
Thank you for committing SOLR-214, but we are still facing the garbled
characters problem
under Tomcat 5.5.23.
I checked the patch, but unfortunately, ContentStreamBase.getReader()
works correctly only
when using stream.* parameters. Without stream.* parameters, contentType
is null and
ContentStreamBase.getReader() uses system default encoding.
Could you check it please?
Best regards,
Koji
Re: UTF-8 problem with Resin
Posted by Ryan McKinley <ry...@gmail.com>.
>>
>> I also have a real world document that doesn't work (from our nutch
>> crawls):
>> wget http://variogr.am/badfile.txt
>> ./post.sh badfile.txt
>
>
> A solr rock star advised me to try SOLR-214, which fixes the problem.
> Perhaps he'll illuminate us as to the reasons! But for now be careful
> with Resin.
>
>
I don't know about this "rock star" business!
Brian's setup worked running trunk from ~1 month ago... the major
character encoding change since then is to use the servlet container's
getReader() rather than construct it from the stream.
The javadocs are clear that the servlet container needs to handle the
conversion... but if that is causing problems in the newest resin and
tomcat, maybe solr should take care of it.
Re: UTF-8 problem with Resin
Posted by Brian Whitman <br...@variogr.am>.
On May 7, 2007, at 11:05 AM, Brian Whitman wrote:
> Using resin 3.0.23 with a trunk solr war I am having a problem
> adding documents with utf-8 characters, including the utf8-example
> in exampledocs.
>
> The document simply doesn't get added to Solr. Flat ascii documents
> work fine as does all non-update stuff.
>
> To reproduce:
> install resin 3 and set up solr according to the wiki for resin.
> ./post.sh utf8-example.xml
>
> I also have a real world document that doesn't work (from our nutch
> crawls):
> wget http://variogr.am/badfile.txt
> ./post.sh badfile.txt
A solr rock star advised me to try SOLR-214, which fixes the problem.
Perhaps he'll illuminate us as to the reasons! But for now be careful
with Resin.
-Brian
> I get this in my resin logs.
>
> [10:53:10.834] java.io.CharConversionException: illegal utf8
> encoding at 0xc3, a
> [10:53:10.834] at com.caucho.vfs.i18n.UTF8Reader.read
> (UTF8Reader.java:97)
> [10:53:10.834] at com.caucho.vfs.i18n.UTF8Reader.read
> (UTF8Reader.java:178)
> [10:53:10.834] at com.caucho.vfs.ReadStream.read(ReadStream.java:499)
> [10:53:10.834] at com.caucho.vfs.BufferedReaderAdapter.read
> (BufferedReaderAdapter.java:64)
> [10:53:10.834] at org.xmlpull.mxp1.MXParser.fillBuf(MXParser.java:
> 2972)
> [10:53:10.834] at org.xmlpull.mxp1.MXParser.more(MXParser.java:3026)
> [10:53:10.834] at org.xmlpull.mxp1.MXParser.parseProlog
> (MXParser.java:1410)
> [10:53:10.834] at org.xmlpull.mxp1.MXParser.nextImpl(MXParser.java:
> 1395)
> [10:53:10.834] at org.xmlpull.mxp1.MXParser.next(MXParser.java:1093)
> [10:53:10.834] at org.xmlpull.mxp1.MXParser.nextTag(MXParser.java:
> 1078)
> [10:53:10.834] at
> org.apache.solr.handler.XmlUpdateRequestHandler.update
> (XmlUpdateRequestHandler.java:111)
> [10:53:10.834] at
> org.apache.solr.handler.XmlUpdateRequestHandler.handleRequestBody
> (XmlUpdateRequestHandler.java:84)
> [10:53:10.834] at
> org.apache.solr.handler.RequestHandlerBase.handleRequest
> (RequestHandlerBase.java:77)
> [10:53:10.834] at org.apache.solr.core.SolrCore.execute
> (SolrCore.java:671)
> [10:53:10.834] at
> org.apache.solr.servlet.SolrDispatchFilter.execute
> (SolrDispatchFilter.java:188)
> [10:53:10.834] at
> org.apache.solr.servlet.SolrDispatchFilter.doFilter
> (SolrDispatchFilter.java:156)
> [10:53:10.834] at
> com.caucho.server.dispatch.FilterFilterChain.doFilter
> (FilterFilterChain.java:70)
> [10:53:10.834] at
> com.caucho.server.webapp.WebAppFilterChain.doFilter
> (WebAppFilterChain.java:173)
> [10:53:10.834] at
> com.caucho.server.dispatch.ServletInvocation.service
> (ServletInvocation.java:229)
> [10:53:10.834] at com.caucho.server.http.HttpRequest.handleRequest
> (HttpRequest.java:274)
> [10:53:10.834] at com.caucho.server.port.TcpConnection.run
> (TcpConnection.java:511)
> [10:53:10.834] at com.caucho.util.ThreadPool.runTasks
> (ThreadPool.java:520)
> [10:53:10.834] at com.caucho.util.ThreadPool.run(ThreadPool.java:442)
> [10:53:10.834] at java.lang.Thread.run(Thread.java:619)
> [10:53:10.834]
> [10:53:10.835] /update 0 2
> [10:53:10.836] java.io.CharConversionException: illegal utf8
> encoding at 0xc3, a
> [10:53:10.836] at com.caucho.vfs.i18n.UTF8Reader.read
> (UTF8Reader.java:97)
> [10:53:10.836] at com.caucho.vfs.i18n.UTF8Reader.read
> (UTF8Reader.java:178)
> [10:53:10.836] at com.caucho.vfs.ReadStream.read(ReadStream.java:499)
> [10:53:10.836] at com.caucho.vfs.BufferedReaderAdapter.read
> (BufferedReaderAdapter.java:64)
> [10:53:10.836] at org.xmlpull.mxp1.MXParser.fillBuf(MXParser.java:
> 2972)
> [10:53:10.836] at org.xmlpull.mxp1.MXParser.more(MXParser.java:3026)
> [10:53:10.836] at org.xmlpull.mxp1.MXParser.parseProlog
> (MXParser.java:1410)
> [10:53:10.836] at org.xmlpull.mxp1.MXParser.nextImpl(MXParser.java:
> 1395)
> [10:53:10.836] at org.xmlpull.mxp1.MXParser.next(MXParser.java:1093)
> [10:53:10.836] at org.xmlpull.mxp1.MXParser.nextTag(MXParser.java:
> 1078)
> [10:53:10.836] at
> org.apache.solr.handler.XmlUpdateRequestHandler.update
> (XmlUpdateRequestHandler.java:111)
> [10:53:10.836] at
> org.apache.solr.handler.XmlUpdateRequestHandler.handleRequestBody
> (XmlUpdateRequestHandler.java:84)
> [10:53:10.836] at
> org.apache.solr.handler.RequestHandlerBase.handleRequest
> (RequestHandlerBase.java:77)
> [10:53:10.836] at org.apache.solr.core.SolrCore.execute
> (SolrCore.java:671)
> [10:53:10.836] at
> org.apache.solr.servlet.SolrDispatchFilter.execute
> (SolrDispatchFilter.java:188)
> [10:53:10.836] at
> org.apache.solr.servlet.SolrDispatchFilter.doFilter
> (SolrDispatchFilter.java:156)
> [10:53:10.836] at
> com.caucho.server.dispatch.FilterFilterChain.doFilter
> (FilterFilterChain.java:70)
> [10:53:10.836] at
> com.caucho.server.webapp.WebAppFilterChain.doFilter
> (WebAppFilterChain.java:173)
> [10:53:10.836] at
> com.caucho.server.dispatch.ServletInvocation.service
> (ServletInvocation.java:229)
> [10:53:10.836] at com.caucho.server.http.HttpRequest.handleRequest
> (HttpRequest.java:274)
> [10:53:10.836] at com.caucho.server.port.TcpConnection.run
> (TcpConnection.java:511)
> [10:53:10.836] at com.caucho.util.ThreadPool.runTasks
> (ThreadPool.java:520)
> [10:53:10.836] at com.caucho.util.ThreadPool.run(ThreadPool.java:442)
> [10:53:10.836] at java.lang.Thread.run(Thread.java:619)
> [10:53:10.836]
> [10:53:10.837] [2] HTTP/1.1 500 illegal utf8 encoding at 0xc3, a
> [10:53:10.837]
> [10:53:10.837] java.io.CharConversionException: illegal utf8
> encoding at 0xc3, a
> [10:53:10.837] at com.caucho.vfs.i18n.UTF8Reader.read
> (UTF8Reader.java:97)
> [10:53:10.837] at com.caucho.vfs.i18n.UTF8Reader.read
> (UTF8Reader.java:178)
> [10:53:10.837] at com.caucho.vfs.ReadStream.read(ReadStream.java:499)
> [10:53:10.837] at com.caucho.vfs.BufferedReaderAdapter.read
> (BufferedReaderAdapter.java:64)
> [10:53:10.837] at org.xmlpull.mxp1.MXParser.fillBuf(MXParser.java:
> 2972)
> [10:53:10.837] at org.xmlpull.mxp1.MXParser.more(MXParser.java:3026)
> [10:53:10.837] at org.xmlpull.mxp1.MXParser.parseProlog
> (MXParser.java:1410)
> [10:53:10.837] at org.xmlpull.mxp1.MXParser.nextImpl(MXParser.java:
> 1395)
> [10:53:10.837] at org.xmlpull.mxp1.MXParser.next(MXParser.java:1093)
> [10:53:10.837] at org.xmlpull.mxp1.MXParser.nextTag(MXParser.java:
> 1078)
> [10:53:10.837] at
> org.apache.solr.handler.XmlUpdateRequestHandler.update
> (XmlUpdateRequestHandler.java:111)
> [10:53:10.837] at
> org.apache.solr.handler.XmlUpdateRequestHandler.handleRequestBody
> (XmlUpdateRequestHandler.java:84)
> [10:53:10.837] at
> org.apache.solr.handler.RequestHandlerBase.handleRequest
> (RequestHandlerBase.java:77)
> [10:53:10.837] at org.apache.solr.core.SolrCore.execute
> (SolrCore.java:671)
> [10:53:10.837] at
> org.apache.solr.servlet.SolrDispatchFilter.execute
> (SolrDispatchFilter.java:188)
> [10:53:10.837] at
> org.apache.solr.servlet.SolrDispatchFilter.doFilter
> (SolrDispatchFilter.java:156)
> [10:53:10.837] at
> com.caucho.server.dispatch.FilterFilterChain.doFilter
> (FilterFilterChain.java:70)
> [10:53:10.837] at
> com.caucho.server.webapp.WebAppFilterChain.doFilter
> (WebAppFilterChain.java:173)
> [10:53:10.837] at
> com.caucho.server.dispatch.ServletInvocation.service
> (ServletInvocation.java:229)
> [10:53:10.837] at com.caucho.server.http.HttpRequest.handleRequest
> (HttpRequest.java:274)
> [10:53:10.837] at com.caucho.server.port.TcpConnection.run
> (TcpConnection.java:511)
> [10:53:10.837] at com.caucho.util.ThreadPool.runTasks
> (ThreadPool.java:520)
> [10:53:10.837] at com.caucho.util.ThreadPool.run(ThreadPool.java:442)
> [10:53:10.837] at java.lang.Thread.run(Thread.java:619)
> [10:53:10.837]
>
>
>
> --
> http://variogr.am/
> brian.whitman@variogr.am
>
>
>