You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@tomcat.apache.org by co...@apache.org on 2001/03/10 19:18:43 UTC
cvs commit: jakarta-tomcat/src/share/org/apache/tomcat/util/buf ByteChunk.java
costin 01/03/10 10:18:43
Modified: src/share/org/apache/tomcat/util/buf ByteChunk.java
Log:
First attempt to fix the UTF decoding bug.
This doesn't change the behavior - what worked before should work the
same ( i.e. bytes<0x80 ). For chars >= 0x80 ( that didn't work with
the previous code ) I cut&pasted code from xerces.
The idea is to avoid using the (memory expensive) String( bytes[] ),
which proved to be a performance problem in many cases.
Xerces is doing exactly the same thing to optimize the conversion.
This works for UTF8 - for all other encodings the slow method is used.
There is another way to resolve the problem ( a trick similar to
OutputBuffer ), and we may implement it someday, but the hope is that
people will start using unicode as a clean and simple solution for
non-ascii charsets.
Netscape and IIS and most platforms already support UTF. ( that doesn't mean
we shouldn't deal with the other encodings )
( the code is untested - I need UTF examples, adding files in
various encodings with "strange" names to the test suite is still on the
todo list )
Revision Changes Path
1.2 +85 -4 jakarta-tomcat/src/share/org/apache/tomcat/util/buf/ByteChunk.java
Index: ByteChunk.java
===================================================================
RCS file: /home/cvs/jakarta-tomcat/src/share/org/apache/tomcat/util/buf/ByteChunk.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -r1.1 -r1.2
--- ByteChunk.java 2001/02/20 03:12:13 1.1
+++ ByteChunk.java 2001/03/10 18:18:43 1.2
@@ -174,13 +174,94 @@
conversionBuff=new char[bytesLen];
}
+ // int j=bytesOff;
+ // for( int i=0; i< bytesLen; i++ ) {
+ // conversionBuff[i]=(char)bytes[j++];
+ // }
+ int charsLen=byte2charUTF8();
+ if( charsLen==-1) return null; // or throw exception
+
+ return new String( conversionBuff, 0, charsLen);
+ }
+
+
+ private int byte2charUTF8() {
+ if( conversionBuff==null || bytesLen > conversionBuff.length ) {
+ conversionBuff=new char[bytesLen];
+ }
+ // source: xerces' UTF8Reader.copyMultiByteCharData()
int j=bytesOff;
- for( int i=0; i< bytesLen; i++ ) {
- conversionBuff[i]=(char)bytes[j++];
+ int charOff=0;
+ int end=j+bytesLen;
+ while( j< end ) {
+ int b0=(int)bytes[j];
+
+ if( b0 < 0x80 ) {
+ conversionBuff[charOff]=(char)b0;
+ charOff++;
+ j++;
+ continue;
+ }
+ // 2 byte ?
+ if( j++ >= end ) {
+ // ok, just ignore - we could throw exception
+ return -1;
+ }
+ int b1=(int)bytes[j];
+
+ // ok, let's the fun begin - we're handling UTF8
+ if ((0xe0 & b0) == 0xc0) { // 110yyyyy 10xxxxxx (0x80 to 0x7ff)
+ int ch = ((0x1f & b0)<<6) + (0x3f & b1);
+ conversionBuff[charOff]=(char)ch;
+ charOff++;
+ continue;
+ }
+
+ if( j++ >= end )
+ return -1;
+ int b2=(int)bytes[j];
+
+ if( (b0 & 0xf0 ) == 0xe0 ) {
+ if ((b0 == 0xED && b1 >= 0xA0) ||
+ (b0 == 0xEF && b1 == 0xBF && b2 >= 0xBE)) {
+ return -1;
+ }
+
+ int ch = ((0x0f & b0)<<12) + ((0x3f & b1)<<6) + (0x3f & b2);
+ conversionBuff[charOff]=(char)ch;
+ charOff++;
+ continue;
+ }
+
+ if( j++ >= end )
+ return -1;
+ int b3=(int)bytes[j];
+
+ if (( 0xf8 & b0 ) == 0xf0 ) {
+ if (b0 > 0xF4 || (b0 == 0xF4 && b1 >= 0x90))
+ return -1;
+ int ch = ((0x0f & b0)<<18) + ((0x3f & b1)<<12) +
+ ((0x3f & b2)<<6) + (0x3f & b3);
+
+ if (ch < 0x10000) {
+ conversionBuff[charOff]=(char)ch;
+ charOff++;
+ } else {
+ conversionBuff[charOff]=(char)(((ch-0x00010000)>>10)+
+ 0xd800);
+ charOff++;
+ conversionBuff[charOff]=(char)(((ch-0x00010000)&0x3ff)+
+ 0xdc00);
+ charOff++;
+ }
+ continue;
+ } else {
+ return -1;
+ }
}
- return new String( conversionBuff, 0, bytesLen);
+ return charOff;
}
-
+
public int getInt()
{
return Ascii.parseInt(bytes, bytesOff,bytesLen);
---------------------------------------------------------------------
To unsubscribe, e-mail: tomcat-dev-unsubscribe@jakarta.apache.org
For additional commands, email: tomcat-dev-help@jakarta.apache.org