You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@tomcat.apache.org by co...@apache.org on 2001/03/10 19:18:43 UTC
cvs commit: jakarta-tomcat/src/share/org/apache/tomcat/util/buf ByteChunk.java

costin      01/03/10 10:18:43

  Modified:    src/share/org/apache/tomcat/util/buf ByteChunk.java
  Log:
  First attempt to fix the UTF-8 decoding bug.
  
  This doesn't change the existing behavior - what worked before ( i.e.
  bytes < 0x80 ) should work the same. For chars >= 0x80 ( which didn't
  work with the previous code ) I cut & pasted code from Xerces.
  
  The idea is to avoid using the (memory expensive) String( bytes[] ),
  which proved to be a performance problem in many cases.
  
  Xerces is doing exactly the same thing to optimize the conversion.
  
  This works for UTF-8 - for all other encodings the slow method is used.
  There is another way to resolve the problem ( a trick similar to the one
  in OutputBuffer ), and we may implement it someday, but the hope is that
  people will start using Unicode as a clean and simple solution for
  non-ASCII charsets.
  
  Netscape, IIS and most platforms already support UTF-8 ( that doesn't mean
  we shouldn't deal with the other encodings ).
  
  ( The code is untested - I need UTF-8 examples; adding files in
  various encodings with "strange" names to the test suite is still on the
  todo list. )
  
  Revision  Changes    Path
  1.2       +85 -4     jakarta-tomcat/src/share/org/apache/tomcat/util/buf/ByteChunk.java
  
  Index: ByteChunk.java
  ===================================================================
  RCS file: /home/cvs/jakarta-tomcat/src/share/org/apache/tomcat/util/buf/ByteChunk.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- ByteChunk.java	2001/02/20 03:12:13	1.1
  +++ ByteChunk.java	2001/03/10 18:18:43	1.2
  @@ -174,13 +174,94 @@
   	    conversionBuff=new char[bytesLen];
   	}
   
  +	// 	int j=bytesOff;
  +	// 	for( int i=0; i< bytesLen; i++ ) {
  +	// 	    conversionBuff[i]=(char)bytes[j++];
  +	// 	}
  +	int charsLen=byte2charUTF8();
  +	if( charsLen==-1) return null; // or throw exception
  +	
  +	return new String( conversionBuff, 0, charsLen);
  +    }
  +
  +
  +    private int byte2charUTF8() {
  +	if( conversionBuff==null || bytesLen > conversionBuff.length ) {
  +	    conversionBuff=new char[bytesLen];
  +	}
  +	// source: xerces' UTF8Reader.copyMultiByteCharData() 
   	int j=bytesOff;
  -	for( int i=0; i< bytesLen; i++ ) {
  -	    conversionBuff[i]=(char)bytes[j++];
  +	int charOff=0;
  +	int end=j+bytesLen;
  +	while( j< end ) {
  +	    int b0=(int)bytes[j];
  +	    
  +	    if( b0 < 0x80 ) { 
  +		conversionBuff[charOff]=(char)b0;
  +		charOff++;
  +		j++;
  +		continue;
  +	    }
  +	    // 2 byte ?
  +	    if( j++ >= end ) {
  +		// ok, just ignore - we could throw exception
  +		return -1;
  +	    }
  +	    int b1=(int)bytes[j];
  +	    
  +	    // ok, let's the fun begin - we're handling UTF8
  +	    if ((0xe0 & b0) == 0xc0) { // 110yyyyy 10xxxxxx (0x80 to 0x7ff)
  +		int ch = ((0x1f & b0)<<6) + (0x3f & b1);
  +		conversionBuff[charOff]=(char)ch;
  +		charOff++;
  +		continue;
  +	    }
  +	    
  +	    if( j++ >= end ) 
  +		return -1;
  +	    int b2=(int)bytes[j];
  +	    
  +	    if( (b0 & 0xf0 ) == 0xe0 ) {
  +		if ((b0 == 0xED && b1 >= 0xA0) ||
  +		    (b0 == 0xEF && b1 == 0xBF && b2 >= 0xBE)) {
  +		    return -1;
  +		}
  +
  +		int ch = ((0x0f & b0)<<12) + ((0x3f & b1)<<6) + (0x3f & b2);
  +		conversionBuff[charOff]=(char)ch;
  +		charOff++;
  +		continue;
  +	    }
  +
  +	    if( j++ >= end ) 
  +		return -1;
  +	    int b3=(int)bytes[j];
  +
  +	    if (( 0xf8 & b0 ) == 0xf0 ) {
  +		if (b0 > 0xF4 || (b0 == 0xF4 && b1 >= 0x90))
  +		    return -1;
  +		int ch = ((0x0f & b0)<<18) + ((0x3f & b1)<<12) +
  +		    ((0x3f & b2)<<6) + (0x3f & b3);
  +
  +		if (ch < 0x10000) {
  +		    conversionBuff[charOff]=(char)ch;
  +		    charOff++;
  +		} else {
  +		    conversionBuff[charOff]=(char)(((ch-0x00010000)>>10)+
  +						   0xd800);
  +		    charOff++;
  +		    conversionBuff[charOff]=(char)(((ch-0x00010000)&0x3ff)+
  +						   0xdc00);
  +		    charOff++;
  +		}
  +		continue;
  +	    } else {
  +		return -1;
  +	    }
   	}
  -	return new String( conversionBuff, 0, bytesLen);
  +	return charOff;
       }
  -
  +    
       public int getInt()
       {
   	return Ascii.parseInt(bytes, bytesOff,bytesLen);
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: tomcat-dev-unsubscribe@jakarta.apache.org
For additional commands, email: tomcat-dev-help@jakarta.apache.org