You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@tomcat.apache.org by co...@apache.org on 2001/05/26 19:12:05 UTC

cvs commit: jakarta-tomcat/src/share/org/apache/tomcat/util/buf UTF8Decoder.java B2CConverter.java

costin      01/05/26 10:12:05

  Added:       src/share/org/apache/tomcat/util/buf UTF8Decoder.java
                        B2CConverter.java
  Log:
  Added a generic Byte->Char (charset) decoder, using the same model as
  the C2BConverter used by OutputBuffer.
  
  The input system is still very inefficient, this is just the first step
  ( not used in active code - I did a lot of tests but it's better to be
  safe for now, and make sure we don't have regressions with the other changes
  before activating this one ).
  
  UTF8Decoder used to be part of ByteChunk, it was fixed and tested with
  multi-byte chars.
  
  The decoding was refactored for 3 reasons:
  1. Performance. This way we can optimize individual converters and plug
  specialized versions ( like UTF8 )
  2. We should be able to use ( when detected ) nio converters
  3. Clarity. It's ( IMHO ) easier to follow the Chunk code without all the
  extra ( and complex ) decoding code in.
  
  Neither class is in use in the current code, but both should be enabled
  for 3.3 - the code is stable and efficient.
  
  Revision  Changes    Path
  1.1                  jakarta-tomcat/src/share/org/apache/tomcat/util/buf/UTF8Decoder.java
  
  Index: UTF8Decoder.java
  ===================================================================
  /*
   * ====================================================================
   *
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 1999 The Apache Software Foundation.  All rights 
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer. 
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution, if
   *    any, must include the following acknowlegement:  
   *       "This product includes software developed by the 
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowlegement may appear in the software itself,
   *    if and wherever such third-party acknowlegements normally appear.
   *
   * 4. The names "The Jakarta Project", "Tomcat", and "Apache Software
   *    Foundation" must not be used to endorse or promote products derived
   *    from this software without prior written permission. For written 
   *    permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache"
   *    nor may "Apache" appear in their names without prior written
   *    permission of the Apache Group.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   *
   * [Additional notices, if required by prior licensing conditions]
   *
   */ 
  
  package org.apache.tomcat.util.buf;
  
  import java.text.*;
  import java.util.*;
  import java.io.Serializable;
  import java.io.IOException;
  import org.apache.tomcat.util.buf.*;
  
  /**
   * Moved from ByteChunk - code to convert from UTF8 bytes to chars.
   * Not used in the current tomcat3.3 : the performance gain is not very
   * big if the String is created, only if we avoid that and work only
   * on char[]. Until then, it's better to be safe. ( I tested this code
   * with 2 and 3 bytes chars, and it works fine in xerces )
   *
   * Cut from xerces' UTF8Reader.copyMultiByteCharData()
   *
   * @author Costin Manolache
   * @author ( Xml-Xerces )
   */
  public final class UTF8Decoder extends B2CConverter {
      // may have state !!

      /** Set above 0 to trace every multi-byte conversion on System.out. */
      private static final int debug=0;

      public UTF8Decoder() {

      }

      /** Nothing to reset: this decoder keeps no state between calls. */
      public void recycle() {
      }

      /**
       * Decodes the UTF-8 bytes of <code>mb</code> and appends the resulting
       * chars to <code>cb</code>.
       *
       * The bytes read are the <code>mb.getLength()</code> bytes of
       * <code>mb.getBytes()</code> starting at <code>mb.getOffset()</code>.
       * Code points of 0x10000 and above are appended as a surrogate pair.
       * Trailing bytes are only masked with 0x3f; they are not checked for
       * the <code>10xxxxxx</code> form.
       *
       * If the input ends right after a lead byte an IOException is thrown.
       * If it ends before the third or fourth byte the decoder needs, the
       * method returns silently and the incomplete sequence is dropped
       * ( behavior kept from the original code ).
       *
       * @param mb source of the encoded bytes
       * @param cb destination the decoded chars are appended to
       * @throws IOException if the input ends right after a lead byte, if a
       *         3 or 4 byte sequence is in one of the rejected ranges, or if
       *         the lead byte matches none of the handled forms
       */
      public void convert(ByteChunk mb, CharChunk cb )
          throws IOException
      {
          int bytesOff=mb.getOffset();
          int bytesLen=mb.getLength();
          byte bytes[]=mb.getBytes();

          int j=bytesOff;
          int end=j+bytesLen;

          while( j< end ) {
              int b0=0xff & bytes[j];

              // 0xxxxxxx - plain 7 bit char
              if( (b0 & 0x80) == 0 ) {
                  cb.append((char)b0);
                  j++;
                  continue;
              }

              // 2 byte ?
              // Pre-increment: the check must be made on the index of the
              // byte we are about to read, not on the one already consumed.
              if( ++j >= end ) {
                  throw new IOException( "Conversion error - EOF " );
              }
              int b1=0xff & bytes[j];

              // ok, let's the fun begin - we're handling UTF8
              if ((0xe0 & b0) == 0xc0) { // 110yyyyy 10xxxxxx (0x80 to 0x7ff)
                  int ch = ((0x1f & b0)<<6) + (0x3f & b1);
                  if(debug>0)
                      log("Convert " + b0 + " " + b1 + " " + ch + ((char)ch));

                  cb.append((char)ch);
                  j++;
                  continue;
              }

              // Input ends before the third byte: stop silently.
              if( ++j >= end )
                  return ;
              int b2=0xff & bytes[j];

              if( (b0 & 0xf0 ) == 0xe0 ) { // 1110zzzz 10yyyyyy 10xxxxxx
                  // Reject ED A0..BF xx ( the surrogate range ) and
                  // EF BF BE / EF BF BF ( 0xfffe and 0xffff ).
                  if ((b0 == 0xED && b1 >= 0xA0) ||
                      (b0 == 0xEF && b1 == 0xBF && b2 >= 0xBE)) {
                      if(debug>0)
                          log("Error " + b0 + " " + b1+ " " + b2 );

                      throw new IOException( "Conversion error 2");
                  }

                  int ch = ((0x0f & b0)<<12) + ((0x3f & b1)<<6) + (0x3f & b2);
                  cb.append((char)ch);
                  if(debug>0)
                      log("Convert " + b0 + " " + b1+ " " + b2 + " " + ch +
                          ((char)ch));
                  j++;
                  continue;
              }

              // Input ends before the fourth byte: stop silently.
              if( ++j >= end )
                  return ;
              int b3=0xff & bytes[j];

              if (( 0xf8 & b0 ) == 0xf0 ) { // 11110www 10zzzzzz 10yyyyyy 10xxxxxx
                  // Anything from F4 90 up would decode above 0x10ffff.
                  if (b0 > 0xF4 || (b0 == 0xF4 && b1 >= 0x90)) {
                      if(debug>0)
                          log("Convert " + b0 + " " + b1+ " " + b2 + " " + b3);
                      throw new IOException( "Conversion error ");
                  }
                  int ch = ((0x0f & b0)<<18) + ((0x3f & b1)<<12) +
                      ((0x3f & b2)<<6) + (0x3f & b3);

                  if(debug>0)
                      log("Convert " + b0 + " " + b1+ " " + b2 + " " + b3 + " " +
                          ch + ((char)ch));

                  if (ch < 0x10000) {
                      cb.append( (char)ch );
                  } else {
                      // Split into a high / low surrogate pair.
                      cb.append((char)(((ch-0x00010000)>>10)+
                                       0xd800));
                      cb.append((char)(((ch-0x00010000)&0x3ff)+
                                       0xdc00));
                  }
                  j++;
                  continue;
              } else {
                  // Lead byte is none of the forms handled above.
                  if(debug>0)
                      log("Convert " + b0 + " " + b1+ " " + b2 + " " + b3);
                  throw new IOException( "Conversion error 4" );
              }
          }
      }

      void log(String s ) {
          System.out.println("UTF8Decoder: " + s );
      }

  }
  
  
  
  1.1                  jakarta-tomcat/src/share/org/apache/tomcat/util/buf/B2CConverter.java
  
  Index: B2CConverter.java
  ===================================================================
  /*
   * ====================================================================
   *
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 1999 The Apache Software Foundation.  All rights 
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer. 
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution, if
   *    any, must include the following acknowlegement:  
   *       "This product includes software developed by the 
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowlegement may appear in the software itself,
   *    if and wherever such third-party acknowlegements normally appear.
   *
   * 4. The names "The Jakarta Project", "Tomcat", and "Apache Software
   *    Foundation" must not be used to endorse or promote products derived
   *    from this software without prior written permission. For written 
   *    permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache"
   *    nor may "Apache" appear in their names without prior written
   *    permission of the Apache Group.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   *
   * [Additional notices, if required by prior licensing conditions]
   *
   */ 
  
  
  package org.apache.tomcat.util.buf;
  
  import org.apache.tomcat.util.buf.*;
  
  import java.io.*;
  
  /**
   * Efficient conversion of bytes to characters.
   *
   * The decoding itself is done by the standard JDK mechanism - a reader -
   * but the objects involved are kept around so they can be reused from one
   * conversion to the next. It is compatible with JDK1.1 and up
   * ( nio is better, but it's not available even in 1.2 or 1.3 ).
   *
   * Not used in the current code, the performance gain is not very big
   * in the current case ( since String is created anyway ), but it will
   * be used in a later version or after the remaining optimizations.
   */
  public class B2CConverter {
      static final int BUFFER_SIZE=8192;

      /** Scratch buffer the reader decodes into before the chars are copied out. */
      char result[]=new char[BUFFER_SIZE];

      private final int debug=0;

      /** Stream the reader pulls its bytes from; re-pointed at each input chunk. */
      private IntermediateInputStream byteSource;
      /** Reader doing the charset decoding on top of the byte source. */
      private ReadConvertor decoder;
      private String encoding;

      /** For subclasses that do their own decoding; no reader is created. */
      protected B2CConverter() {
      }

      /**
       * Creates a converter decoding the given encoding.
       *
       * @param encoding charset name handed to the underlying reader
       * @throws IOException if the reader cannot be created
       */
      public B2CConverter(String encoding)
          throws IOException
      {
          this.encoding = encoding;
          reset();
      }

      /**
       * Recycles the underlying reader. The encoding stays in effect and
       * the buffers stay allocated.
       */
      public void recycle() {
          decoder.recycle();
      }

      /**
       * Decodes the bytes of <code>bb</code> and appends the chars to
       * <code>cb</code>.
       *
       * @param bb chunk holding the encoded bytes
       * @param cb chunk receiving the decoded chars
       * @throws IOException if reading or appending fails; the converter
       *         is reset before the exception is rethrown
       */
      public void convert( ByteChunk bb, CharChunk cb )
          throws IOException
      {
          // Point the intermediate stream at the new input, then drain it.
          byteSource.setByteChunk( bb );
          convert( cb );
      }

      /**
       * Pulls decoded chars from the reader, one buffer at a time, until it
       * reports that nothing more is available.
       */
      private void convert(CharChunk cb)
          throws IOException
      {
          try {
              int charsRead;
              while( (charsRead = decoder.read( result, 0, BUFFER_SIZE )) > 0 ) {
                  if( debug > 1 ) {
                      log("Converted: " + new String( result, 0, charsRead ));
                  }
                  // XXX go directly
                  cb.append( result, 0, charsRead );
              }
              // End of stream ! - we may be in a bad state
              if( debug > 0 ) {
                  log( "EOF" );
              }
          } catch( IOException ex ) {
              if( debug > 0 ) {
                  log( "Reseting the converter " + ex.toString() );
              }
              reset();
              throw ex;
          }
      }

      /**
       * Throws away the current stream and reader and creates new ones for
       * the configured encoding.
       *
       * @throws IOException if the reader cannot be created
       */
      public void reset()
          throws IOException
      {
          byteSource = new IntermediateInputStream();
          decoder = new ReadConvertor( byteSource, encoding );
      }

      void log( String s ) {
          System.out.println("B2CConverter: " + s );
      }

      // -------------------- Not used - the speed improvement is quite small

      /*
      private Hashtable decoders;
      public static final boolean useNewString=false;
      public static final boolean useSpecialDecoders=true;
      private UTF8Decoder utfD;
      // private char[] conversionBuff;
      CharChunk conversionBuf;


      private  static String decodeString(ByteChunk mb, String enc)
          throws IOException
      {
          byte buff=mb.getBuffer();
          int start=mb.getStart();
          int end=mb.getEnd();
          if( useNewString ) {
              if( enc==null) enc="UTF8";
              return new String( buff, start, end-start, enc );
          }
          B2CConverter b2c=null;
          if( useSpecialDecoders &&
              (enc==null || "UTF8".equalsIgnoreCase(enc))) {
              if( utfD==null ) utfD=new UTF8Decoder();
              b2c=utfD;
          }
          if(decoders == null ) decoders=new Hashtable();
          if( enc==null ) enc="UTF8";
          b2c=(B2CConverter)decoders.get( enc );
          if( b2c==null ) {
              if( useSpecialDecoders ) {
                  if( "UTF8".equalsIgnoreCase( enc ) ) {
                      b2c=new UTF8Decoder();
                  }
              }
              if( b2c==null )
                  b2c=new B2CConverter( enc );
              decoders.put( enc, b2c );
          }
          if( conversionBuf==null ) conversionBuf=new CharChunk(1024);

          try {
              conversionBuf.recycle();
              b2c.convert( this, conversionBuf );
              //System.out.println("XXX 1 " + conversionBuf );
              return conversionBuf.toString();
          } catch( IOException ex ) {
              ex.printStackTrace();
              return null;
          }
      }

      */
  }
  
  // -------------------- Private implementation --------------------
  
  
  
  /**
   * Reader used by B2CConverter: an InputStreamReader over an
   * IntermediateInputStream whose close() does nothing, so the same
   * instance can be kept and used for many conversions.
   */
  final class  ReadConvertor extends InputStreamReader {
      // The stream handed to the constructor; kept so it stays reachable
      // from the reader.
      private IntermediateInputStream source;

      // The superclass has its own private, internal byte buffer.

      /**
       * Creates a reader decoding <code>enc</code> from <code>in</code>.
       *
       * @param in  stream supplying the encoded bytes
       * @param enc name of the encoding to decode
       * @throws UnsupportedEncodingException if the superclass rejects
       *         the encoding name
       */
      public ReadConvertor( IntermediateInputStream in, String enc )
          throws UnsupportedEncodingException
      {
          super( in, enc );
          this.source = in;
      }

      /**
       * Overridden to do nothing: super.close() is deliberately not called,
       * so the reader is never closed.
       */
      public final void close() throws IOException {
          // intentionally empty
      }

      /**
       * Decodes up to <code>len</code> chars into <code>cbuf</code>; plain
       * delegation to the superclass.
       *
       * @return whatever the superclass read returns
       */
      public final int read(char cbuf[], int off, int len)
          throws IOException
      {
          final int count = super.read( cbuf, off, len );
          return count;
      }

      /**
       * Recycling hook; currently has nothing to do.
       */
      public final void recycle() {
      }
  }
  
  
  /**
   * Input stream that serves bytes straight out of a caller supplied
   * byte[] window ( see setBuffer / setByteChunk ), so one instance can be
   * re-pointed at new data for each conversion instead of being recreated.
   *
   * close() must never be reached: it throws to flag the mistake.
   */
  final class IntermediateInputStream extends InputStream {
      /** Backing array of the current window. */
      byte buf[];
      /** Index of the next byte to hand out. */
      int pos;
      /** Length of the window, as given when it was set. */
      int len;
      /** Index one past the last byte of the window. */
      int end;

      public IntermediateInputStream() {
      }

      /**
       * Always fails - the owning reader is expected to filter close() out.
       *
       * @throws IOException always
       */
      public  final void close() throws IOException {
          // shouldn't be called - we filter it out in the reader
          throw new IOException("close() called - shouldn't happen ");
      }

      /**
       * Copies up to <code>len</code> bytes of the remaining window into
       * <code>cbuf</code> starting at <code>off</code>.
       *
       * @return the number of bytes copied, 0 if <code>len</code> is not
       *         positive, or -1 if at least one byte was requested and
       *         the window is exhausted
       */
      public  final  int read(byte cbuf[], int off, int len) throws IOException {
          // A request for no bytes returns 0 even at the end of the data,
          // as the InputStream contract asks for len == 0.
          if (len <= 0) {
              return 0;
          }
          if( pos >= end ) return -1;

          // Compare against the remaining count instead of computing
          // pos + len, which could overflow for a very large len.
          int remaining = end - pos;
          if (len > remaining) {
              len = remaining;
          }
          System.arraycopy(buf, pos, cbuf, off, len);
          pos += len;
          return len;
      }

      /**
       * @return the next byte of the window as 0..255, or -1 when exhausted
       */
      public  final int read() throws IOException {
          return (pos < end ) ? (buf[pos++] & 0xff) : -1;
      }

      // -------------------- Internal methods --------------------

      /** Serve the <code>l</code> bytes of <code>b</code> starting at <code>p</code>. */
      void setBuffer( byte b[], int p, int l ) {
          buf=b;
          pos=p;
          len=l;
          end=pos+len;
      }

      /** Serve the bytes described by the chunk's array, start and length. */
      void setByteChunk( ByteChunk mb ) {
          buf=mb.getBytes();
          pos=mb.getStart();
          len=mb.getLength();
          end=pos+len;
      }

  }