You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xerces.apache.org by ne...@apache.org on 2002/01/05 00:27:50 UTC

cvs commit: xml-xerces/java/src/org/apache/xerces/impl/io UCSReader.java

neilg       02/01/04 15:27:50

  Modified:    java/docs releases.xml
               java/src/org/apache/xerces/impl XMLEntityManager.java
               java/src/org/apache/xerces/impl/msg XMLMessages.properties
               java/src/org/apache/xerces/util EncodingMap.java
  Added:       java/src/org/apache/xerces/impl/io UCSReader.java
  Log:
  implemented support for UCS-2 and UCS-4 encodings by the use of a custom Reader class.  Still need to revisit performance issues related to reading input character-by-character when that is not necessary.
  
  Revision  Changes    Path
  1.96      +5 -1      xml-xerces/java/docs/releases.xml
  
  Index: releases.xml
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/docs/releases.xml,v
  retrieving revision 1.95
  retrieving revision 1.96
  diff -u -r1.95 -r1.96
  --- releases.xml	22 Dec 2001 23:25:48 -0000	1.95
  +++ releases.xml	4 Jan 2002 23:27:49 -0000	1.96
  @@ -1,11 +1,15 @@
   <?xml version='1.0' encoding='UTF-8'?>
  -<!-- $Id: releases.xml,v 1.95 2001/12/22 23:25:48 andyc Exp $ -->
  +<!-- $Id: releases.xml,v 1.96 2002/01/04 23:27:49 neilg Exp $ -->
   <!DOCTYPE releases SYSTEM 'dtd/releases.dtd'>
   <releases>
    <release version='NOT YET RELEASED'>
     <desc>
     </desc>
     <changes>
  +   <add>
  +    <note>Implemented support for UCS-4 and UCS-2 encodings.</note>
  +    <submitter name='Neil Graham'/>
  +   </add>
      <add>
       <note>Added internal subset string to DOM.</note>
       <submitter name='Andy Clark'/>
  
  
  
  1.16      +99 -37    xml-xerces/java/src/org/apache/xerces/impl/XMLEntityManager.java
  
  Index: XMLEntityManager.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/XMLEntityManager.java,v
  retrieving revision 1.15
  retrieving revision 1.16
  diff -u -r1.15 -r1.16
  --- XMLEntityManager.java	18 Dec 2001 16:58:45 -0000	1.15
  +++ XMLEntityManager.java	4 Jan 2002 23:27:49 -0000	1.16
  @@ -71,6 +71,7 @@
   
   import org.apache.xerces.impl.XMLErrorReporter;
   import org.apache.xerces.impl.io.ASCIIReader;
  +import org.apache.xerces.impl.io.UCSReader;
   import org.apache.xerces.impl.io.UTF8Reader;
   import org.apache.xerces.impl.msg.XMLMessageFormatter;
   
  @@ -112,7 +113,7 @@
    * @author Andy Clark, IBM
    * @author Arnaud  Le Hors, IBM
    *
  - * @version $Id: XMLEntityManager.java,v 1.15 2001/12/18 16:58:45 neilg Exp $
  + * @version $Id: XMLEntityManager.java,v 1.16 2002/01/04 23:27:49 neilg Exp $
    */
   public class XMLEntityManager
       implements XMLComponent, XMLEntityResolver {
  @@ -703,6 +704,7 @@
           final String systemId = xmlInputSource.getSystemId();
           String baseSystemId = xmlInputSource.getBaseSystemId();
           String encoding = xmlInputSource.getEncoding();
  +        Boolean isBigEndian = null;
   
           // create reader
           InputStream stream = null;
  @@ -728,7 +730,9 @@
                       b4[count] = (byte)stream.read();
                   }
                   if (count == 4) {
  -                    encoding = getEncodingName(b4, count);
  +                    Object [] encodingDesc = getEncodingName(b4, count);
  +                    encoding = (String)(encodingDesc[0]);
  +                    isBigEndian = (Boolean)(encodingDesc[1]);
   
                       // removed use of pushback inputstream--neilg
                       /*****
  @@ -765,18 +769,18 @@
                       //          indirection to get at the underlying bytes. -Ac
   
                       // create reader from input stream
  -                    reader = createReader(new RewindableInputStream(pbstream), encoding);
  +                    reader = createReader(new RewindableInputStream(pbstream), encoding, isBigEndian);
                       ******/
  -                    reader = createReader(stream, encoding);
  +                    reader = createReader(stream, encoding, isBigEndian);
                   }
                   else {
  -                    reader = createReader(stream, encoding);
  +                    reader = createReader(stream, encoding, isBigEndian);
                   }
               }
   
               // use specified encoding
               else {
  -                reader = createReader(stream, encoding);
  +                reader = createReader(stream, encoding, isBigEndian);
               }
   
               // read one character at a time so we don't jump too far
  @@ -1136,15 +1140,18 @@
   
       /**
        * Returns the IANA encoding name that is auto-detected from
  -     * the bytes specified.
  +     * the bytes specified, with the endian-ness of that encoding where appropriate.
        *
        * @param b4    The first four bytes of the input.
        * @param count The number of bytes actually read.
  +     * @return a 2-element array:  the first element, an IANA-encoding string, 
  +     *  the second element a Boolean which is true iff the document is big endian, false
  +     *  if it's little-endian, and null if the distinction isn't relevant.
        */
  -    protected String getEncodingName(byte[] b4, int count) {
  +    protected Object[] getEncodingName(byte[] b4, int count) {
   
           if (count < 2) {
  -            return "UTF-8";
  +            return new Object[]{"UTF-8", null};
           }
   
           // UTF-16, with BOM
  @@ -1152,72 +1159,72 @@
           int b1 = b4[1] & 0xFF;
           if (b0 == 0xFE && b1 == 0xFF) {
               // UTF-16, big-endian
  -            return "UTF-16";
  +            return new Object [] {"UTF-16BE", new Boolean(true)};
           }
           if (b0 == 0xFF && b1 == 0xFE) {
               // UTF-16, little-endian
  -            return "UTF-16";
  +            return new Object [] {"UTF-16LE", new Boolean(false)};
           }
   
           // default to UTF-8 if we don't have enough bytes to make a
           // good determination of the encoding
           if (count < 3) {
  -            return "UTF-8";
  +            return new Object [] {"UTF-8", null};
           }
   
           // UTF-8 with a BOM
           int b2 = b4[2] & 0xFF;
           if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
  -            return "UTF-8";
  +            return new Object [] {"UTF-8", null};
           }
   
           // default to UTF-8 if we don't have enough bytes to make a
           // good determination of the encoding
           if (count < 4) {
  -            return "UTF-8";
  +            return new Object [] {"UTF-8", null};
           }
   
           // other encodings
           int b3 = b4[3] & 0xFF;
           if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
               // UCS-4, big endian (1234)
  -            // REVISIT: What should this be?
  -            return "UnicodeBig";
  +            return new Object [] {"ISO-10646-UCS-4", new Boolean(true)};
           }
           if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
               // UCS-4, little endian (4321)
  -            // REVISIT: What should this be?
  -            return "UnicodeLittleUnmarked";
  +            return new Object [] {"ISO-10646-UCS-4", new Boolean(false)};
           }
           if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
               // UCS-4, unusual octet order (2143)
               // REVISIT: What should this be?
  -            return "UnicodeBigUnmarked";
  +            return new Object [] {"ISO-10646-UCS-4", null};
           }
           if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
               // UCS-4, unusual octect order (3412)
               // REVISIT: What should this be?
  -            return "UnicodeLittleUnmarked";
  +            return new Object [] {"ISO-10646-UCS-4", null};
           }
           if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
               // UTF-16, big-endian, no BOM
  +            // (or could turn out to be UCS-2...
               // REVISIT: What should this be?
  -            return "UnicodeBig";
  +            return new Object [] {"UTF-16BE", new Boolean(true)};
           }
           if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
               // UTF-16, little-endian, no BOM
  -            return "UnicodeLittle";
  +            // (or could turn out to be UCS-2...
  +            return new Object [] {"UTF-16LE", new Boolean(false)};
           }
           if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
               // EBCDIC
               // a la xerces1, return CP037 instead of EBCDIC here
  -            return "CP037";
  +            return new Object [] {"CP037", null};
           }
   
           // default encoding
  -        return "UTF-8";
  +        return new Object [] {"UTF-8", null};
   
  -    } // getEncodingName(byte[],int):String
  +    } // getEncodingName(byte[],int):Object[]
   
       /**
        * Creates a reader capable of reading the given input stream in
  @@ -1229,10 +1236,13 @@
        *                     Java encoding names are allowed, then the
        *                     encoding name may be a Java encoding name;
        *                     otherwise, it is an ianaEncoding name.
  +     * @param isBigEndian   For encodings (like uCS-4), whose names cannot
  +     *                      specify a byte order, this tells whether the order is bigEndian.  null menas 
  +     *                      unknown or not relevant.
        *
        * @return Returns a reader.
        */
  -    protected Reader createReader(InputStream inputStream, String encoding)
  +    protected Reader createReader(InputStream inputStream, String encoding, Boolean isBigEndian)
           throws IOException {
   
           // normalize encoding name
  @@ -1254,6 +1264,36 @@
               }
               return new ASCIIReader(inputStream, fBufferSize);
           }
  +        if(ENCODING.equals("ISO-10646-UCS-4")) {
  +            if(isBigEndian != null) {
  +                boolean isBE = isBigEndian.booleanValue();
  +                if(isBE) {
  +                    return new UCSReader(inputStream, UCSReader.UCS4BE);
  +                } else {
  +                    return new UCSReader(inputStream, UCSReader.UCS4LE);
  +                }
  +            } else {
  +                fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
  +                                       "EncodingByteOrderUnsupported",
  +                                       new Object[] { encoding },
  +                                       XMLErrorReporter.SEVERITY_FATAL_ERROR);
  +            }
  +        }
  +        if(ENCODING.equals("ISO-10646-UCS-2")) {
  +            if(isBigEndian != null) { // sould never happen with this encoding...
  +                boolean isBE = isBigEndian.booleanValue();
  +                if(isBE) {
  +                    return new UCSReader(inputStream, UCSReader.UCS2BE);
  +                } else {
  +                    return new UCSReader(inputStream, UCSReader.UCS2LE);
  +                }
  +            } else {
  +                fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
  +                                       "EncodingByteOrderUnsupported",
  +                                       new Object[] { encoding },
  +                                       XMLErrorReporter.SEVERITY_FATAL_ERROR);
  +            }
  +        }
   
           // check for valid name
           boolean validIANA = XMLChar.isValidIANAEncoding(encoding);
  @@ -1297,7 +1337,7 @@
           }
           return new InputStreamReader(inputStream, javaEncoding);
   
  -    } // createReader(InputStream,String):
  +    } // createReader(InputStream,String, Boolean): Reader
   
       //
       // Protected static methods
  @@ -1781,14 +1821,38 @@
                   //       a single char! -Ac
                   if (fCurrentEntity.encoding == null ||
                       !fCurrentEntity.encoding.equals(encoding)) {
  +                    // UTF-16 is a bit of a special case.  If the encoding is UTF-16,
  +                    // and we know the endian-ness, we shouldn't change readers.
  +                    // If it's ISO-10646-UCS-(2|4), then we'll have to deduce
  +                    // the endian-ness from the encoding we presently have.
  +                    if(fCurrentEntity.encoding != null && fCurrentEntity.encoding.startsWith("UTF-16")) {
  +                        String ENCODING = encoding.toUpperCase();
  +                        if(ENCODING.equals("UTF-16")) return;
  +                        if(ENCODING.equals("ISO-10646-UCS-4")) {
  +                            if(fCurrentEntity.encoding.equals("UTF-16BE")) {
  +                                fCurrentEntity.reader = new UCSReader(fCurrentEntity.stream, UCSReader.UCS4BE);
  +                            } else {
  +                                fCurrentEntity.reader = new UCSReader(fCurrentEntity.stream, UCSReader.UCS4LE);
  +                            }
  +                            return;
  +                        }
  +                        if(ENCODING.equals("ISO-10646-UCS-2")) {
  +                            if(fCurrentEntity.encoding.equals("UTF-16BE")) {
  +                                fCurrentEntity.reader = new UCSReader(fCurrentEntity.stream, UCSReader.UCS2BE);
  +                            } else {
  +                                fCurrentEntity.reader = new UCSReader(fCurrentEntity.stream, UCSReader.UCS2LE);
  +                            }
  +                            return;
  +                        }
  +                    }
                       // wrap a new reader around the input stream, changing
                       // the encoding
                       if (DEBUG_ENCODINGS) {
                           System.out.println("$$$ creating new reader from stream: "+
  -                                           fCurrentEntity.stream);
  +                                        fCurrentEntity.stream);
                       }
                       //fCurrentEntity.stream.reset();
  -                    fCurrentEntity.reader = createReader(fCurrentEntity.stream, encoding);
  +                    fCurrentEntity.reader = createReader(fCurrentEntity.stream, encoding, null);
                   } else {
                       if (DEBUG_ENCODINGS) 
                           System.out.println("$$$ reusing old reader on stream");
  @@ -3155,10 +3219,9 @@
           }
       
           public int read() throws IOException {
  -            int b;
  +            int b = 0;
               if (fOffset < fLength) {
  -                System.err.println("REturned buffered:  " + (char)(fData[fOffset++] & 0xFF));
  -                return fData[fOffset++] & 0xFF;
  +                return fData[fOffset++] & 0xff;
               }
               if (fOffset == fEndOffset) {
                   return -1;
  @@ -3175,7 +3238,7 @@
               }
               fData[fLength++] = (byte)b;
               fOffset++;
  -            return b;
  +            return b & 0xff;
           }
   
           public int read(byte[] b, int off, int len) throws IOException {
  @@ -3188,13 +3251,12 @@
                   if(fCurrentEntity.mayReadChunks) {
                       return fInputStream.read(b, off, len);
                   }
  -                b[off] = (byte)read();
  -                if(b[off] == -1) {
  +                int returnedVal = read();
  +                if(returnedVal == -1) {
                       fEndOffset = fOffset;
                       return -1;
                   }
  -                byte [] c = new byte[off];
  -                System.arraycopy(b,0,c,0,off);
  +                b[off] = (byte)returnedVal;
                   return 1;
               }
               if (len < bytesLeft) {
  
  
  
  1.5       +1 -0      xml-xerces/java/src/org/apache/xerces/impl/msg/XMLMessages.properties
  
  Index: XMLMessages.properties
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/msg/XMLMessages.properties,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- XMLMessages.properties	18 Oct 2001 04:42:40 -0000	1.4
  +++ XMLMessages.properties	4 Jan 2002 23:27:49 -0000	1.5
  @@ -111,6 +111,7 @@
           PINotInOneEntity = The processing instruction must be entirely contained within the same parsed entity.
   # 4.3.3 Character Encoding in Entities
           EncodingDeclInvalid = Invalid encoding name \"{0}\".
  +        EncodingByteOrderUnsupported = Given byte order for encoding \"{0}\" is not supported.
   	
   # DTD Messages
   # 2.2 Characters
  
  
  
  1.4       +5 -1      xml-xerces/java/src/org/apache/xerces/util/EncodingMap.java
  
  Index: EncodingMap.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/util/EncodingMap.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- EncodingMap.java	11 Dec 2001 21:57:38 -0000	1.3
  +++ EncodingMap.java	4 Jan 2002 23:27:49 -0000	1.4
  @@ -507,7 +507,7 @@
    * @author Stubs generated by DesignDoc on Wed Jun 07 11:58:44 PDT 2000
    * @author Andy Clark, IBM
    *
  - * @version $Id: EncodingMap.java,v 1.3 2001/12/11 21:57:38 neilg Exp $
  + * @version $Id: EncodingMap.java,v 1.4 2002/01/04 23:27:49 neilg Exp $
    */
   public class EncodingMap {
   
  @@ -647,6 +647,8 @@
           fIANA2JavaMap.put("CP367",        "ASCII");
           fIANA2JavaMap.put("UTF-8",           "UTF8");
           fIANA2JavaMap.put("UTF-16",           "Unicode");
  +        fIANA2JavaMap.put("UTF-16BE",           "UnicodeBig");
  +        fIANA2JavaMap.put("UTF-16LE",           "UnicodeLittle");
   
           // REVISIT:
           //   j:CNS11643 -> EUC-TW?
  @@ -686,6 +688,8 @@
           fJava2IANAMap.put("SJIS",      "SHIFT_JIS");
           fJava2IANAMap.put("UTF8",      "UTF-8");
           fJava2IANAMap.put("Unicode",   "UTF-16");
  +        fJava2IANAMap.put("UnicodeBig",   "UTF-16BE");
  +        fJava2IANAMap.put("UnicodeLittle",   "UTF-16LE");
           fJava2IANAMap.put("JIS0201",  "X0201");
           fJava2IANAMap.put("JIS0208",  "X0208");
           fJava2IANAMap.put("JIS0212",  "ISO-IR-159");
  
  
  
  1.1                  xml-xerces/java/src/org/apache/xerces/impl/io/UCSReader.java
  
  Index: UCSReader.java
  ===================================================================
  /*
   * The Apache Software License, Version 1.1
   *
   *
   * Copyright (c) 2000 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Xerces" and "Apache Software Foundation" must
   *    not be used to endorse or promote products derived from this
   *    software without prior written permission. For written
   *    permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    nor may "Apache" appear in their name, without prior written
   *    permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation and was
   * originally based on software copyright (c) 1999, International
   * Business Machines, Inc., http://www.apache.org.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  package org.apache.xerces.impl.io;
  
  import java.io.InputStream;
  import java.io.IOException;
  import java.io.Reader;
  
  /**
   * Reader for UCS-2 and UCS-4 encodings
   * (i.e., encodings from ISO-10646-UCS-(2|4)).
   *
   * <p>Every character occupies a fixed number of bytes in the underlying
   * stream: two for UCS-2 and four for UCS-4, in either big-endian or
   * little-endian byte order.  The variant must be chosen by the caller
   * when the reader is constructed; this class performs no detection.
   *
   * @author Neil Graham, IBM
   *
   * @version $Id: UCSReader.java,v 1.1 2002/01/04 23:27:49 neilg Exp $
   */
  public class UCSReader extends Reader {

      //
      // Constants
      //

      /** Default byte buffer size (8192, larger than that of ASCIIReader
       * since it's reasonable to surmise that the average UCS-4-encoded
       * file should be 4 times as large as the average ASCII-encoded file).
       */
      public static final int DEFAULT_BUFFER_SIZE = 8192;

      /** UCS-2, little-endian (two bytes per character, low byte first). */
      public static final short UCS2LE = 1;

      /** UCS-2, big-endian (two bytes per character, high byte first). */
      public static final short UCS2BE = 2;

      /** UCS-4, little-endian (four bytes per character, low byte first). */
      public static final short UCS4LE = 4;

      /** UCS-4, big-endian (four bytes per character, high byte first). */
      public static final short UCS4BE = 8;

      //
      // Data
      //

      /** Input stream. */
      protected InputStream fInputStream;

      /** Byte buffer. */
      protected byte[] fBuffer;

      /**
       * What kind of data we're dealing with: one of UCS2LE, UCS2BE,
       * UCS4LE or UCS4BE.  Values of 4 or more denote a UCS-4 variant.
       */
      protected short fEncoding;

      //
      // Constructors
      //

      /**
       * Constructs a UCS reader from the specified input stream
       * using the default buffer size.  The Endian-ness and whether this is
       * UCS-2 or UCS-4 needs also to be known in advance.
       *
       * @param inputStream The input stream.
       * @param encoding One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
       */
      public UCSReader(InputStream inputStream, short encoding) {
          this(inputStream, DEFAULT_BUFFER_SIZE, encoding);
      } // <init>(InputStream, short)

      /**
       * Constructs a UCS reader from the specified input stream
       * and buffer size.  The Endian-ness and whether this is
       * UCS-2 or UCS-4 needs also to be known in advance.
       *
       * @param inputStream The input stream.
       * @param size        The initial buffer size, in bytes.  Sizes smaller
       *                    than 4 are raised to 4 so that the buffer can
       *                    always hold at least one character.
       * @param encoding One of UCS2LE, UCS2BE, UCS4LE or UCS4BE.
       */
      public UCSReader(InputStream inputStream, int size, short encoding) {
          fInputStream = inputStream;
          fBuffer = new byte[Math.max(size, 4)];
          fEncoding = encoding;
      } // <init>(InputStream,int,short)

      //
      // Reader methods
      //

      /**
       * Read a single character.  This method will block until a character is
       * available, an I/O error occurs, or the end of the stream is reached.
       *
       * <p>End of stream is detected from the -1 returned by the underlying
       * stream, never from the value of a data byte, so 0xFF bytes are
       * decoded like any other byte.  If the stream ends in the middle of a
       * character the partial character is discarded and -1 is returned.
       *
       * @return     The character read, assembled from two (UCS-2) or four
       *             (UCS-4) bytes in the configured byte order, or -1 if
       *             the end of the stream has been reached.  For UCS-4 the
       *             full 32-bit value is returned without truncation.
       *
       * @exception  IOException  If an I/O error occurs
       */
      public int read() throws IOException {
          int b0 = fInputStream.read();
          if (b0 == -1) {
              return -1;
          }
          int b1 = fInputStream.read();
          if (b1 == -1) {
              return -1;
          }
          b0 &= 0xff;
          b1 &= 0xff;
          if (fEncoding >= 4) { // UCS-4
              int b2 = fInputStream.read();
              if (b2 == -1) {
                  return -1;
              }
              int b3 = fInputStream.read();
              if (b3 == -1) {
                  return -1;
              }
              b2 &= 0xff;
              b3 &= 0xff;
              if (fEncoding == UCS4BE) {
                  return (b0 << 24) + (b1 << 16) + (b2 << 8) + b3;
              }
              return (b3 << 24) + (b2 << 16) + (b1 << 8) + b0;
          }
          // UCS-2
          if (fEncoding == UCS2BE) {
              return (b0 << 8) + b1;
          }
          return (b1 << 8) + b0;
      } // read():int

      /**
       * Read characters into a portion of an array.  This method will block
       * until some input is available, an I/O error occurs, or the end of the
       * stream is reached.
       *
       * <p>At most as many characters as fit in the internal byte buffer are
       * read per call.  If the underlying stream hands back a number of bytes
       * that is not a whole number of characters, the missing bytes are read
       * one at a time; if the stream ends first, the incomplete character is
       * padded with zero bytes.
       *
       * <p>UCS-4 values are narrowed with a <code>(char)</code> cast, so only
       * the low 16 bits of each value are stored.
       *
       * @param      ch     Destination buffer
       * @param      offset Offset at which to start storing characters
       * @param      length Maximum number of characters to read
       *
       * @return     The number of characters read, or -1 if the end of the
       *             stream has been reached
       *
       * @exception  IOException  If an I/O error occurs
       */
      public int read(char ch[], int offset, int length) throws IOException {
          // log2 of the number of bytes per character
          int shift = (fEncoding >= 4) ? 2 : 1;
          int charBytes = 1 << shift;

          // Clamp in characters rather than bytes: this keeps the byte count
          // a multiple of the character width (leaving room for the padding
          // below) and cannot overflow for very large lengths.
          int maxChars = fBuffer.length >> shift;
          int wanted = (length > maxChars) ? maxChars : length;

          int count = fInputStream.read(fBuffer, 0, wanted << shift);
          if (count == -1) {
              return -1;
          }

          // Complete a trailing partial character, if any.  Because the
          // request was a multiple of charBytes, the bytes added here always
          // fit inside the requested region of the buffer.
          int remainder = count & (charBytes - 1);
          if (remainder != 0) {
              boolean endOfInput = false;
              for (int i = remainder; i < charBytes; i++) {
                  int next = endOfInput ? -1 : fInputStream.read();
                  if (next == -1) {
                      // end of input; something likely went wrong!
                      // Pad buffer with nulls.
                      endOfInput = true;
                      fBuffer[count++] = 0;
                  } else {
                      fBuffer[count++] = (byte) next;
                  }
              }
          }

          // now count is a multiple of the right number of bytes
          int numChars = count >> shift;
          int curPos = 0;
          for (int i = 0; i < numChars; i++) {
              int b0 = fBuffer[curPos++] & 0xff;
              int b1 = fBuffer[curPos++] & 0xff;
              if (fEncoding >= 4) { // UCS-4
                  int b2 = fBuffer[curPos++] & 0xff;
                  int b3 = fBuffer[curPos++] & 0xff;
                  if (fEncoding == UCS4BE) {
                      ch[offset + i] = (char) ((b0 << 24) + (b1 << 16) + (b2 << 8) + b3);
                  } else {
                      ch[offset + i] = (char) ((b3 << 24) + (b2 << 16) + (b1 << 8) + b0);
                  }
              } else { // UCS-2
                  if (fEncoding == UCS2BE) {
                      ch[offset + i] = (char) ((b0 << 8) + b1);
                  } else {
                      ch[offset + i] = (char) ((b1 << 8) + b0);
                  }
              }
          }
          return numChars;
      } // read(char[],int,int)

      /**
       * Skip characters.  This method will block until some characters are
       * available, an I/O error occurs, or the end of the stream is reached.
       *
       * <p>If the underlying stream stops skipping in the middle of a
       * character, the rest of that character is consumed as well so that
       * subsequent reads start on a character boundary; that character is
       * included in the returned count.
       *
       * @param  n  The number of characters to skip
       *
       * @return    The number of characters actually skipped
       *
       * @exception  IOException  If an I/O error occurs
       */
      public long skip(long n) throws IOException {
          // shift is the number of bits to move n leftward to get the number
          // of bytes to skip, and to move the result rightward to get the
          // number of characters effectively skipped.
          int shift = (fEncoding >= 4) ? 2 : 1;
          int charBytes = 1 << shift;
          long bytesSkipped = fInputStream.skip(n << shift);
          int remainder = (int) (bytesSkipped & (charBytes - 1));
          if (remainder == 0) {
              return bytesSkipped >> shift;
          }
          // Finish the partially skipped character to stay aligned.
          for (int i = remainder; i < charBytes; i++) {
              if (fInputStream.read() == -1) {
                  break;
              }
          }
          return (bytesSkipped >> shift) + 1;
      } // skip(long):long

      /**
       * Tell whether this stream is ready to be read.
       *
       * @return True if the next read() is guaranteed not to block for input,
       * false otherwise.  Note that returning false does not guarantee that the
       * next read will block.  This implementation always returns false.
       *
       * @exception  IOException  If an I/O error occurs
       */
      public boolean ready() throws IOException {
          return false;
      } // ready()

      /**
       * Tell whether this stream supports the mark() operation; this is
       * the case exactly when the underlying input stream supports it.
       */
      public boolean markSupported() {
          return fInputStream.markSupported();
      } // markSupported()

      /**
       * Mark the present position in the stream.  Subsequent calls to reset()
       * will attempt to reposition the stream to this point.  Not all
       * character-input streams support the mark() operation.
       *
       * <p>The limit is handed to the underlying input stream unchanged.
       *
       * @param  readAheadLimit  Limit passed to the underlying stream's
       *                         mark() method.
       *
       * @exception  IOException  If the stream does not support mark(),
       *                          or if some other I/O error occurs
       */
      public void mark(int readAheadLimit) throws IOException {
          fInputStream.mark(readAheadLimit);
      } // mark(int)

      /**
       * Reset the stream by delegating to the underlying input stream's
       * reset() method.
       *
       * @exception  IOException  If the stream has not been marked,
       *                          or if the mark has been invalidated,
       *                          or if the stream does not support reset(),
       *                          or if some other I/O error occurs
       */
      public void reset() throws IOException {
          fInputStream.reset();
      } // reset()

      /**
       * Close the stream by closing the underlying input stream.
       *
       * @exception  IOException  If an I/O error occurs
       */
      public void close() throws IOException {
          fInputStream.close();
      } // close()

  } // class UCSReader
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: xerces-cvs-unsubscribe@xml.apache.org
For additional commands, e-mail: xerces-cvs-help@xml.apache.org