You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xerces.apache.org by an...@locus.apache.org on 2000/11/04 00:54:13 UTC

cvs commit: xml-xerces/java/src/org/apache/xerces/impl/io UTF8Reader.java

andyc       00/11/03 15:54:12

  Modified:    java/src/org/apache/xerces/impl/io Tag: xerces_j_2
                        UTF8Reader.java
  Log:
  Bug fixes.
  
  Revision  Changes    Path
  No                   revision
  
  
  No                   revision
  
  
  1.1.2.8   +107 -52   xml-xerces/java/src/org/apache/xerces/impl/io/Attic/UTF8Reader.java
  
  Index: UTF8Reader.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/io/Attic/UTF8Reader.java,v
  retrieving revision 1.1.2.7
  retrieving revision 1.1.2.8
  diff -u -r1.1.2.7 -r1.1.2.8
  --- UTF8Reader.java	2000/10/27 17:20:23	1.1.2.7
  +++ UTF8Reader.java	2000/11/03 23:54:11	1.1.2.8
  @@ -65,7 +65,7 @@
   /**
    * @author Andy Clark, IBM
    *
  - * @version $Id: UTF8Reader.java,v 1.1.2.7 2000/10/27 17:20:23 andyc Exp $
  + * @version $Id: UTF8Reader.java,v 1.1.2.8 2000/11/03 23:54:11 andyc Exp $
    */
   public class UTF8Reader
       extends Reader {
  @@ -77,6 +77,14 @@
       /** Default byte buffer size (2048). */
       public static final int DEFAULT_BUFFER_SIZE = 2048;
   
  +    // debugging
  +
  +    /** Debug read. */
  +    private static final boolean DEBUG_READ = false;
  +
  +    /** Debug buffer boundary. */
  +    private static final boolean DEBUG_BUFFER_BOUNDARY = false;
  +
       //
       // Data
       //
  @@ -87,12 +95,12 @@
       /** Byte buffer. */
       protected byte[] fBuffer;
   
  +    /** Offset into buffer. */
  +    protected int fOffset;
  +
       /** Surrogate character. */
       private int fSurrogate = -1;
   
  -    /** Buffer offset to start reading from. */
  -    private int fOffset;
  -
       //
       // Constructors
       //
  @@ -138,9 +146,16 @@
        */
       public int read() throws IOException {
   
  +        // decode character
           int c = fSurrogate;
           if (fSurrogate == -1) {
  -            int b0 = fInputStream.read();
  +            // NOTE: We use the index into the buffer if there are remaining
  +            //       bytes from the last block read. -Ac
  +            int index = 0;
  +
  +            // get first byte
  +            int b0 = index == fOffset 
  +                   ? fInputStream.read() : fBuffer[index++] & 0x00FF;
               if (b0 == -1) {
                   return -1;
               }
  @@ -154,12 +169,13 @@
               // UTF-8:   [110y yyyy] [10xx xxxx]
               // Unicode: [0000 0yyy] [yyxx xxxx]
               else if ((b0 & 0xE0) == 0xC0) {
  -                int b1 = fInputStream.read();
  +                int b1 = index == fOffset 
  +                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;
                   if (b1 == -1) {
  -                    expectedByte(2, 2);
  +                    expectedByte(2, 2, index != fOffset ? 0 : index - 1);
                   }
                   if ((b1 & 0xC0) != 0x80) {
  -                    invalidByte(2, 2, b1);
  +                    invalidByte(2, 2, b1, index != fOffset ? 0 : index - 1);
                   }
                   c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
               }
  @@ -167,19 +183,21 @@
               // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]
               // Unicode: [zzzz yyyy] [yyxx xxxx]
               else if ((b0 & 0xF0) == 0xE0) {
  -                int b1 = fInputStream.read();
  +                int b1 = index == fOffset
  +                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;
                   if (b1 == -1) {
  -                    expectedByte(2, 3);
  +                    expectedByte(2, 3, index != fOffset ? 0 : index - 1);
                   }
                   if ((b1 & 0xC0) != 0x80) {
  -                    invalidByte(2, 3, b1);
  +                    invalidByte(2, 3, b1, index != fOffset ? 0 : index - 1);
                   }
  -                int b2 = fInputStream.read();
  +                int b2 = index == fOffset 
  +                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;
                   if (b2 == -1) {
  -                    expectedByte(3, 3);
  +                    expectedByte(3, 3, index != fOffset ? 0 : index - 1);
                   }
                   if ((b2 & 0xC0) != 0x80) {
  -                    invalidByte(3, 3, b2);
  +                    invalidByte(3, 3, b2, index != fOffset ? 0 : index - 1);
                   }
                   c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
                       (b2 & 0x003F);
  @@ -190,26 +208,29 @@
               //          [1101 11yy] [yyxx xxxx] (low surrogate)
               //          * uuuuu = wwww + 1
               else if ((b0 & 0xF8) == 0xF0) {
  -                int b1 = fInputStream.read();
  +                int b1 = index == fOffset 
  +                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;
                   if (b1 == -1) {
  -                    expectedByte(2, 4);
  +                    expectedByte(2, 4, index != fOffset ? 0 : index - 1);
                   }
                   if ((b1 & 0xC0) != 0x80) {
  -                    invalidByte(2, 3, b1);
  +                    invalidByte(2, 3, b1, index != fOffset ? 0 : index - 1);
                   }
  -                int b2 = fInputStream.read();
  +                int b2 = index == fOffset 
  +                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;
                   if (b2 == -1) {
  -                    expectedByte(3, 4);
  +                    expectedByte(3, 4, index != fOffset ? 0 : index - 1);
                   }
                   if ((b2 & 0xC0) != 0x80) {
  -                    invalidByte(3, 3, b2);
  +                    invalidByte(3, 3, b2, index != fOffset ? 0 : index - 1);
                   }
  -                int b3 = fInputStream.read();
  +                int b3 = index == fOffset 
  +                       ? fInputStream.read() : fBuffer[index++] & 0x00FF;
                   if (b3 == -1) {
  -                    expectedByte(4, 4);
  +                    expectedByte(4, 4, index != fOffset ? 0 : index - 1);
                   }
                   if ((b3 & 0xC0) != 0x80) {
  -                    invalidByte(4, 4, b3);
  +                    invalidByte(4, 4, b3, index != fOffset ? 0 : index - 1);
                   }
                   int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
                   int wwww = uuuuu - 1;
  @@ -223,12 +244,19 @@
   
               // error
               else {
  -                invalidByte(1, 1, b0);
  +                invalidByte(1, 1, b0, index != fOffset ? 0 : index - 1);
               }
           }
  +
  +        // use surrogate
           else {
               fSurrogate = -1;
           }
  +
  +        // return character
  +        if (DEBUG_READ) {
  +            System.out.println("read(): 0x"+Integer.toHexString(c));
  +        }
           return c;
   
       } // read():int
  @@ -265,16 +293,17 @@
           int count = fInputStream.read(fBuffer, fOffset, length);
           if (count == -1) {
               if (fOffset > 0) {
  -                expectedByte(fOffset, 4 - fOffset);
  +                expectedByte(fOffset, 4 - fOffset, 0);
               }
               return -1;
           }
  +        count += fOffset;
           fOffset = 0;
   
           // convert bytes to characters
           final int total = count;
           for (int in = 0, out = offset; in < total; in++) {
  -            int b0 = fBuffer[in];
  +            int b0 = fBuffer[in] & 0x00FF;
   
               // UTF-8:   [0xxx xxxx]
               // Unicode: [0000 0000] [0xxx xxxx]
  @@ -287,14 +316,17 @@
               // Unicode: [0000 0yyy] [yyxx xxxx]
               if ((b0 & 0xE0) == 0xC0) {
                   if (++in == total) {
  +                    if (DEBUG_BUFFER_BOUNDARY) {
  +                        System.out.println("*** buffer boundary (1,2) 0x"+Integer.toHexString(b0));
  +                    }
                       fBuffer[0] = (byte)b0;
                       fOffset = 1;
                       count -= fOffset;
                       break;
                   }
  -                int b1 = fBuffer[in];
  +                int b1 = fBuffer[in] & 0x00FF;
                   if ((b1 & 0xC0) != 0x80) {
  -                    invalidByte(2, 2, b1);
  +                    invalidByte(2, 2, b1, in);
                   }
                   int c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
                   ch[out++] = (char)c;
  @@ -306,25 +338,31 @@
               // Unicode: [zzzz yyyy] [yyxx xxxx]
               if ((b0 & 0xF0) == 0xE0) {
                   if (++in == total) {
  +                    if (DEBUG_BUFFER_BOUNDARY) {
  +                        System.out.println("*** buffer boundary (1,3) 0x"+Integer.toHexString(b0));
  +                    }
                       fBuffer[0] = (byte)b0;
                       fOffset = 1;
                       count -= fOffset;
                       break;
                   }
  -                int b1 = fBuffer[in];
  +                int b1 = fBuffer[in] & 0x00FF;
                   if ((b1 & 0xC0) != 0x80) {
  -                    invalidByte(2, 3, b1);
  +                    invalidByte(2, 3, b1, in);
                   }
                   if (++in == total) {
  +                    if (DEBUG_BUFFER_BOUNDARY) {
  +                        System.out.println("*** buffer boundary (2,3) 0x"+Integer.toHexString(b0)+" 0x"+Integer.toHexString(b1));
  +                    }
                       fBuffer[0] = (byte)b0;
                       fBuffer[1] = (byte)b1;
                       fOffset = 2;
                       count -= fOffset;
                       break;
                   }
  -                int b2 = fBuffer[in];
  +                int b2 = fBuffer[in] & 0x00FF;
                   if ((b2 & 0xC0) != 0x80) {
  -                    invalidByte(3, 3, b2);
  +                    invalidByte(3, 3, b2, in);
                   }
                   int c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
                           (b2 & 0x003F);
  @@ -339,27 +377,36 @@
               //          * uuuuu = wwww + 1
               if ((b0 & 0xF8) == 0xF0) {
                   if (++in == total) {
  +                    if (DEBUG_BUFFER_BOUNDARY) {
  +                        System.out.println("*** buffer boundary (1,4) 0x"+Integer.toHexString(b0));
  +                    }
                       fBuffer[0] = (byte)b0;
                       fOffset = 1;
                       count -= fOffset;
                       break;
                   }
  -                int b1 = fBuffer[in];
  +                int b1 = fBuffer[in] & 0x00FF;
                   if ((b1 & 0xC0) != 0x80) {
  -                    invalidByte(2, 4, b1);
  +                    invalidByte(2, 4, b1, in);
                   }
                   if (++in == total) {
  +                    if (DEBUG_BUFFER_BOUNDARY) {
  +                        System.out.println("*** buffer boundary (2,4) 0x"+Integer.toHexString(b0)+" 0x"+Integer.toHexString(b1));
  +                    }
                       fBuffer[0] = (byte)b0;
                       fBuffer[1] = (byte)b1;
                       fOffset = 2;
                       count -= fOffset;
                       break;
                   }
  -                int b2 = fBuffer[in];
  +                int b2 = fBuffer[in] & 0x00FF;
                   if ((b2 & 0xC0) != 0x80) {
  -                    invalidByte(3, 4, b2);
  +                    invalidByte(3, 4, b2, in);
                   }
                   if (++in == total) {
  +                    if (DEBUG_BUFFER_BOUNDARY) {
  +                        System.out.println("*** buffer boundary (3,4) 0x"+Integer.toHexString(b0)+" 0x"+Integer.toHexString(b1)+" 0x"+Integer.toHexString(2));
  +                    }
                       fBuffer[0] = (byte)b0;
                       fBuffer[1] = (byte)b1;
                       fBuffer[2] = (byte)b2;
  @@ -367,25 +414,20 @@
                       count -= fOffset;
                       break;
                   }
  -                int b3 = fBuffer[in];
  +                int b3 = fBuffer[in] & 0x00FF;
                   if ((b3 & 0xC0) != 0x80) {
  -                    invalidByte(4, 4, b3);
  +                    invalidByte(4, 4, b3, in);
                   }
  -                if (out + 2 >= offset + length) {
  -                    fBuffer[0] = (byte)b0;
  -                    fBuffer[1] = (byte)b1;
  -                    fBuffer[2] = (byte)b2;
  -                    fBuffer[3] = (byte)b3;
  -                    fOffset = 4;
  -                    count -= fOffset;
  -                    break;
  -                }
  +
  +                // decode bytes into surrogate characters
                   int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
                   int wwww = uuuuu - 1;
                   int hs = 0xD800 | 
                            ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) | 
                            ((b2 >> 4) & 0x0003);
                   int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);
  +
  +                // set characters; save low surrogate if it doesn't fit
                   ch[out++] = (char)hs;
                   if (out < offset + length) {
                       ch[out++] = (char)ls;
  @@ -398,10 +440,13 @@
               }
   
               // error
  -            invalidByte(1, 1, b0);
  +            invalidByte(1, 1, b0, in);
           }
   
           // return number of characters converted
  +        if (DEBUG_READ) {
  +            System.out.println("read(char[],"+offset+','+length+"): count="+count);
  +        }
           return count;
   
       } // read(char[],int,int)
  @@ -507,7 +552,7 @@
       //
   
       /** Throws an exception for expected byte. */
  -    private void expectedByte(int position, int count)
  +    private void expectedByte(int position, int count, int bufferPos)
           throws UTFDataFormatException {
   
           StringBuffer str = new StringBuffer();
  @@ -516,14 +561,19 @@
           str.append(" of ");
           str.append(count);
           str.append("-byte UTF-8 sequence");
  +        if (DEBUG_BUFFER_BOUNDARY) {
  +            str.append(" [");
  +            str.append(bufferPos);
  +            str.append(']');
  +        }
   
           String message = str.toString();
           throw new UTFDataFormatException(message);
   
  -    } // expectedByte(int,int)
  +    } // expectedByte(int,int,int)
   
       /** Throws an exception for invalid byte. */
  -    private void invalidByte(int position, int count, int c) 
  +    private void invalidByte(int position, int count, int c, int bufferPos) 
           throws UTFDataFormatException {
   
           StringBuffer str = new StringBuffer();
  @@ -534,10 +584,15 @@
           str.append("-byte UTF-8 sequence (0x");
           str.append(Integer.toHexString(c));
           str.append(')');
  +        if (DEBUG_BUFFER_BOUNDARY) {
  +            str.append(" [");
  +            str.append(bufferPos);
  +            str.append(']');
  +        }
   
           String message = str.toString();
           throw new UTFDataFormatException(message);
   
  -    } // invalidByte(int,int,int)
  +    } // invalidByte(int,int,int,int)
   
   } // class UTF8Reader