You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xerces.apache.org by sa...@apache.org on 2003/06/17 19:57:14 UTC

cvs commit: xml-xerces/java/src/org/apache/xerces/impl/io UTF8Reader.java

sandygao    2003/06/17 10:57:14

  Modified:    java/src/org/apache/xerces/impl/io UTF8Reader.java
  Log:
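  A performance improvement for UTF-8 scanning: the byte-to-character
  conversion loop now copies the leading run of ASCII bytes directly into
  the output buffer and only falls back to the general multi-byte decoding
  loop at the first non-ASCII byte. Also removes trailing whitespace and
  replaces tabs with spaces.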
  
  Revision  Changes    Path
  1.6       +52 -39    xml-xerces/java/src/org/apache/xerces/impl/io/UTF8Reader.java
  
  Index: UTF8Reader.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/io/UTF8Reader.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- UTF8Reader.java	23 Jul 2002 20:53:19 -0000	1.5
  +++ UTF8Reader.java	17 Jun 2003 17:57:14 -0000	1.6
  @@ -113,8 +113,8 @@
       // Constructors
       //
   
  -    /** 
  -     * Constructs a UTF-8 reader from the specified input stream 
  +    /**
  +     * Constructs a UTF-8 reader from the specified input stream
        * using the default buffer size.  Primarily for testing.
        *
        * @param inputStream The input stream.
  @@ -123,8 +123,8 @@
           this(inputStream, DEFAULT_BUFFER_SIZE, new XMLMessageFormatter(), Locale.getDefault());
       } // <init>(InputStream, MessageFormatter)
   
  -    /** 
  -     * Constructs a UTF-8 reader from the specified input stream 
  +    /**
  +     * Constructs a UTF-8 reader from the specified input stream
        * using the default buffer size and the given MessageFormatter.
        *
        * @param inputStream The input stream.
  @@ -136,8 +136,8 @@
           this(inputStream, DEFAULT_BUFFER_SIZE, messageFormatter, locale);
       } // <init>(InputStream, MessageFormatter)
   
  -    /** 
  -     * Constructs a UTF-8 reader from the specified input stream, 
  +    /**
  +     * Constructs a UTF-8 reader from the specified input stream,
        * buffer size and MessageFormatter.
        *
        * @param inputStream The input stream.
  @@ -180,7 +180,7 @@
               int index = 0;
   
               // get first byte
  -            int b0 = index == fOffset 
  +            int b0 = index == fOffset
                      ? fInputStream.read() : fBuffer[index++] & 0x00FF;
               if (b0 == -1) {
                   return -1;
  @@ -195,7 +195,7 @@
               // UTF-8:   [110y yyyy] [10xx xxxx]
               // Unicode: [0000 0yyy] [yyxx xxxx]
               else if ((b0 & 0xE0) == 0xC0) {
  -                int b1 = index == fOffset 
  +                int b1 = index == fOffset
                          ? fInputStream.read() : fBuffer[index++] & 0x00FF;
                   if (b1 == -1) {
                       expectedByte(2, 2);
  @@ -217,7 +217,7 @@
                   if ((b1 & 0xC0) != 0x80) {
                       invalidByte(2, 3, b1);
                   }
  -                int b2 = index == fOffset 
  +                int b2 = index == fOffset
                          ? fInputStream.read() : fBuffer[index++] & 0x00FF;
                   if (b2 == -1) {
                       expectedByte(3, 3);
  @@ -234,7 +234,7 @@
               //          [1101 11yy] [yyxx xxxx] (low surrogate)
               //          * uuuuu = wwww + 1
               else if ((b0 & 0xF8) == 0xF0) {
  -                int b1 = index == fOffset 
  +                int b1 = index == fOffset
                          ? fInputStream.read() : fBuffer[index++] & 0x00FF;
                   if (b1 == -1) {
                       expectedByte(2, 4);
  @@ -242,7 +242,7 @@
                   if ((b1 & 0xC0) != 0x80) {
                       invalidByte(2, 3, b1);
                   }
  -                int b2 = index == fOffset 
  +                int b2 = index == fOffset
                          ? fInputStream.read() : fBuffer[index++] & 0x00FF;
                   if (b2 == -1) {
                       expectedByte(3, 4);
  @@ -250,7 +250,7 @@
                   if ((b2 & 0xC0) != 0x80) {
                       invalidByte(3, 3, b2);
                   }
  -                int b3 = index == fOffset 
  +                int b3 = index == fOffset
                          ? fInputStream.read() : fBuffer[index++] & 0x00FF;
                   if (b3 == -1) {
                       expectedByte(4, 4);
  @@ -263,8 +263,8 @@
                       invalidSurrogate(uuuuu);
                   }
                   int wwww = uuuuu - 1;
  -                int hs = 0xD800 | 
  -                         ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) | 
  +                int hs = 0xD800 |
  +                         ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) |
                            ((b2 >> 4) & 0x0003);
                   int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);
                   c = hs;
  @@ -334,7 +334,7 @@
           // skip read; last character was in error
           // NOTE: Having an offset value other than zero means that there was
           //       an error in the last character read. In this case, we have
  -        //       skipped the read so we don't consume any bytes past the 
  +        //       skipped the read so we don't consume any bytes past the
           //       error. By signalling the error on the next block read we
           //       allow the method to return the most valid characters that
           //       it can on the previous block read. -Ac
  @@ -345,22 +345,35 @@
   
           // convert bytes to characters
           final int total = count;
  -        for (int in = 0; in < total; in++) {
  -            int b0 = fBuffer[in] & 0x00FF;
  +        int in;
  +        byte byte1;
  +        final byte byte0 = 0;
  +        for (in = 0; in < total; in++) {
  +            byte1 = fBuffer[in];
  +            if (byte1 >= byte0) {
  +                ch[out++] = (char)byte1;
  +            }
  +            else   {
  +                break;
  +            }
  +        }
  +        for ( ; in < total; in++) {
  +            byte1 = fBuffer[in];
   
               // UTF-8:   [0xxx xxxx]
               // Unicode: [0000 0000] [0xxx xxxx]
  -            if (b0 < 0x80) {
  -                ch[out++] = (char)b0;
  +            if (byte1 >= byte0) {
  +                ch[out++] = (char)byte1;
                   continue;
               }
   
               // UTF-8:   [110y yyyy] [10xx xxxx]
               // Unicode: [0000 0yyy] [yyxx xxxx]
  +            int b0 = byte1 & 0x0FF;
               if ((b0 & 0xE0) == 0xC0) {
                   int b1 = -1;
  -                if (++in < total) { 
  -                    b1 = fBuffer[in] & 0x00FF; 
  +                if (++in < total) {
  +                    b1 = fBuffer[in] & 0x00FF;
                   }
                   else {
                       b1 = fInputStream.read();
  @@ -393,8 +406,8 @@
               // Unicode: [zzzz yyyy] [yyxx xxxx]
               if ((b0 & 0xF0) == 0xE0) {
                   int b1 = -1;
  -                if (++in < total) { 
  -                    b1 = fBuffer[in] & 0x00FF; 
  +                if (++in < total) {
  +                    b1 = fBuffer[in] & 0x00FF;
                   }
                   else {
                       b1 = fInputStream.read();
  @@ -418,8 +431,8 @@
                       invalidByte(2, 3, b1);
                   }
                   int b2 = -1;
  -                if (++in < total) { 
  -                    b2 = fBuffer[in] & 0x00FF; 
  +                if (++in < total) {
  +                    b2 = fBuffer[in] & 0x00FF;
                   }
                   else {
                       b2 = fInputStream.read();
  @@ -457,8 +470,8 @@
               //          * uuuuu = wwww + 1
               if ((b0 & 0xF8) == 0xF0) {
                   int b1 = -1;
  -                if (++in < total) { 
  -                    b1 = fBuffer[in] & 0x00FF; 
  +                if (++in < total) {
  +                    b1 = fBuffer[in] & 0x00FF;
                   }
                   else {
                       b1 = fInputStream.read();
  @@ -482,8 +495,8 @@
                       invalidByte(2, 4, b1);
                   }
                   int b2 = -1;
  -                if (++in < total) { 
  -                    b2 = fBuffer[in] & 0x00FF; 
  +                if (++in < total) {
  +                    b2 = fBuffer[in] & 0x00FF;
                   }
                   else {
                       b2 = fInputStream.read();
  @@ -509,8 +522,8 @@
                       invalidByte(3, 4, b2);
                   }
                   int b3 = -1;
  -                if (++in < total) { 
  -                    b3 = fBuffer[in] & 0x00FF; 
  +                if (++in < total) {
  +                    b3 = fBuffer[in] & 0x00FF;
                   }
                   else {
                       b3 = fInputStream.read();
  @@ -614,14 +627,14 @@
        * @exception  IOException  If an I/O error occurs
        */
       public boolean ready() throws IOException {
  -	    return false;
  +        return false;
       } // ready()
   
       /**
        * Tell whether this stream supports the mark() operation.
        */
       public boolean markSupported() {
  -	    return false;
  +        return false;
       } // markSupported()
   
       /**
  @@ -638,7 +651,7 @@
        *                          or if some other I/O error occurs
        */
       public void mark(int readAheadLimit) throws IOException {
  -	    throw new IOException(fFormatter.formatMessage(fLocale, "OperationNotSupported", new Object[]{"mark()", "UTF-8"}));
  +        throw new IOException(fFormatter.formatMessage(fLocale, "OperationNotSupported", new Object[]{"mark()", "UTF-8"}));
       } // mark(int)
   
       /**
  @@ -685,10 +698,10 @@
       } // expectedByte(int,int,int)
   
       /** Throws an exception for invalid byte. */
  -    private void invalidByte(int position, int count, int c) 
  +    private void invalidByte(int position, int count, int c)
           throws UTFDataFormatException {
   
  -        String message = fFormatter.formatMessage(fLocale, "InvalidByte", 
  +        String message = fFormatter.formatMessage(fLocale, "InvalidByte",
                   new Object [] {Integer.toString(position), Integer.toString(count)});
           throw new UTFDataFormatException(message);
   
  @@ -696,11 +709,11 @@
   
       /** Throws an exception for invalid surrogate bits. */
       private void invalidSurrogate(int uuuuu) throws UTFDataFormatException {
  -        
  +
           StringBuffer str = new StringBuffer();
           str.append("high surrogate bits in UTF-8 sequence must not exceed 0x10 but found 0x");
   
  -        String message = fFormatter.formatMessage(fLocale, "InvalidHighSurrogate", 
  +        String message = fFormatter.formatMessage(fLocale, "InvalidHighSurrogate",
                   new Object[] {Integer.toHexString(uuuuu)});
           throw new UTFDataFormatException(message);
   
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: xerces-cvs-unsubscribe@xml.apache.org
For additional commands, e-mail: xerces-cvs-help@xml.apache.org