You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xerces.apache.org by an...@apache.org on 2001/06/27 07:15:08 UTC
cvs commit: xml-xerces/java/src/org/apache/xerces/impl XMLEntityManager.java

andyc       01/06/26 22:15:08

  Modified:    java/src/org/apache/xerces/impl Tag: xerces_j_2
                        XMLEntityManager.java
  Log:
  Fixed bug that prevented the parser from correctly parsing
  UTF-8 files created with Microsoft tools that insert a byte
  order mark (BOM) at the beginning of the UTF-8 file (even
  though it's not needed). Now the BOM is consumed. Yummy!
  
  Revision  Changes    Path
  No                   revision
  
  
  No                   revision
  
  
  1.1.2.78  +29 -3     xml-xerces/java/src/org/apache/xerces/impl/Attic/XMLEntityManager.java
  
  Index: XMLEntityManager.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/Attic/XMLEntityManager.java,v
  retrieving revision 1.1.2.77
  retrieving revision 1.1.2.78
  diff -u -r1.1.2.77 -r1.1.2.78
  --- XMLEntityManager.java	2001/05/18 18:02:33	1.1.2.77
  +++ XMLEntityManager.java	2001/06/27 05:15:06	1.1.2.78
  @@ -117,7 +117,7 @@
    * @author Andy Clark, IBM
    * @author Arnaud  Le Hors, IBM
    *
  - * @version $Id: XMLEntityManager.java,v 1.1.2.77 2001/05/18 18:02:33 lehors Exp $
  + * @version $Id: XMLEntityManager.java,v 1.1.2.78 2001/06/27 05:15:06 andyc Exp $
    */
   public class XMLEntityManager
       implements XMLComponent {
  @@ -970,7 +970,20 @@
                           System.out.println("$$$ wrapping input stream in PushbackInputStream");
                       }
                       PushbackInputStream pbstream = new PushbackInputStream(stream, 4);
  -                    pbstream.unread(b4, 0, count);
  +                    int offset = 0;
  +                    // Special case UTF-8 files with BOM created by Microsoft
  +                    // tools. It's more efficient to consume the BOM than make
  +                    // the reader perform extra checks. -Ac
  +                    if (count > 2 && encoding.equals("UTF-8")) {
  +                        int b0 = b4[0] & 0xFF;
  +                        int b1 = b4[1] & 0xFF;
  +                        int b2 = b4[2] & 0xFF;
  +                        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
  +                            offset = 3;
  +                            count -= offset;
  +                        }
  +                    }
  +                    pbstream.unread(b4, offset, count);
   
                       // REVISIT: Should save the original input stream instead of
                       //          the pushback input stream so that when we swap out
  @@ -1067,12 +1080,25 @@
               return "UTF-16";
           }
   
  +        // default to UTF-8 if we don't have enough bytes to make a
  +        // good determination of the encoding
  +        if (count < 3) {
  +            return "UTF-8";
  +        }
  +
  +        // UTF-8 with a BOM
  +        int b2 = b4[2] & 0xFF;
  +        if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
  +            return "UTF-8";
  +        }
  +
  +        // default to UTF-8 if we don't have enough bytes to make a
  +        // good determination of the encoding
           if (count < 4) {
               return "UTF-8";
           }
   
           // other encodings
  -        int b2 = b4[2] & 0xFF;
           int b3 = b4[3] & 0xFF;
           if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
               // UCS-4, big endian (1234)
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: xerces-cvs-unsubscribe@xml.apache.org
For additional commands, e-mail: xerces-cvs-help@xml.apache.org