You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xerces.apache.org by an...@apache.org on 2001/06/27 07:15:08 UTC
cvs commit: xml-xerces/java/src/org/apache/xerces/impl XMLEntityManager.java
andyc 01/06/26 22:15:08
Modified: java/src/org/apache/xerces/impl Tag: xerces_j_2
XMLEntityManager.java
Log:
Fixed a bug that prevented the parser from correctly parsing
UTF-8 files created with Microsoft tools, which insert a byte
order mark (BOM) at the beginning of the UTF-8 file (even
though UTF-8 does not need one). Now the BOM is consumed. Yummy!
Revision Changes Path
No revision
No revision
1.1.2.78 +29 -3 xml-xerces/java/src/org/apache/xerces/impl/Attic/XMLEntityManager.java
Index: XMLEntityManager.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/Attic/XMLEntityManager.java,v
retrieving revision 1.1.2.77
retrieving revision 1.1.2.78
diff -u -r1.1.2.77 -r1.1.2.78
--- XMLEntityManager.java 2001/05/18 18:02:33 1.1.2.77
+++ XMLEntityManager.java 2001/06/27 05:15:06 1.1.2.78
@@ -117,7 +117,7 @@
* @author Andy Clark, IBM
* @author Arnaud Le Hors, IBM
*
- * @version $Id: XMLEntityManager.java,v 1.1.2.77 2001/05/18 18:02:33 lehors Exp $
+ * @version $Id: XMLEntityManager.java,v 1.1.2.78 2001/06/27 05:15:06 andyc Exp $
*/
public class XMLEntityManager
implements XMLComponent {
@@ -970,7 +970,20 @@
System.out.println("$$$ wrapping input stream in PushbackInputStream");
}
PushbackInputStream pbstream = new PushbackInputStream(stream, 4);
- pbstream.unread(b4, 0, count);
+ int offset = 0;
+ // Special case UTF-8 files with BOM created by Microsoft
+ // tools. It's more efficient to consume the BOM than make
+ // the reader perform extra checks. -Ac
+ if (count > 2 && encoding.equals("UTF-8")) {
+ int b0 = b4[0] & 0xFF;
+ int b1 = b4[1] & 0xFF;
+ int b2 = b4[2] & 0xFF;
+ if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
+ offset = 3;
+ count -= offset;
+ }
+ }
+ pbstream.unread(b4, offset, count);
// REVISIT: Should save the original input stream instead of
// the pushback input stream so that when we swap out
@@ -1067,12 +1080,25 @@
return "UTF-16";
}
+ // default to UTF-8 if we don't have enough bytes to make a
+ // good determination of the encoding
+ if (count < 3) {
+ return "UTF-8";
+ }
+
+ // UTF-8 with a BOM
+ int b2 = b4[2] & 0xFF;
+ if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
+ return "UTF-8";
+ }
+
+ // default to UTF-8 if we don't have enough bytes to make a
+ // good determination of the encoding
if (count < 4) {
return "UTF-8";
}
// other encodings
- int b2 = b4[2] & 0xFF;
int b3 = b4[3] & 0xFF;
if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
// UCS-4, big endian (1234)
---------------------------------------------------------------------
To unsubscribe, e-mail: xerces-cvs-unsubscribe@xml.apache.org
For additional commands, e-mail: xerces-cvs-help@xml.apache.org