You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xerces.apache.org by an...@locus.apache.org on 2000/10/20 00:31:39 UTC
cvs commit: xml-xerces/java/src/org/apache/xerces/impl/io ASCIIReader.java UTF8Reader.java
andyc 00/10/19 15:31:39
Modified: java/src/org/apache/xerces/impl Tag: xerces_j_2
XMLEntityManager.java
Added: java/src/org/apache/xerces/impl/io Tag: xerces_j_2
ASCIIReader.java UTF8Reader.java
Log:
Wrote optimized US-ASCII and UTF-8 readers and plugged them into
the parser. They're much faster than the equivalent ones provided
with Java.
Revision Changes Path
No revision
No revision
1.1.2.28 +155 -118 xml-xerces/java/src/org/apache/xerces/impl/Attic/XMLEntityManager.java
Index: XMLEntityManager.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/Attic/XMLEntityManager.java,v
retrieving revision 1.1.2.27
retrieving revision 1.1.2.28
diff -u -r1.1.2.27 -r1.1.2.28
--- XMLEntityManager.java 2000/10/19 04:16:50 1.1.2.27
+++ XMLEntityManager.java 2000/10/19 22:31:37 1.1.2.28
@@ -72,6 +72,8 @@
import java.util.Stack;
import org.apache.xerces.impl.XMLErrorReporter;
+import org.apache.xerces.impl.io.ASCIIReader;
+import org.apache.xerces.impl.io.UTF8Reader;
import org.apache.xerces.impl.msg.XMLMessageFormatter;
import org.apache.xerces.util.EncodingMap;
@@ -112,7 +114,7 @@
* @author Stubs generated by DesignDoc on Mon Sep 18 18:23:16 PDT 2000
* @author Andy Clark, IBM
*
- * @version $Id: XMLEntityManager.java,v 1.1.2.27 2000/10/19 04:16:50 andyc Exp $
+ * @version $Id: XMLEntityManager.java,v 1.1.2.28 2000/10/19 22:31:37 andyc Exp $
*/
public class XMLEntityManager
implements XMLComponent {
@@ -598,107 +600,10 @@
} // setProperty(String,Object)
//
- // Protected methods
+ // Public static methods
//
/**
- * Starts an entity.
- */
- protected void startEntity(String name, XMLInputSource xmlInputSource,
- boolean literal) throws IOException, SAXException {
-
- // get information
- final String publicId = xmlInputSource.getPublicId();
- final String systemId = xmlInputSource.getSystemId();
- String encoding = xmlInputSource.getEncoding();
-
- // create reader
- InputStream stream = null;
- Reader reader = xmlInputSource.getCharacterStream();
- if (reader == null) {
- stream = xmlInputSource.getByteStream();
- if (stream == null) {
- String expandedSystemId = xmlInputSource.getExpandedSystemId();
- if (expandedSystemId == null) {
- final String baseSystemId = xmlInputSource.getBaseSystemId();
- expandedSystemId = expandSystemId(systemId, baseSystemId);
- xmlInputSource.setExpandedSystemId(expandedSystemId);
- }
- stream = new URL(expandedSystemId).openStream();
- }
-
- // perform auto-detect of encoding
- if (encoding == null) {
- // read first four bytes and determine encoding
- final byte[] b4 = new byte[4];
- int count = stream.read(b4, 0, 4);
- encoding = getJavaEncodingName(b4, count);
-
- // push back the characters we read
- PushbackInputStream pbstream = new PushbackInputStream(stream, 4);
- pbstream.unread(b4, 0, count);
- stream = pbstream;
-
- // REVISIT: Should save the original input stream instead of
- // the pushback input stream so that when we swap out
- // the OneCharReader, we don't still have a method
- // indirection to get at the underlying bytes. -Ac
- }
-
- // create reader from input stream
- // REVISIT: We can use customized readers here. -Ac
- reader = new InputStreamReader(stream, encoding);
-
- // REVISIT: Activate this reader once I've updated the
- // entity scanner. -Ac
- //reader = new OneCharReader(reader);
- }
-
- // push entity on stack
- if (fCurrentEntity != null) {
- fEntityStack.push(fCurrentEntity);
- }
- fCurrentEntity = new ScannedEntity(name, publicId, systemId,
- stream, reader, encoding,
- literal);
-
- // call handler
- if (fEntityHandler != null) {
- String ianaEncoding = encoding != null
- ? EncodingMap.getJava2IANAMapping(encoding)
- : null;
- fEntityHandler.startEntity(name, publicId, systemId, ianaEncoding);
- }
-
- } // startEntity(String,XMLInputSource)
-
- /**
- * Ends an entity.
- */
- protected void endEntity() throws SAXException {
-
- // call handler
- if (DEBUG_PRINT) {
- System.out.println("(endEntity: ");
- print();
- System.out.println();
- }
- if (fEntityHandler != null) {
- fEntityHandler.endEntity(fCurrentEntity.name);
- }
-
- // pop stack
- fCurrentEntity = fEntityStack.size() > 0
- ? (ScannedEntity)fEntityStack.pop() : null;
- if (DEBUG_PRINT) {
- System.out.println(")endEntity: ");
- print();
- System.out.println();
- }
-
- } // endEntity()
-
- /**
* Expands a system id and returns the system id as a URI, if
* it can be expanded. A return value of null means that the
* identifier is already expanded. An exception thrown
@@ -800,22 +705,113 @@
} // expandSystemId(String,String):String
//
- // Private methods
+ // Protected methods
//
/**
- * Returns the Java encoding name that is auto-detected from
+ * Starts an entity.
+ */
+ protected void startEntity(String name, XMLInputSource xmlInputSource,
+ boolean literal) throws IOException, SAXException {
+
+ // get information
+ final String publicId = xmlInputSource.getPublicId();
+ final String systemId = xmlInputSource.getSystemId();
+ String encoding = xmlInputSource.getEncoding();
+
+ // create reader
+ InputStream stream = null;
+ Reader reader = xmlInputSource.getCharacterStream();
+ if (reader == null) {
+ stream = xmlInputSource.getByteStream();
+ if (stream == null) {
+ String expandedSystemId = xmlInputSource.getExpandedSystemId();
+ if (expandedSystemId == null) {
+ final String baseSystemId = xmlInputSource.getBaseSystemId();
+ expandedSystemId = expandSystemId(systemId, baseSystemId);
+ xmlInputSource.setExpandedSystemId(expandedSystemId);
+ }
+ stream = new URL(expandedSystemId).openStream();
+ }
+
+ // perform auto-detect of encoding
+ if (encoding == null) {
+ // read first four bytes and determine encoding
+ final byte[] b4 = new byte[4];
+ int count = stream.read(b4, 0, 4);
+ encoding = getEncodingName(b4, count);
+
+ // push back the characters we read
+ PushbackInputStream pbstream = new PushbackInputStream(stream, 4);
+ pbstream.unread(b4, 0, count);
+ stream = pbstream;
+
+ // REVISIT: Should save the original input stream instead of
+ // the pushback input stream so that when we swap out
+ // the OneCharReader, we don't still have a method
+ // indirection to get at the underlying bytes. -Ac
+ }
+
+ // create reader from input stream
+ reader = createReader(stream, encoding);
+
+ // REVISIT: Activate this reader once I've updated the
+ // entity scanner. -Ac
+ //reader = new OneCharReader(reader);
+ }
+
+ // push entity on stack
+ if (fCurrentEntity != null) {
+ fEntityStack.push(fCurrentEntity);
+ }
+ fCurrentEntity = new ScannedEntity(name, publicId, systemId,
+ stream, reader, encoding,
+ literal);
+
+ // call handler
+ if (fEntityHandler != null) {
+ fEntityHandler.startEntity(name, publicId, systemId, encoding);
+ }
+
+ } // startEntity(String,XMLInputSource)
+
+ /**
+ * Ends an entity.
+ */
+ protected void endEntity() throws SAXException {
+
+ // call handler
+ if (DEBUG_PRINT) {
+ System.out.println("(endEntity: ");
+ print();
+ System.out.println();
+ }
+ if (fEntityHandler != null) {
+ fEntityHandler.endEntity(fCurrentEntity.name);
+ }
+
+ // pop stack
+ fCurrentEntity = fEntityStack.size() > 0
+ ? (ScannedEntity)fEntityStack.pop() : null;
+ if (DEBUG_PRINT) {
+ System.out.println(")endEntity: ");
+ print();
+ System.out.println();
+ }
+
+ } // endEntity()
+
+ /**
+ * Returns the IANA encoding name that is auto-detected from
* the bytes specified.
*
* @param b4 The first four bytes of the input.
* @param count The number of bytes actually read.
- *
- * @returns The Java encoding name.
*/
- private static String getJavaEncodingName(byte[] b4, int count) {
+ protected String getEncodingName(byte[] b4, int count) {
if (count < 2) {
- return "UTF8";
+ return "UTF-8";
}
// UTF-16, with BOM
@@ -823,15 +819,15 @@
byte b1 = b4[1];
if (b0 == 0xFE && b1 == 0xFF) {
// UTF-16, big-endian
- return "UnicodeBig";
+ return "UTF-16";
}
if (b0 == 0xFF && b1 == 0xFE) {
// UTF-16, little-endian
- return "UnicodeLittle";
+ return "UTF-16";
}
if (count < 4) {
- return "UTF8";
+ return "UTF-8";
}
// other encodings
@@ -840,50 +836,87 @@
if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
// UCS-4, big endian (1234)
// REVISIT: What should this be?
- return "Unicode";
+ return "UCS-4";
}
if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
// UCS-4, little endian (4321)
// REVISIT: What should this be?
- return "Unicode";
+ return "UCS-4";
}
if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
// UCS-4, unusual octet order (2143)
// REVISIT: What should this be?
- return "Unicode";
+ return "UCS-4";
}
if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
// UCS-4, unusual octet order (3412)
// REVISIT: What should this be?
- return "Unicode";
+ return "UCS-4";
}
if (b1 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
// UTF-16, big-endian, no BOM
// REVISIT: What should this be?
- return "Unicode";
+ return "UCS-4";
}
if (b1 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
// UTF-16, little-endian, no BOM
- return "UnicodeLittle";
+ return "UCS-4";
}
if (b1 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
// EBCDIC
- return "DBCS_EBCDIC";
+ return "EBCDIC";
}
// default encoding
- return "UTF8";
+ return "UTF-8";
- } // getJavaEncodingName(byte[],int):String
+ } // getEncodingName(byte[],int):String
/**
+ * Creates a reader capable of reading the given input stream in
+ * the specified encoding.
+ *
+ * @param inputStream The input stream.
+ * @param ianaEncoding The IANA encoding name that the input stream
+ * is encoded using.
+ *
+ * @return Returns a reader.
+ */
+ protected Reader createReader(InputStream inputStream, String ianaEncoding)
+ throws IOException {
+
+ // normalize encoding name
+ if (ianaEncoding == null) {
+ ianaEncoding = "UTF-8";
+ }
+ ianaEncoding = ianaEncoding.toUpperCase();
+
+ // try to use an optimized reader
+ if (ianaEncoding.equals("UTF-8")) {
+ return new UTF8Reader(inputStream, fBufferSize);
+ }
+ if (ianaEncoding.equals("US-ASCII")) {
+ return new ASCIIReader(inputStream, fBufferSize);
+ }
+
+ // try to use a Java reader
+ String javaEncoding = EncodingMap.getIANA2JavaMapping(ianaEncoding);
+ return new InputStreamReader(inputStream, javaEncoding);
+
+ } // createReader(InputStream,String):Reader
+
+ //
+ // Protected static methods
+ //
+
+ /**
* Fixes a platform dependent filename to standard URI form.
*
* @param str The string to fix.
*
* @return Returns the fixed URI string.
*/
- private static String fixURI(String str) {
+ protected static String fixURI(String str) {
// handle platform dependent strings
str = str.replace(java.io.File.separatorChar, '/');
@@ -903,6 +936,10 @@
return str;
} // fixURI(String):String
+
+ //
+ // Package visible methods
+ //
/** Prints the contents of the buffer. */
final void print() {
No revision
No revision
1.1.2.1 +244 -0 xml-xerces/java/src/org/apache/xerces/impl/io/Attic/ASCIIReader.java
1.1.2.1 +399 -0 xml-xerces/java/src/org/apache/xerces/impl/io/Attic/UTF8Reader.java