You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xerces.apache.org by sa...@apache.org on 2003/06/17 19:57:14 UTC
cvs commit: xml-xerces/java/src/org/apache/xerces/impl/io UTF8Reader.java
sandygao 2003/06/17 10:57:14
Modified: java/src/org/apache/xerces/impl/io UTF8Reader.java
Log:
A performance improvement for UTF8 scanning.
Revision Changes Path
1.6 +52 -39 xml-xerces/java/src/org/apache/xerces/impl/io/UTF8Reader.java
Index: UTF8Reader.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/io/UTF8Reader.java,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -r1.5 -r1.6
--- UTF8Reader.java 23 Jul 2002 20:53:19 -0000 1.5
+++ UTF8Reader.java 17 Jun 2003 17:57:14 -0000 1.6
@@ -113,8 +113,8 @@
// Constructors
//
- /**
- * Constructs a UTF-8 reader from the specified input stream
+ /**
+ * Constructs a UTF-8 reader from the specified input stream
* using the default buffer size. Primarily for testing.
*
* @param inputStream The input stream.
@@ -123,8 +123,8 @@
this(inputStream, DEFAULT_BUFFER_SIZE, new XMLMessageFormatter(), Locale.getDefault());
} // <init>(InputStream, MessageFormatter)
- /**
- * Constructs a UTF-8 reader from the specified input stream
+ /**
+ * Constructs a UTF-8 reader from the specified input stream
* using the default buffer size and the given MessageFormatter.
*
* @param inputStream The input stream.
@@ -136,8 +136,8 @@
this(inputStream, DEFAULT_BUFFER_SIZE, messageFormatter, locale);
} // <init>(InputStream, MessageFormatter)
- /**
- * Constructs a UTF-8 reader from the specified input stream,
+ /**
+ * Constructs a UTF-8 reader from the specified input stream,
* buffer size and MessageFormatter.
*
* @param inputStream The input stream.
@@ -180,7 +180,7 @@
int index = 0;
// get first byte
- int b0 = index == fOffset
+ int b0 = index == fOffset
? fInputStream.read() : fBuffer[index++] & 0x00FF;
if (b0 == -1) {
return -1;
@@ -195,7 +195,7 @@
// UTF-8: [110y yyyy] [10xx xxxx]
// Unicode: [0000 0yyy] [yyxx xxxx]
else if ((b0 & 0xE0) == 0xC0) {
- int b1 = index == fOffset
+ int b1 = index == fOffset
? fInputStream.read() : fBuffer[index++] & 0x00FF;
if (b1 == -1) {
expectedByte(2, 2);
@@ -217,7 +217,7 @@
if ((b1 & 0xC0) != 0x80) {
invalidByte(2, 3, b1);
}
- int b2 = index == fOffset
+ int b2 = index == fOffset
? fInputStream.read() : fBuffer[index++] & 0x00FF;
if (b2 == -1) {
expectedByte(3, 3);
@@ -234,7 +234,7 @@
// [1101 11yy] [yyxx xxxx] (low surrogate)
// * uuuuu = wwww + 1
else if ((b0 & 0xF8) == 0xF0) {
- int b1 = index == fOffset
+ int b1 = index == fOffset
? fInputStream.read() : fBuffer[index++] & 0x00FF;
if (b1 == -1) {
expectedByte(2, 4);
@@ -242,7 +242,7 @@
if ((b1 & 0xC0) != 0x80) {
invalidByte(2, 3, b1);
}
- int b2 = index == fOffset
+ int b2 = index == fOffset
? fInputStream.read() : fBuffer[index++] & 0x00FF;
if (b2 == -1) {
expectedByte(3, 4);
@@ -250,7 +250,7 @@
if ((b2 & 0xC0) != 0x80) {
invalidByte(3, 3, b2);
}
- int b3 = index == fOffset
+ int b3 = index == fOffset
? fInputStream.read() : fBuffer[index++] & 0x00FF;
if (b3 == -1) {
expectedByte(4, 4);
@@ -263,8 +263,8 @@
invalidSurrogate(uuuuu);
}
int wwww = uuuuu - 1;
- int hs = 0xD800 |
- ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) |
+ int hs = 0xD800 |
+ ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) |
((b2 >> 4) & 0x0003);
int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);
c = hs;
@@ -334,7 +334,7 @@
// skip read; last character was in error
// NOTE: Having an offset value other than zero means that there was
// an error in the last character read. In this case, we have
- // skipped the read so we don't consume any bytes past the
+ // skipped the read so we don't consume any bytes past the
// error. By signalling the error on the next block read we
// allow the method to return the most valid characters that
// it can on the previous block read. -Ac
@@ -345,22 +345,35 @@
// convert bytes to characters
final int total = count;
- for (int in = 0; in < total; in++) {
- int b0 = fBuffer[in] & 0x00FF;
+ int in;
+ byte byte1;
+ final byte byte0 = 0;
+ for (in = 0; in < total; in++) {
+ byte1 = fBuffer[in];
+ if (byte1 >= byte0) {
+ ch[out++] = (char)byte1;
+ }
+ else {
+ break;
+ }
+ }
+ for ( ; in < total; in++) {
+ byte1 = fBuffer[in];
// UTF-8: [0xxx xxxx]
// Unicode: [0000 0000] [0xxx xxxx]
- if (b0 < 0x80) {
- ch[out++] = (char)b0;
+ if (byte1 >= byte0) {
+ ch[out++] = (char)byte1;
continue;
}
// UTF-8: [110y yyyy] [10xx xxxx]
// Unicode: [0000 0yyy] [yyxx xxxx]
+ int b0 = byte1 & 0x0FF;
if ((b0 & 0xE0) == 0xC0) {
int b1 = -1;
- if (++in < total) {
- b1 = fBuffer[in] & 0x00FF;
+ if (++in < total) {
+ b1 = fBuffer[in] & 0x00FF;
}
else {
b1 = fInputStream.read();
@@ -393,8 +406,8 @@
// Unicode: [zzzz yyyy] [yyxx xxxx]
if ((b0 & 0xF0) == 0xE0) {
int b1 = -1;
- if (++in < total) {
- b1 = fBuffer[in] & 0x00FF;
+ if (++in < total) {
+ b1 = fBuffer[in] & 0x00FF;
}
else {
b1 = fInputStream.read();
@@ -418,8 +431,8 @@
invalidByte(2, 3, b1);
}
int b2 = -1;
- if (++in < total) {
- b2 = fBuffer[in] & 0x00FF;
+ if (++in < total) {
+ b2 = fBuffer[in] & 0x00FF;
}
else {
b2 = fInputStream.read();
@@ -457,8 +470,8 @@
// * uuuuu = wwww + 1
if ((b0 & 0xF8) == 0xF0) {
int b1 = -1;
- if (++in < total) {
- b1 = fBuffer[in] & 0x00FF;
+ if (++in < total) {
+ b1 = fBuffer[in] & 0x00FF;
}
else {
b1 = fInputStream.read();
@@ -482,8 +495,8 @@
invalidByte(2, 4, b1);
}
int b2 = -1;
- if (++in < total) {
- b2 = fBuffer[in] & 0x00FF;
+ if (++in < total) {
+ b2 = fBuffer[in] & 0x00FF;
}
else {
b2 = fInputStream.read();
@@ -509,8 +522,8 @@
invalidByte(3, 4, b2);
}
int b3 = -1;
- if (++in < total) {
- b3 = fBuffer[in] & 0x00FF;
+ if (++in < total) {
+ b3 = fBuffer[in] & 0x00FF;
}
else {
b3 = fInputStream.read();
@@ -614,14 +627,14 @@
* @exception IOException If an I/O error occurs
*/
public boolean ready() throws IOException {
- return false;
+ return false;
} // ready()
/**
* Tell whether this stream supports the mark() operation.
*/
public boolean markSupported() {
- return false;
+ return false;
} // markSupported()
/**
@@ -638,7 +651,7 @@
* or if some other I/O error occurs
*/
public void mark(int readAheadLimit) throws IOException {
- throw new IOException(fFormatter.formatMessage(fLocale, "OperationNotSupported", new Object[]{"mark()", "UTF-8"}));
+ throw new IOException(fFormatter.formatMessage(fLocale, "OperationNotSupported", new Object[]{"mark()", "UTF-8"}));
} // mark(int)
/**
@@ -685,10 +698,10 @@
} // expectedByte(int,int,int)
/** Throws an exception for invalid byte. */
- private void invalidByte(int position, int count, int c)
+ private void invalidByte(int position, int count, int c)
throws UTFDataFormatException {
- String message = fFormatter.formatMessage(fLocale, "InvalidByte",
+ String message = fFormatter.formatMessage(fLocale, "InvalidByte",
new Object [] {Integer.toString(position), Integer.toString(count)});
throw new UTFDataFormatException(message);
@@ -696,11 +709,11 @@
/** Throws an exception for invalid surrogate bits. */
private void invalidSurrogate(int uuuuu) throws UTFDataFormatException {
-
+
StringBuffer str = new StringBuffer();
str.append("high surrogate bits in UTF-8 sequence must not exceed 0x10 but found 0x");
- String message = fFormatter.formatMessage(fLocale, "InvalidHighSurrogate",
+ String message = fFormatter.formatMessage(fLocale, "InvalidHighSurrogate",
new Object[] {Integer.toHexString(uuuuu)});
throw new UTFDataFormatException(message);
---------------------------------------------------------------------
To unsubscribe, e-mail: xerces-cvs-unsubscribe@xml.apache.org
For additional commands, e-mail: xerces-cvs-help@xml.apache.org