You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xerces.apache.org by ne...@apache.org on 2002/09/11 22:22:35 UTC
cvs commit: xml-xerces/java/src/org/apache/xerces/impl XMLEntityManager.java XMLEntityScanner.java XMLDocumentFragmentScannerImpl.java XMLScanner.java
neilg 2002/09/11 13:22:35
Modified: java/src/org/apache/xerces/impl XMLEntityManager.java
XMLEntityScanner.java
XMLDocumentFragmentScannerImpl.java XMLScanner.java
Log:
This change attempts to address poor performance when parsing documents with very large comments.
I observed an improvement of between 10% and 15% (depending on the kind of parser being used) on a 200K file with a 100K comment.
Revision Changes Path
1.43 +123 -127 xml-xerces/java/src/org/apache/xerces/impl/XMLEntityManager.java
Index: XMLEntityManager.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/XMLEntityManager.java,v
retrieving revision 1.42
retrieving revision 1.43
diff -u -r1.42 -r1.43
--- XMLEntityManager.java 10 Sep 2002 14:20:07 -0000 1.42
+++ XMLEntityManager.java 11 Sep 2002 20:22:35 -0000 1.43
@@ -79,6 +79,7 @@
import org.apache.xerces.impl.validation.ValidationManager;
import org.apache.xerces.util.EncodingMap;
+import org.apache.xerces.util.XMLStringBuffer;
import org.apache.xerces.util.SymbolTable;
import org.apache.xerces.util.URI;
import org.apache.xerces.util.XMLChar;
@@ -2740,7 +2741,7 @@
} // scanLiteral(int,XMLString):int
/**
- * Scans a range of character data up to the specicied delimiter,
+ * Scans a range of character data up to the specified delimiter,
* setting the fields of the XMLString structure, appropriately.
* <p>
* <strong>Note:</strong> The characters are consumed.
@@ -2769,166 +2770,161 @@
* @throws IOException Thrown if i/o error occurs.
* @throws EOFException Thrown on end of file.
*/
- public boolean scanData(String delimiter, XMLString data)
+ public boolean scanData(String delimiter, XMLStringBuffer buffer)
throws IOException {
- if (DEBUG_BUFFER) {
- System.out.print("(scanData: ");
- print();
- System.out.println();
- }
-
- // load more characters, if needed
+ boolean done = false;
int delimLen = delimiter.length();
char charAt0 = delimiter.charAt(0);
- //int limit = fCurrentEntity.count - delimLen + 1;
-
- if (fCurrentEntity.position == fCurrentEntity.count) {
- load(0, true);
- }
- else if (fCurrentEntity.position >= fCurrentEntity.count - delimLen) {
- System.arraycopy(fCurrentEntity.ch, fCurrentEntity.position,
- fCurrentEntity.ch, 0, fCurrentEntity.count - fCurrentEntity.position);
- load(fCurrentEntity.count - fCurrentEntity.position, false);
- fCurrentEntity.position = 0;
- }
- if (fCurrentEntity.position >= fCurrentEntity.count - delimLen) {
- // something must be wrong with the input: e.g., file ends an unterminated comment
- int length = fCurrentEntity.count - fCurrentEntity.position;
- data.setValues(fCurrentEntity.ch, fCurrentEntity.position, length);
- fCurrentEntity.columnNumber += fCurrentEntity.count;
- fCurrentEntity.position = fCurrentEntity.count;
- load(0,true);
- return false;
- }
-
- // normalize newlines
int offset = fCurrentEntity.position;
int c = fCurrentEntity.ch[offset];
int newlines = 0;
boolean external = fCurrentEntity.isExternal();
- if (c == '\n' || (c == '\r' && external)) {
+ do {
if (DEBUG_BUFFER) {
- System.out.print("[newline, "+offset+", "+fCurrentEntity.position+": ");
+ System.out.print("(scanData: ");
print();
System.out.println();
}
- do {
- c = fCurrentEntity.ch[fCurrentEntity.position++];
- if (c == '\r' && external) {
- newlines++;
- fCurrentEntity.lineNumber++;
- fCurrentEntity.columnNumber = 1;
- /***/
- if (fCurrentEntity.position == fCurrentEntity.count) {
- offset = 0;
- fCurrentEntity.position = newlines;
- if (load(newlines, false)) {
- break;
+
+ // load more characters, if needed
+
+ if (fCurrentEntity.position == fCurrentEntity.count) {
+ load(0, true);
+ }
+ else if (fCurrentEntity.position >= fCurrentEntity.count - delimLen) {
+ System.arraycopy(fCurrentEntity.ch, fCurrentEntity.position,
+ fCurrentEntity.ch, 0, fCurrentEntity.count - fCurrentEntity.position);
+ load(fCurrentEntity.count - fCurrentEntity.position, false);
+ fCurrentEntity.position = 0;
+ }
+ if (fCurrentEntity.position >= fCurrentEntity.count - delimLen) {
+ // something must be wrong with the input: e.g., file ends an unterminated comment
+ int length = fCurrentEntity.count - fCurrentEntity.position;
+ buffer.append (fCurrentEntity.ch, fCurrentEntity.position, length);
+ fCurrentEntity.columnNumber += fCurrentEntity.count;
+ fCurrentEntity.position = fCurrentEntity.count;
+ load(0,true);
+ return false;
+ }
+
+ // normalize newlines
+ offset = fCurrentEntity.position;
+ c = fCurrentEntity.ch[offset];
+ newlines = 0;
+ if (c == '\n' || (c == '\r' && external)) {
+ if (DEBUG_BUFFER) {
+ System.out.print("[newline, "+offset+", "+fCurrentEntity.position+": ");
+ print();
+ System.out.println();
+ }
+ do {
+ c = fCurrentEntity.ch[fCurrentEntity.position++];
+ if (c == '\r' && external) {
+ newlines++;
+ fCurrentEntity.lineNumber++;
+ fCurrentEntity.columnNumber = 1;
+ if (fCurrentEntity.position == fCurrentEntity.count) {
+ offset = 0;
+ fCurrentEntity.position = newlines;
+ if (load(newlines, false)) {
+ break;
+ }
+ }
+ if (fCurrentEntity.ch[fCurrentEntity.position] == '\n') {
+ fCurrentEntity.position++;
+ offset++;
+ }
+ /*** NEWLINE NORMALIZATION ***/
+ else {
+ newlines++;
}
}
- /***/
- if (fCurrentEntity.ch[fCurrentEntity.position] == '\n') {
- fCurrentEntity.position++;
- offset++;
- }
- /*** NEWLINE NORMALIZATION ***/
- else {
+ else if (c == '\n') {
newlines++;
- }
- /***/
- }
- else if (c == '\n') {
- newlines++;
- fCurrentEntity.lineNumber++;
- fCurrentEntity.columnNumber = 1;
- /***/
- if (fCurrentEntity.position == fCurrentEntity.count) {
- offset = 0;
- fCurrentEntity.position = newlines;
- fCurrentEntity.count = newlines;
- if (load(newlines, false)) {
- break;
+ fCurrentEntity.lineNumber++;
+ fCurrentEntity.columnNumber = 1;
+ if (fCurrentEntity.position == fCurrentEntity.count) {
+ offset = 0;
+ fCurrentEntity.position = newlines;
+ fCurrentEntity.count = newlines;
+ if (load(newlines, false)) {
+ break;
+ }
}
}
- /***/
- /*** NEWLINE NORMALIZATION ***
- if (fCurrentEntity.ch[fCurrentEntity.position] == '\r'
- && external) {
- fCurrentEntity.position++;
- offset++;
+ else {
+ fCurrentEntity.position--;
+ break;
}
- /***/
+ } while (fCurrentEntity.position < fCurrentEntity.count - 1);
+ for (int i = offset; i < fCurrentEntity.position; i++) {
+ fCurrentEntity.ch[i] = '\n';
}
- else {
- fCurrentEntity.position--;
- break;
+ int length = fCurrentEntity.position - offset;
+ if (fCurrentEntity.position == fCurrentEntity.count - 1) {
+ buffer.append(fCurrentEntity.ch, offset, length);
+ if (DEBUG_BUFFER) {
+ System.out.print("]newline, "+offset+", "+fCurrentEntity.position+": ");
+ print();
+ System.out.println();
+ }
+ return true;
}
- } while (fCurrentEntity.position < fCurrentEntity.count - 1);
- for (int i = offset; i < fCurrentEntity.position; i++) {
- fCurrentEntity.ch[i] = '\n';
- }
- int length = fCurrentEntity.position - offset;
- if (fCurrentEntity.position == fCurrentEntity.count - 1) {
- data.setValues(fCurrentEntity.ch, offset, length);
if (DEBUG_BUFFER) {
System.out.print("]newline, "+offset+", "+fCurrentEntity.position+": ");
print();
System.out.println();
}
- return true;
- }
- if (DEBUG_BUFFER) {
- System.out.print("]newline, "+offset+", "+fCurrentEntity.position+": ");
- print();
- System.out.println();
}
- }
-
- // iterate over buffer looking for delimiter
- boolean done = false;
- OUTER: while (fCurrentEntity.position < fCurrentEntity.count) {
- c = fCurrentEntity.ch[fCurrentEntity.position++];
- if (c == charAt0) {
- // looks like we just hit the delimiter
- int delimOffset = fCurrentEntity.position - 1;
- for (int i = 1; i < delimLen; i++) {
- if (fCurrentEntity.position == fCurrentEntity.count) {
- fCurrentEntity.position -= i;
- break OUTER;
+
+ // iterate over buffer looking for delimiter
+ OUTER: while (fCurrentEntity.position < fCurrentEntity.count) {
+ c = fCurrentEntity.ch[fCurrentEntity.position++];
+ if (c == charAt0) {
+ // looks like we just hit the delimiter
+ int delimOffset = fCurrentEntity.position - 1;
+ for (int i = 1; i < delimLen; i++) {
+ if (fCurrentEntity.position == fCurrentEntity.count) {
+ fCurrentEntity.position -= i;
+ break OUTER;
+ }
+ c = fCurrentEntity.ch[fCurrentEntity.position++];
+ if (delimiter.charAt(i) != c) {
+ fCurrentEntity.position--;
+ break;
+ }
}
- c = fCurrentEntity.ch[fCurrentEntity.position++];
- if (delimiter.charAt(i) != c) {
- fCurrentEntity.position--;
+ if (fCurrentEntity.position == delimOffset + delimLen) {
+ done = true;
break;
}
}
- if (fCurrentEntity.position == delimOffset + delimLen) {
- done = true;
+ else if (c == '\n' || (external && c == '\r')) {
+ fCurrentEntity.position--;
break;
}
+ else if (XMLChar.isInvalid(c)) {
+ fCurrentEntity.position--;
+ int length = fCurrentEntity.position - offset;
+ fCurrentEntity.columnNumber += length - newlines;
+ buffer.append(fCurrentEntity.ch, offset, length);
+ return true;
+ }
}
- else if (c == '\n' || (external && c == '\r')) {
- fCurrentEntity.position--;
- break;
+ int length = fCurrentEntity.position - offset;
+ fCurrentEntity.columnNumber += length - newlines;
+ if (done) {
+ length -= delimLen;
}
- else if (XMLChar.isInvalid(c)) {
- fCurrentEntity.position--;
- break;
+ buffer.append (fCurrentEntity.ch, offset, length);
+
+ // return true if string was skipped
+ if (DEBUG_BUFFER) {
+ System.out.print(")scanData: ");
+ print();
+ System.out.println(" -> " + done);
}
- }
- int length = fCurrentEntity.position - offset;
- fCurrentEntity.columnNumber += length - newlines;
- if (done) {
- length -= delimLen;
- }
- data.setValues(fCurrentEntity.ch, offset, length);
-
- // return true if string was skipped
- if (DEBUG_BUFFER) {
- System.out.print(")scanData: ");
- print();
- System.out.println(" -> " + done);
- }
+ } while (!done);
return !done;
} // scanData(String,XMLString)
1.7 +2 -1 xml-xerces/java/src/org/apache/xerces/impl/XMLEntityScanner.java
Index: XMLEntityScanner.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/XMLEntityScanner.java,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -r1.6 -r1.7
--- XMLEntityScanner.java 10 Sep 2002 14:20:07 -0000 1.6
+++ XMLEntityScanner.java 11 Sep 2002 20:22:35 -0000 1.7
@@ -63,6 +63,7 @@
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLLocator;
import org.apache.xerces.xni.XMLString;
+import org.apache.xerces.util.XMLStringBuffer;
/**
* This class allows various parser scanners to scan basic XML constructs
@@ -288,7 +289,7 @@
* @throws IOException Thrown if i/o error occurs.
* @throws EOFException Thrown on end of file.
*/
- public abstract boolean scanData(String delimiter, XMLString data)
+ public abstract boolean scanData(String delimiter, XMLStringBuffer data)
throws IOException;
/**
1.19 +5 -4 xml-xerces/java/src/org/apache/xerces/impl/XMLDocumentFragmentScannerImpl.java
Index: XMLDocumentFragmentScannerImpl.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/XMLDocumentFragmentScannerImpl.java,v
retrieving revision 1.18
retrieving revision 1.19
diff -u -r1.18 -r1.19
--- XMLDocumentFragmentScannerImpl.java 10 Sep 2002 14:20:07 -0000 1.18
+++ XMLDocumentFragmentScannerImpl.java 11 Sep 2002 20:22:35 -0000 1.19
@@ -898,9 +898,10 @@
}
while (true) {
- if (!fEntityScanner.scanData("]]", fString)) {
- if (fDocumentHandler != null && fString.length > 0) {
- fDocumentHandler.characters(fString, null);
+ fStringBuffer.clear();
+ if (!fEntityScanner.scanData("]]", fStringBuffer)) {
+ if (fDocumentHandler != null && fStringBuffer.length > 0) {
+ fDocumentHandler.characters(fStringBuffer, null);
}
int brackets = 2;
while (fEntityScanner.skipChar(']')) {
@@ -924,7 +925,7 @@
}
else {
if (fDocumentHandler != null) {
- fDocumentHandler.characters(fString, null);
+ fDocumentHandler.characters(fStringBuffer, null);
}
int c = fEntityScanner.peekChar();
if (c != -1 && XMLChar.isInvalid(c)) {
1.20 +4 -9 xml-xerces/java/src/org/apache/xerces/impl/XMLScanner.java
Index: XMLScanner.java
===================================================================
RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/impl/XMLScanner.java,v
retrieving revision 1.19
retrieving revision 1.20
diff -u -r1.19 -r1.20
--- XMLScanner.java 10 Sep 2002 14:20:07 -0000 1.19
+++ XMLScanner.java 11 Sep 2002 20:22:35 -0000 1.20
@@ -637,11 +637,10 @@
}
}
+ fStringBuffer.clear();
// data
- if (fEntityScanner.scanData("?>", data)) {
- fStringBuffer.clear();
+ if (fEntityScanner.scanData("?>", fStringBuffer)) {
do {
- fStringBuffer.append(data);
int c = fEntityScanner.peekChar();
if (c != -1) {
if (XMLChar.isHighSurrogate(c)) {
@@ -653,8 +652,7 @@
fEntityScanner.scanChar();
}
}
- } while (fEntityScanner.scanData("?>", data));
- fStringBuffer.append(data);
+ } while (fEntityScanner.scanData("?>", fStringBuffer));
data.setValues(fStringBuffer);
}
@@ -679,9 +677,7 @@
// text
// REVISIT: handle invalid character, eof
text.clear();
- while (fEntityScanner.scanData("--", fString)) {
- text.append(fString);
- /***/
+ while (fEntityScanner.scanData("--", text)) {
int c = fEntityScanner.peekChar();
if (c != -1) {
if (XMLChar.isHighSurrogate(c)) {
@@ -694,7 +690,6 @@
}
}
}
- text.append(fString);
if (!fEntityScanner.skipChar('>')) {
reportFatalError("DashDashInComment", null);
}
---------------------------------------------------------------------
To unsubscribe, e-mail: xerces-cvs-unsubscribe@xml.apache.org
For additional commands, e-mail: xerces-cvs-help@xml.apache.org