You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xalan.apache.org by mm...@locus.apache.org on 2000/08/17 23:14:55 UTC
cvs commit: xml-xalan/java/src/org/apache/xalan/dtm DTM.java
mmidy 00/08/17 14:14:54
Modified: java/src/org/apache/xalan/dtm DTM.java
Log:
Checking this in for Joe Kesselman: Fix for EntityRefs.
Revision Changes Path
1.3 +206 -26 xml-xalan/java/src/org/apache/xalan/dtm/DTM.java
Index: DTM.java
===================================================================
RCS file: /home/cvs/xml-xalan/java/src/org/apache/xalan/dtm/DTM.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- DTM.java 2000/07/05 14:37:44 1.2
+++ DTM.java 2000/08/17 21:14:53 1.3
@@ -87,16 +87,29 @@
/**
* <meta name="usage" content="internal"/>
* <code>DTM</code> is an XML document model expressed as a table rather than
- * an object tree. It attempts to be very compact, and to support very
- * specifically limited multitasking: users can start reading the document
- * while it's still being generated. (A work in progress...)
+ * an object tree. Its goals were:
+ * <ul>
+ * <li>to be very compact,</li>
+ * <li>to avoid object-creation overhead,</li>
+ * <li>to offer improved locality-of-reference for forward scan of documents,
+ * </li>
+ * <li>to provide inherent document-order sequencing,</li>
+ * <li>and to support a very specifically limited form of multitasking:
+ * users can start reading the document while it's still being generated,
+ * with very little synchronization overhead. (In fact, we seem to have
+ * established that this feature is of negligible value in a single-processor
+ * system; greater benefit might be obtained by instead tapping into XML4J's
+ * "demand parsing" scheme.)</li>
+ * </ul>
* <p>
- * (***** The SAX handler calls, and the string-based XMLContentHandler
- * methods, are known to be bad; they're included as hooks for the future.)</p>
+ * DTM does _not_ directly support the W3C's Document Object Model API.
+ * However, it attempts to come close enough to the DOM's behavior that a
+ * subset of the DOM can be implemented as proxy objects referencing the DTM.
+ * </p>
* <p>
- * DTM does _not_ directly support the W3C's Document Object Model. However,
- * it attempts to come close enough that a subset of DOM behavior can be
- * implemented as proxy objects referencing the DTM.</p>
+ * Note: The SAX handler calls, and the string-based XMLContentHandler
+ * methods, were known to be bad and have been disabled. The API is still
+ * present as hooks for future development.</p>
* @see DTMProxy
*/
public class DTM
@@ -129,7 +142,7 @@
private boolean processingIgnorableWhitespace = false;
private boolean processingCDATASection = false;
private boolean previousSiblingWasParent = false;
- // Local cache for record-at-a-time fetch
+ // Local cache for record-at-a-time fetch
int gotslot[] = new int[4];
// Unique-string-to-integer conversions, for use with SAX.
@@ -140,8 +153,9 @@
// MANIFEST CONSTANTS
// Status bits, ORed with node type (assumed to be <256, should be safe)
- final int TEXT_IGNORABLE = 2 << 8;
- final int TEXT_CDATA = 4 << 8;
+ final int TEXT_DTM_POOL = 1 << 8; // Locally cached, eg concatenation
+ final int TEXT_IGNORABLE = 2 << 8; // Whitespace in element context
+ final int TEXT_CDATA = 4 << 8; // CDATA node
// Impossible prefix to look up the default namespace
public static final String DEFAULT_PREFIX_STR = "#:::";
@@ -334,6 +348,7 @@
throws org.xml.sax.SAXException
{
if(DISABLE)return;
+ appendAccumulatedText();
done=true;
@@ -372,6 +387,8 @@
int attrListIndex)
{
if(DISABLE)return;
+ appendAccumulatedText();
+
// Need to retrieve the attrList...
String attrname, attrvalue;
@@ -481,9 +498,10 @@
previousSibling=ourslot;
// Create attribute substructure.
- // ***** Current XML4J will _only_ yield a single text,
+ // ***** We assume XML4J will _only_ yield a single text,
// rather than attempting to retain EntityReference nodes
- // within Attribute values.
+ // within Attribute values. Note that this counts on
+ // XML4J handling the split-buffer issue for us.
// ***** DTMProxy currently assumes this behavior!
// W0 Low: Node Type, with flag if ignorable whitespace
// W0 High: Buffer index (in SAX mode) or 0 (XML4J mode)
@@ -521,6 +539,8 @@
public final void endElement(QName name)
{
if(DISABLE)return;
+ appendAccumulatedText();
+
int thisElement = currentParent;
// If last node appended before we pop has a next-sib reference,
@@ -603,17 +623,20 @@
/** Start CDATA section. */
public final void startCDATA() throws Exception
{
+ // No-op; XSLT considers only the contained text.
}
/** End CDATA section. */
public final void endCDATA() throws Exception
{
+ // No-op; XSLT considers only the contained text.
}
/** Ignorable whitespace. */
public final void ignorableWhitespace(int dataIndex)
throws Exception
{
+ if(DISABLE)return;
general_characters(dataIndex);
}
@@ -670,29 +693,140 @@
public final void ignorableWhitespace(int dataIndex, boolean cdataSection)
throws org.xml.sax.SAXException
{
+ if(DISABLE)return;
processingIgnorableWhitespace = true;
general_characters(dataIndex);
}
+ // Vector handles objects. Too much overhead. I could use ChunkedIntArray
+ // (and did, in an early draft), but since we aren't trying to handle SAX
+ // right now there's no need for the additional columns. So we'll use a
+ // simple grow-it-myself array.
+ int charChunks[]=new int[100];
+ int charChunkStart=0,charChunkCount=0;
+
/** Text-accumulator operation for the integer-index version of
* characters(). Obviously far simpler, since we are assured that
* (unlike the parse buffers) the XML4J symbol table will persist.
* @param index int Index of this string in XML4J's symbol tables.
+ *<p>
+ * Note: Even though we are using XML4J's internal events rather than SAX,
+ * we <strong>must</strong> be prepared to normalize successive blocks
+ * of characters():
+ * <ul>
+ * <li>when text runs over the end of a parse buffer (may not arise in
+ * this parser),</li>
+ * <li>when text and CDATA sections are intermixed (with intervening
+ * start/end CDATA events),</li>
+ * <li>and when text and entity references are intermixed (with intervening
+ * start/end Entity Reference events).</li>
+ * </ul>
+ * The simplest way to handle this is to record the data, but defer
+ * creating the Text node until we get an event indicating that no further
+ * text will arrive. This logic was present in early versions of DTM,
+ * but was lost during an overaggressive optimization; we're restoring it now.
+ *<p>
+ * Note: Yes, the charChunks array grows monotonically during parsing,
+ * and does not shrink back down when the chunks are concatenated later
+ * in processing. Tough. I'm assuming that this is cheaper than allocating
+ * a separate array for every multichunk string, despite the block-copying
+ * that occurs when the array is grown.
*<p>
- * KNOWN LIMITATION: DOESN'T PRESERVE CDATA FLAG.
+ * KNOWN LIMITATION: DOESN'T PRESERVE CDATA FLAG. Since XSLT doesn't
+ * care about that flag, this is not a problem for our target
+ * application. It may be an issue if you try to reuse DTM elsewhere.
+ *
+ * @see #appendAccumulatedText()
*/
public final void general_characters(int index)
{
- // Add this element to the document
- int w0 = Node.TEXT_NODE;
- // W1: Parent
- int w1 = currentParent;
- // W2: Start position within buffer (SAX), or text index (XML4J)
- int w2 = index;
- // W3: Length of this text (SAX), or 0 (XML4J)
- int w3 = gotslot[2];
- int ourslot = appendNode(w0, w1, w2, w3);
- previousSibling = ourslot;
+ // Grow the array, if out of space. (Doubling may be excessive, but the
+ // goal is to trade off minimum memory use versus minimum recopying.)
+ if(charChunkCount==charChunks.length)
+ {
+ int[] newCharChunks=new int[2*charChunks.length];
+ System.arraycopy(charChunks,0,newCharChunks,0,charChunks.length);
+ charChunks=newCharChunks;
+ }
+ // Append to the array
+ charChunks[charChunkCount++]=index;
+ }
+
+ /** appendAccumulatedText completes the work started by
+ * general_characters(). It takes all the blocks of text which have
+ * arrived, and generates a single Text node containing their
+ * concatenated value. This routine _MUST_ be called as the first step
+ * in processing any other event.
+ *<p>
+ * There are a few reasonable ways of handling this.
+ * <ul>
+ * <li> One is to hold onto the individual text chunks -- which are
+ * already in a string pool inside the parser, since we're being
+ * driven through XMLDocumentHandler -- and concatenate them on
+ * demand when the user asks for this node's value; this minimizes
+ * model-building time, especially if the user never asks for the
+ * value of this node.</li>
+ *<li>The other is to generate a concatenated string in a local
+ * pool; this avoids re-concatenating the string if it should be
+ * accessed more than once.</li>
+ * <li>Or we could use the first solution, but convert it to the second
+ * the first time the text node is accessed. This is probably the best
+ * of both worlds... and we can get away with it because DTM is
+ * explicitly single-threaded after parsing, so there will be no
+ * contention for the node during its conversion.</li>
+ * </ul>
+ * <p>
+ * Early versions of DTM chose the first answer. I'm going to try the third
+ * this time.
+ *<p>
+ * Length of 0 indicates the simple case, referenced directly from
+ * the parser's pool.
+ *<p>
+
+ * @see #general_characters(int) */
+ void appendAccumulatedText()
+ {
+ if(charChunkCount==charChunkStart)
+ return; // No new text.
+ else if(charChunkCount==charChunkStart+1)
+ {
+ // Single chunk. We can use the efficient inline version of Text
+
+ int w0 = Node.TEXT_NODE;
+ // W1: Parent
+ int w1 = currentParent;
+ // W2: Start position within charChunks (multiple),
+ // or text index (inline), or local text index (multiple converted)
+ int w2 = charChunks[charChunkStart];
+ // W3: Start of next sequence, or 0 for inline
+ int w3 = 0;
+ int ourslot = appendNode(w0, w1, w2, w3);
+ previousSibling = ourslot;
+
+ // This chunk has been completely processed, so reuse its chunk slot
+ // (They're cheap, but why waste them?)
+ --charChunkCount;
+ }
+ else
+ {
+ // Here's our problem child. We need to record that the Text node's
+ // value is represented by a sequence of entries in charChunks[].
+ int w0 = Node.TEXT_NODE;
+ // W1: Parent
+ int w1 = currentParent;
+ // W2: Start position within charChunks (multiple),
+ // or text index (inline), or local text index (multiple converted)
+ int w2 = charChunkStart;
+ // W3: Start of next sequence, or 0 for inline
+ int w3 = charChunkCount;
+ int ourslot = appendNode(w0, w1, w2, w3);
+ previousSibling = ourslot;
+
+ // This time, we need to remember that these charChunks can _NOT_
+ // be reused -- leave the high-water mark alone, and instead move
+ // the baseline up.
+ charChunkStart=charChunkCount;
+ }
}
/**
@@ -703,6 +837,7 @@
public final void comment(int dataIndex)
{
if(DISABLE)return;
+ appendAccumulatedText();
// Short Form, XML4J mode
int w0, w1, w2, w3;
@@ -730,6 +865,7 @@
public final void processingInstruction(int target, int data)
{
if(DISABLE)return;
+ appendAccumulatedText();
// W0 Low: Node Type.
int w0 = org.w3c.dom.Node.PROCESSING_INSTRUCTION_NODE;
@@ -1862,6 +1998,9 @@
return intToString(w0>>16);
}
+ // Cache conversions of multi-charChunk text nodes
+ Vector localStringPool=new Vector();
+
/**
* DTM read API: Given a node index, return its node value. This is mostly
* as defined by the DOM, but may ignore some conveniences.
@@ -1880,7 +2019,48 @@
{
case Node.TEXT_NODE:
case Node.CDATA_SECTION_NODE: // We handle as flagged Text...
- value=intToString(gotslot[2]);
+ if((gotslot[0] & TEXT_DTM_POOL) != 0)
+ {
+ // Value of this node lives in DTM's pool, not in the parser's
+ value=(String)(localStringPool.elementAt(gotslot[2]));
+ }
+ else if(gotslot[3]>0) // (actually >1, but 1 never occurs)
+ {
+ // This was a multi-charChunk node. Its value is the concatenation
+ // of those chunks. For efficient future access, we will now convert
+ // this into a TEXT_DTM_POOL node
+
+ // First, concatenate the chunks to obtain the value
+ int chunk=gotslot[2],stop=gotslot[3];
+ StringBuffer sb=new StringBuffer(intToString(charChunks[chunk++]));
+ while(chunk<stop)
+ sb.append(intToString(charChunks[chunk++]));
+ value=sb.toString();
+
+ // Add the normalized string to our local pool.
+ // ****** Is it worth suppressing duplicates?
+ // int localStringNumber=localStringPool.indexOf(value);
+ // if(-1 == localStringNumber) // Not found
+ // {
+ localStringPool.addElement(value);
+ int localStringNumber=localStringPool.size();
+ // }
+
+ // Now back-patch the node. We can get away with not protecting
+ // this since we assert that DTM's read access is single-threaded,
+ // and hence nobody else is accessing this node right now.
+ // (If you don't believe that, synchronize this and the preceding
+ // case on localStringPool.)
+ gotslot[0] |= TEXT_DTM_POOL;
+ gotslot[2] = localStringNumber-1;
+ // ***** Would be nice right here to have an array-to-array write...
+ nodes.writeSlot(position,gotslot[0],gotslot[1],gotslot[2],gotslot[3]);
+ }
+ else
+ {
+ // Single charChunk. Read the value direct from the parser's pool.
+ value=intToString(gotslot[2]);
+ }
break;
case Node.PROCESSING_INSTRUCTION_NODE:
case Node.COMMENT_NODE:
@@ -2138,4 +2318,4 @@
}
-}
\ No newline at end of file
+}