You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xalan.apache.org by jk...@apache.org on 2024/02/02 19:02:24 UTC
(xalan-java) 01/01: just documentation/parameter names
This is an automated email from the ASF dual-hosted git repository.
jkesselm pushed a commit to branch XALANJ-2725
in repository https://gitbox.apache.org/repos/asf/xalan-java.git
commit 162e1f0b4c71669e3c8da8c6d1b7b4ddcdda5789
Author: kubycsolutions <ke...@kubyc.solutions>
AuthorDate: Fri Feb 2 14:02:15 2024 -0500
just documentation/parameter names
---
.../java/org/apache/xml/serializer/ToStream.java | 71 +++++++++++++++++-----
.../org/apache/xml/serializer/ToTextStream.java | 3 +-
2 files changed, 57 insertions(+), 17 deletions(-)
diff --git a/serializer/src/main/java/org/apache/xml/serializer/ToStream.java b/serializer/src/main/java/org/apache/xml/serializer/ToStream.java
index 6d94582c..8619d61c 100644
--- a/serializer/src/main/java/org/apache/xml/serializer/ToStream.java
+++ b/serializer/src/main/java/org/apache/xml/serializer/ToStream.java
@@ -47,8 +47,10 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
- * This abstract class is a base class for other stream
- * serializers (xml, html, text ...) that write output to a stream.
+ * This abstract class is a base class for other stream serializers
+ * (xml, html, text ...) that write output to a stream. Note that
+ * this is stateful, NOT designed to be multithreaded; each thread and
+ * each output stream should have its own instance.
*
* @xsl.usage internal
*/
@@ -61,7 +63,6 @@ abstract public class ToStream extends SerializerBase
/** Stack to keep track of disabling output escaping. */
protected BoolStack m_disableOutputEscapingStates = new BoolStack();
-
/**
* The encoding information associated with this serializer.
* Although initially there is no encoding,
@@ -174,7 +175,40 @@ abstract public class ToStream extends SerializerBase
* which is exiting older behavior.
*/
private boolean m_expandDTDEntities = true;
-
+
+ /**
+ * Traditionally, we handled Surrogate Character Pairs by looking
+ * ahead in the input buffer. This could fail if, eg, the pair crossed
+ * between one call to characters() and the next, which can happen
+ * since SAX providers are free to manage buffering as they see fit
+ * and what the XML Data Model considers a single block of text
+ * may be delivered in multiple calls.
+ *
+ * The more robust solution is to maintain state, setting the High
+ * UTF16 Surrogate character aside and processing it when the Low
+ * Surrogate arrives.
+ *
+ * However, handling this robustly requires recognizing, and
+ * handling, cases where a Surrogate appears but is not adjacent to
+ * the other half of the pair. That's illegal UTF16, but as utility
+ * code we can't guarantee some caller won't attempt it.
+ *
+ * Historically, we have handled this one of two ways, either
+ * generating an IOException with ER_INVALID_UTF16_SURROGATE or
+ * outputting the bad surrogate as a Numeric Character Reference
+ * (and possibly issuing a message to stderr, as in ToTextStream).
+ * The inconsistency annoys me a bit. Only SGML-based formats
+ * support NCRs, and XML explicitly says that even an NCR may not
+ * represent an isolated surrogate. Hence, for correctness, we AT
+ * LEAST want the stderr message, and arguably should be throwing
+ * the exception. However, if we change any of this behavior we
+ * want to be able to revert to the prior response, in case some
+ * user is actually expecting to see that.
+ *
+ * Note that since we process char arrays, the "pending high surrogate"
+ * buffer is a char, with 0 used to indicate "empty buffer".
+ */
+ private char m_pendingUTF16HighSurrogate = 0;
/**
* Default constructor
@@ -959,7 +993,7 @@ abstract public class ToStream extends SerializerBase
/**
* Once a surrogate has been detected, write out the pair of
* characters if it is in the encoding, or if there is no
- * encoding, otherwise write out an entity reference
+ * encoding, otherwise write out a numeric character reference
* of the value of the unicode code point of the character
* represented by the high/low surrogate pair.
* <p>
@@ -967,59 +1001,61 @@ abstract public class ToStream extends SerializerBase
* because the array ends unexpectely, or if the low char is there
* but its value is such that it is not a low surrogate.
*
- * @param c the first (high) part of the surrogate, which
+ * @param high the first (high) part of the surrogate, which
* must be confirmed before calling this method.
* @param ch Character array.
* @param i position Where the surrogate was detected.
* @param end The end index of the significant characters.
* @return 0 if the pair of characters was written out as-is,
* the unicode code point of the character represented by
- * the surrogate pair if an entity reference with that value
+ * the surrogate pair if a numeric char ref with that value
* was written out.
*
* @throws IOException if invalid UTF-16 surrogate detected.
*/
- protected int writeUTF16Surrogate(char c, char ch[], int i, int end)
+ protected int writeUTF16Surrogate(final char high, char ch[], int i, int end)
throws IOException
{
- int codePoint = 0;
+ // THROWS if surrogate pair crosses input buffers
+ // Should probably handle this better.
if (i + 1 >= end)
{
throw new IOException(
Utils.messages.createMessage(
MsgKey.ER_INVALID_UTF16_SURROGATE,
- new Object[] { Integer.toHexString((int) c)}));
+ new Object[] { Integer.toHexString((int) high)}));
}
- final char high = c;
final char low = ch[i+1];
if (!Encodings.isLowUTF16Surrogate(low)) {
throw new IOException(
Utils.messages.createMessage(
MsgKey.ER_INVALID_UTF16_SURROGATE,
new Object[] {
- Integer.toHexString((int) c)
+ Integer.toHexString((int) high)
+ " "
+ Integer.toHexString(low)}));
}
final java.io.Writer writer = m_writer;
+ int codePoint = 0; // Nonzero iff written as NCR
// If we make it to here we have a valid high, low surrogate pair
- if (m_encodingInfo.isInEncoding(c,low)) {
+ if (m_encodingInfo.isInEncoding(high,low)) {
// If the character formed by the surrogate pair
// is in the encoding, so just write it out
+ // NOTE: Assumes same buffer
writer.write(ch,i,2);
}
else {
// Don't know what to do with this char, it is
// not in the encoding and not a high char in
- // a surrogate pair, so write out as an entity ref
+ // a surrogate pair, so write out as a numeric char ref
final String encoding = getEncoding();
if (encoding != null) {
/* The output encoding is known,
* so somthing is wrong.
- */
+ */
codePoint = Encodings.toCodePoint(high, low);
// not in the encoding, so write out a character reference
writer.write('&');
@@ -1033,7 +1069,10 @@ abstract public class ToStream extends SerializerBase
writer.write(ch, i, 2);
}
}
- // non-zero only if character reference was written out.
+
+ // ToTextStream tests this and issues an error message (but
+ // not exception) if the not-in-encoding case arises,
+ // outputting an NCR in passing.
return codePoint;
}
diff --git a/serializer/src/main/java/org/apache/xml/serializer/ToTextStream.java b/serializer/src/main/java/org/apache/xml/serializer/ToTextStream.java
index cf22d68a..c64dbebd 100644
--- a/serializer/src/main/java/org/apache/xml/serializer/ToTextStream.java
+++ b/serializer/src/main/java/org/apache/xml/serializer/ToTextStream.java
@@ -291,6 +291,7 @@ public class ToTextStream extends ToStream
if (codePoint != 0) {
// I think we can just emit the message,
// not crash and burn.
+ // Git commit ffb244aaa0f88368a0bf483bddc7e74d8a4d83bf?
final String integralValue = Integer.toString(codePoint);
final String msg = Utils.messages.createMessage(
MsgKey.ER_ILLEGAL_CHARACTER,
@@ -306,7 +307,7 @@ public class ToTextStream extends ToStream
} else {
// Don't know what to do with this char, it is
// not in the encoding and not a high char in
- // a surrogate pair, so write out as an entity ref
+ // a surrogate pair, so write out as numeric char ref
if (encoding != null) {
/* The output encoding is known,
* so somthing is wrong.
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@xalan.apache.org
For additional commands, e-mail: commits-help@xalan.apache.org