You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xalan.apache.org by jk...@apache.org on 2024/02/21 19:51:56 UTC
(xalan-java) 01/02: refactoring

This is an automated email from the ASF dual-hosted git repository.

jkesselm pushed a commit to branch XALANJ-2725
in repository https://gitbox.apache.org/repos/asf/xalan-java.git

commit 856e896e42bc409e730ed5de0c1e5cd416b8bbc7
Author: kubycsolutions <ke...@kubyc.solutions>
AuthorDate: Mon Feb 19 17:03:53 2024 -0500

    refactoring
---
 .../java/org/apache/xml/serializer/ToStream.java   | 46 ++++++++++++++++++----
 1 file changed, 38 insertions(+), 8 deletions(-)

diff --git a/serializer/src/main/java/org/apache/xml/serializer/ToStream.java b/serializer/src/main/java/org/apache/xml/serializer/ToStream.java
index 8619d61c..0fde86c4 100644
--- a/serializer/src/main/java/org/apache/xml/serializer/ToStream.java
+++ b/serializer/src/main/java/org/apache/xml/serializer/ToStream.java
@@ -1027,6 +1027,34 @@ abstract public class ToStream extends SerializerBase
         }
         
         final char low = ch[i+1];
+	return writeUTF16Surrogate(high, low);
+    }
+	
+
+    /**
+     * Once a surrogate has been detected, write out the pair of
+     * characters if it is in the encoding, or if there is no
+     * encoding, otherwise write out an numeric character reference
+     * of the value of the unicode code point of the character
+     * represented by the high/low surrogate pair.
+     * <p>
+     * An exception is thrown if there is no low surrogate in the pair,
+     * because the array ends unexpectely, or if the low char is there
+     * but its value is such that it is not a low surrogate.
+     *
+     * @param high the first (high) part of the surrogate, which
+     * must be confirmed before calling this method.
+     * @param low the second (low) part of the presumed surrogate
+     * @return 0 if the pair of characters was written out as-is,
+     * or the unicode code point of the character represented by
+     * the surrogate pair if a numeric char ref with that value
+     * was written out. (REVIEW: Is this needed?)
+     * 
+     * @throws IOException if  invalid UTF-16 surrogate detected.
+     */
+    protected int writeUTF16Surrogate(final char high, final char low)
+        throws IOException
+    {
         if (!Encodings.isLowUTF16Surrogate(low)) {
             throw new IOException(
                 Utils.messages.createMessage(
@@ -1038,14 +1066,15 @@ abstract public class ToStream extends SerializerBase
         }
 
         final java.io.Writer writer = m_writer;
-        int codePoint = 0; // Nonzero iff written as NCR
+        int codePoint = 0; // Nonzero iff written as NCR. REVIEW: Needed?
                 
         // If we make it to here we have a valid high, low surrogate pair
         if (m_encodingInfo.isInEncoding(high,low)) {
             // If the character formed by the surrogate pair
             // is in the encoding, so just write it out
 	    // NOTE: Assumes same buffer
-            writer.write(ch,i,2);
+            writer.write(high);
+            writer.write(low);
         }
         else {
             // Don't know what to do with this char, it is
@@ -1053,20 +1082,21 @@ abstract public class ToStream extends SerializerBase
             // a surrogate pair, so write out as a numeric char ref
             final String encoding = getEncoding();
             if (encoding != null) {
-                /* The output encoding is known, 
-                 * so somthing is wrong.
+                /* The output encoding is known but does not include
+                 * this character. Fallback: Write as NCR
                  */
                 codePoint = Encodings.toCodePoint(high, low);
-                // not in the encoding, so write out a character reference
                 writer.write('&');
                 writer.write('#');
                 writer.write(Integer.toString(codePoint));
                 writer.write(';');
             } else {
-                /* The output encoding is not known,
-                 * so just write it out as-is.
+                /* The output encoding is not known, so presume
+                 * Unicode and just write it out. This handles the
+                 * case of serializing to a character buffer.
                  */
-                writer.write(ch, i, 2);
+		writer.write(high);
+		writer.write(low);
             }
         }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@xalan.apache.org
For additional commands, e-mail: commits-help@xalan.apache.org