You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xalan.apache.org by sb...@locus.apache.org on 2000/12/13 05:11:43 UTC
cvs commit: xml-xalan/java/src/org/apache/xalan/serialize FormatterToHTML.java FormatterToXML.java
sboag 00/12/12 20:11:43
Modified: java/src/org/apache/xalan/serialize FormatterToHTML.java
FormatterToXML.java
Log:
Rewrote writeAttrURI to encode UTF-16 to UTF-8 when escaping
characters. Subject to code review, re message I sent to the list.
Also consolodated surrogate pair handling in the normal character
handling.
Revision Changes Path
1.10 +150 -66 xml-xalan/java/src/org/apache/xalan/serialize/FormatterToHTML.java
Index: FormatterToHTML.java
===================================================================
RCS file: /home/cvs/xml-xalan/java/src/org/apache/xalan/serialize/FormatterToHTML.java,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -r1.9 -r1.10
--- FormatterToHTML.java 2000/12/07 08:16:11 1.9
+++ FormatterToHTML.java 2000/12/13 04:11:43 1.10
@@ -64,11 +64,13 @@
import java.util.Hashtable;
import java.util.Vector;
import java.util.Properties;
+import java.util.BitSet;
import org.xml.sax.*;
import org.apache.xml.utils.BoolStack;
import org.apache.xml.utils.Trie;
+import org.apache.xml.utils.FastStringBuffer;
import org.apache.xalan.res.XSLMessages;
import org.apache.xpath.res.XPATHErrorResources;
import org.apache.xml.utils.StringToIntTable;
@@ -720,62 +722,149 @@
{
this.accum(name);
this.accum('=');
- this.accum('\"');
+ this.accum('\"');
if (elemDesc.isAttrFlagSet(name, ElemDesc.ATTRURL))
- writeAttrURI(value, this.m_encoding);
+ writeAttrURI(value, m_specialEscapeURLs);
else
writeAttrString(value, this.m_encoding);
-
this.accum('\"');
+
}
}
-
- /** Mask for high byte. */
- static final int MASK1 = 0xFF00;
-
- /** Mask for low byte. */
- static final int MASK2 = 0x00FF;
+
+ /**
+ * Tell if a character is an ASCII digit.
+ */
+ private boolean isASCIIDigit(char c)
+ {
+ return (c >= '0' && c <= '9');
+ }
/**
* Write the specified <var>string</var> after substituting non ASCII characters,
* with <CODE>%HH</CODE>, where HH is the hex of the byte value.
*
* @param string String to convert to XML format.
- * @param specials Chracters, should be represeted in chracter referenfces.
- * @param encoding CURRENTLY NOT IMPLEMENTED.
+ * @param doURLEscaping True if we should try to encode as
+ * per http://www.ietf.org/rfc/rfc2396.txt.
* @see #backReference
*
- * @throws org.xml.sax.SAXException
+ * @throws org.xml.sax.SAXException if a bad surrogate pair is detected.
*/
- public void writeAttrURI(String string, String encoding)
+ public void writeAttrURI(String string, boolean doURLEscaping)
throws org.xml.sax.SAXException
{
+ // http://www.ietf.org/rfc/rfc2396.txt says:
+ // A URI is always in an "escaped" form, since escaping or unescaping a
+ // completed URI might change its semantics. Normally, the only time
+ // escape encodings can safely be made is when the URI is being created
+ // from its component parts; each component may have its own set of
+ // characters that are reserved, so only the mechanism responsible for
+ // generating or interpreting that component can determine whether or
+ // not escaping a character will change its semantics. Likewise, a URI
+ // must be separated into its components before the escaped characters
+ // within those components can be safely decoded.
+ //
+ // ...So we do our best to do limited escaping of the URL, without
+ // causing damage. If the URL is already properly escaped, in theory, this
+ // function should not change the string value.
char[] stringArray = string.toCharArray();
int len = stringArray.length;
-
+
for (int i = 0; i < len; i++)
{
char ch = stringArray[i];
- // if first 8 bytes are 0, no need to append them.
- if ((ch < 9) || (ch > 127)
- || /*(ch == '"') || -sb, as per #PDIK4L9LZY */ (ch == ' '))
+ if ((ch < 33) || (ch > 126))
{
- if (m_specialEscapeURLs)
+ if (doURLEscaping)
{
- int b1 = (int) ((((int) ch) & MASK1) >> 8);
- int b2 = (int) (((int) ch) & MASK2);
-
- if (b1 != 0)
+ // Encode UTF16 to UTF8.
+ // Reference is Unicode, A Primer, by Tony Graham.
+ // Page 92.
+
+ if(ch <= 0x7F)
+ {
+ accum('%');
+ accum(Integer.toHexString(ch).toUpperCase());
+ }
+ else if(ch <= 0x7FF)
+ {
+ // Clear low 6 bits before rotate, put high 4 bits in low byte,
+ // and set two high bits.
+ int high = (ch >> 6) | 0xC0;
+ int low = (ch & 0x3F) | 0x80; // First 6 bits, + high bit
+ accum('%');
+ accum(Integer.toHexString(high).toUpperCase());
+ accum('%');
+ accum(Integer.toHexString(low).toUpperCase());
+ }
+ else if( isUTF16Surrogate(ch) ) // high surrogate
{
- accum("%");
- accum(Integer.toHexString(b1));
+ // I'm sure this can be done in 3 instructions, but I choose
+ // to try and do it exactly like it is done in the book, at least
+ // until we are sure this is totally clean. I don't think performance
+ // is a big issue with this particular function, though I could be
+ // wrong. Also, the stuff below clearly does more masking than
+ // it needs to do.
+
+ // Clear high 6 bits.
+ int highSurrogate = ((int) ch) & 0x03FF;
+
+ // Middle 4 bits (wwww) + 1
+ // "Note that the value of wwww from the high surrogate bit pattern
+ // is incremented to make the uuuuu bit pattern in the scalar value
+ // so the surrogate pair don't address the BMP."
+ int wwww = ((highSurrogate & 0x03C0) >> 6);
+ int uuuuu = wwww+1;
+
+ // next 4 bits
+ int zzzz = (highSurrogate & 0x003C) >> 2;
+
+ // low 2 bits
+ int yyyyyy = ((highSurrogate & 0x0003) << 4) & 0x30;
+
+ // Get low surrogate character.
+ ch = stringArray[++i];
+
+ // Clear high 6 bits.
+ int lowSurrogate = ((int) ch) & 0x03FF;
+
+ // put the middle 4 bits into the bottom of yyyyyy (byte 3)
+ yyyyyy = yyyyyy | ((lowSurrogate & 0x03C0) >> 6);
+
+ // bottom 6 bits.
+ int xxxxxx = (lowSurrogate & 0x003F);
+
+ int byte1 = 0xF0 | (uuuuu >> 2); // top 3 bits of uuuuu
+ int byte2 = 0x80 | (((uuuuu & 0x03) << 4) & 0x30) | zzzz;
+ int byte3 = 0x80 | yyyyyy;
+ int byte4 = 0x80 | xxxxxx;
+
+ accum('%');
+ accum(Integer.toHexString(byte1).toUpperCase());
+ accum('%');
+ accum(Integer.toHexString(byte2).toUpperCase());
+ accum('%');
+ accum(Integer.toHexString(byte3).toUpperCase());
+ accum('%');
+ accum(Integer.toHexString(byte4).toUpperCase());
}
+ else
+ {
+ int high = (ch >> 12) | 0xE0; // top 4 bits
+ int middle = ((ch & 0x0FC0) >> 6) | 0x80; // middle 6 bits
+ int low = (ch & 0x3F) | 0x80; // First 6 bits, + high bit
+ accum('%');
+ accum(Integer.toHexString(high).toUpperCase());
+ accum('%');
+ accum(Integer.toHexString(middle).toUpperCase());
+ accum('%');
+ accum(Integer.toHexString(low).toUpperCase());
+ }
- accum("%");
- accum(Integer.toHexString(b2));
}
else if (ch < m_maxCharacter)
{
@@ -788,20 +877,44 @@
accum(';');
}
}
+ else if('%' == ch)
+ {
+ // If the character is a '%' number number, try to avoid double-escaping.
+ // There is a question if this is legal behavior.
+ if(((i+2) < len) && isASCIIDigit(stringArray[i+1])
+ && isASCIIDigit(stringArray[i+2]))
+ {
+ accum(ch);
+ }
+ else
+ {
+ if (doURLEscaping)
+ {
+ accum('%');
+ accum(Integer.toHexString(ch).toUpperCase());
+ }
+ else
+ accum(ch);
+ }
+
+ }
+ // Since http://www.ietf.org/rfc/rfc2396.txt refers to the URI grammar as
+ // not allowing quotes in the URI proper syntax, nor in the fragment
+ // identifier, we believe that it's OK to double escape quotes.
else if (ch == '"')
{
- accum('&');
- accum('q');
- accum('u');
- accum('o');
- accum('t');
- accum(';');
+ // Mike Kay encodes this as ", so he may know something I don't?
+ if (doURLEscaping)
+ accum("%22");
+ else
+ accum("""); // we have to escape this, I guess.
}
else
{
accum(ch);
}
}
+
}
/**
@@ -851,45 +964,16 @@
}
else
{
- if (0xd800 <= ch && ch < 0xdc00)
+ if (isUTF16Surrogate(ch))
{
-
- // UTF-16 surrogate
- int next;
-
- if (i + 1 >= strLen)
+ try
{
- throw new org.xml.sax.SAXException(
- XSLMessages.createXPATHMessage(
- XPATHErrorResources.ER_INVALID_UTF16_SURROGATE,
- new Object[]{ Integer.toHexString(ch) })); //"Invalid UTF-16 surrogate detected: "
-
- //+Integer.toHexString(ch)+ " ?");
+ i = writeUTF16Surrogate(ch, chars, i, strLen);
}
- else
+ catch(IOException ioe)
{
- next = chars[++i];
-
- if (!(0xdc00 <= next && next < 0xe000))
- throw new org.xml.sax.SAXException(
- XSLMessages.createXPATHMessage(
- XPATHErrorResources.ER_INVALID_UTF16_SURROGATE,
- new Object[]{
- Integer.toHexString(ch) + " "
- + Integer.toHexString(next) })); //"Invalid UTF-16 surrogate detected: "
-
- //+Integer.toHexString(ch)+" "+Integer.toHexString(next));
- next = ((ch - 0xd800) << 10) + next - 0xdc00 + 0x00010000;
+ throw new SAXException(ioe);
}
-
- accum("&#");
- accum(Integer.toString(next));
- accum(';');
-
- /*} else if (null != ctbc && !ctbc.canConvert(ch)) {
- accum("&#x");
- accum(Integer.toString((int)ch, 16));
- accum(";");*/
}
// The next is kind of a hack to keep from escaping in the case
1.10 +83 -68 xml-xalan/java/src/org/apache/xalan/serialize/FormatterToXML.java
Index: FormatterToXML.java
===================================================================
RCS file: /home/cvs/xml-xalan/java/src/org/apache/xalan/serialize/FormatterToXML.java,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -r1.9 -r1.10
--- FormatterToXML.java 2000/12/09 22:25:27 1.9
+++ FormatterToXML.java 2000/12/13 04:11:43 1.10
@@ -1557,8 +1557,87 @@
accum(ch, start, length);
}
+
+ /**
+ * Return true if the character is the high member of a surrogate pair.
+ */
+ static final boolean isUTF16Surrogate(char c)
+ {
+ return (c & 0xFC00) == 0xD800;
+ }
+
+ /**
+ * Once a surrogate has been detected, get the pair as a single
+ * integer value.
+ *
+ * @param c the first part of the surrogate.
+ * @param ch Character array.
+ * @param i position Where the surrogate was detected.
+ * @param end The end index of the significant characters.
+ * @return i+1.
+ * @throws org.xml.sax.SAXException if invalid UTF-16 surrogate detected.
+ */
+ int getURF16SurrogateValue(char c, char ch[], int i, int end)
+ throws org.xml.sax.SAXException
+ {
+ int next;
+ if (i + 1 >= end)
+ {
+ throw new org.xml.sax.SAXException(
+ XSLMessages.createXPATHMessage(
+ XPATHErrorResources.ER_INVALID_UTF16_SURROGATE,
+ new Object[]{ Integer.toHexString((int) c) })); //"Invalid UTF-16 surrogate detected: "
+
+ //+Integer.toHexString((int)c)+ " ?");
+ }
+ else
+ {
+ next = ch[++i];
+ if (!(0xdc00 <= next && next < 0xe000))
+ throw new org.xml.sax.SAXException(
+ XSLMessages.createXPATHMessage(
+ XPATHErrorResources.ER_INVALID_UTF16_SURROGATE,
+ new Object[]{
+ Integer.toHexString((int) c) + " "
+ + Integer.toHexString(next) })); //"Invalid UTF-16 surrogate detected: "
+
+ //+Integer.toHexString((int)c)+" "+Integer.toHexString(next));
+ next = ((c - 0xd800) << 10) + next - 0xdc00 + 0x00010000;
+ }
+ return next;
+ }
+
/**
+ * Once a surrogate has been detected, write the pair as a single
+ * character reference.
+ *
+ * @param c the first part of the surrogate.
+ * @param ch Character array.
+ * @param i position Where the surrogate was detected.
+ * @param end The end index of the significant characters.
+ * @return i+1.
+ * @throws IOException
+ * @throws org.xml.sax.SAXException if invalid UTF-16 surrogate detected.
+ */
+ protected int writeUTF16Surrogate(char c, char ch[], int i, int end)
+ throws IOException, org.xml.sax.SAXException
+ {
+ // UTF-16 surrogate
+ int surrogateValue = getURF16SurrogateValue(c, ch, i, end);
+ i++;
+
+ accum('&');
+ accum('#');
+
+ // accum('x');
+ accum(Integer.toString(surrogateValue));
+ accum(';');
+
+ return i;
+ }
+
+ /**
* Normalize the characters, but don't escape.
*
* @param ch The characters from the XML document.
@@ -1589,43 +1668,9 @@
accum("]]>");
// This needs to go into a function...
- if (0xd800 <= ((int) c) && ((int) c) < 0xdc00)
+ if (isUTF16Surrogate(c))
{
-
- // UTF-16 surrogate
- int next;
-
- if (i + 1 >= end)
- {
- throw new org.xml.sax.SAXException(
- XSLMessages.createXPATHMessage(
- XPATHErrorResources.ER_INVALID_UTF16_SURROGATE,
- new Object[]{ Integer.toHexString((int) c) })); //"Invalid UTF-16 surrogate detected: "
-
- //+Integer.toHexString((int)c)+ " ?");
- }
- else
- {
- next = ch[++i];
-
- if (!(0xdc00 <= next && next < 0xe000))
- throw new org.xml.sax.SAXException(
- XSLMessages.createXPATHMessage(
- XPATHErrorResources.ER_INVALID_UTF16_SURROGATE,
- new Object[]{
- Integer.toHexString((int) c) + " "
- + Integer.toHexString(next) })); //"Invalid UTF-16 surrogate detected: "
-
- //+Integer.toHexString((int)c)+" "+Integer.toHexString(next));
- next = ((c - 0xd800) << 10) + next - 0xdc00 + 0x00010000;
- }
-
- accum('&');
- accum('#');
-
- // accum('x');
- accum(Integer.toString(next));
- accum(';');
+ i = writeUTF16Surrogate(c, ch, i, end);
}
else
{
@@ -1656,40 +1701,10 @@
}
// This needs to go into a function...
- else if (0xd800 <= ((int) c) && ((int) c) < 0xdc00)
+ else if (isUTF16Surrogate(c))
{
-
- // UTF-16 surrogate
- int next;
- if (i + 1 >= end)
- {
- throw new org.xml.sax.SAXException(
- XSLMessages.createXPATHMessage(
- XPATHErrorResources.ER_INVALID_UTF16_SURROGATE,
- new Object[]{ Integer.toHexString((int) c) })); //"Invalid UTF-16 surrogate detected: "
-
- //+Integer.toHexString((int)c)+ " ?");
- }
- else
- {
- next = ch[++i];
-
- if (!(0xdc00 <= next && next < 0xe000))
- throw new org.xml.sax.SAXException(
- XSLMessages.createXPATHMessage(
- XPATHErrorResources.ER_INVALID_UTF16_SURROGATE,
- new Object[]{
- Integer.toHexString((int) c) + " "
- + Integer.toHexString(next) })); //"Invalid UTF-16 surrogate detected: "
-
- //+Integer.toHexString((int)c)+" "+Integer.toHexString(next));
- next = ((c - 0xd800) << 10) + next - 0xdc00 + 0x00010000;
- }
-
- accum("&#");
- accum(Integer.toString(next));
- accum(";");
+ i = writeUTF16Surrogate(c, ch, i, end);
}
else
{