You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@myfaces.apache.org by mm...@apache.org on 2005/11/10 17:49:37 UTC
svn commit: r332326 - in /myfaces/tomahawk/trunk/src:
java/org/apache/myfaces/component/html/util/ReducedHTMLParser.java
test/org/apache/myfaces/component/html/util/ReducedHTMLParserTest.java
Author: mmarinschek
Date: Thu Nov 10 08:49:23 2005
New Revision: 332326
URL: http://svn.apache.org/viewcvs?rev=332326&view=rev
Log:
fix for MYFACES-768. Thanks to Simon Kitching.
Modified:
myfaces/tomahawk/trunk/src/java/org/apache/myfaces/component/html/util/ReducedHTMLParser.java
myfaces/tomahawk/trunk/src/test/org/apache/myfaces/component/html/util/ReducedHTMLParserTest.java
Modified: myfaces/tomahawk/trunk/src/java/org/apache/myfaces/component/html/util/ReducedHTMLParser.java
URL: http://svn.apache.org/viewcvs/myfaces/tomahawk/trunk/src/java/org/apache/myfaces/component/html/util/ReducedHTMLParser.java?rev=332326&r1=332325&r2=332326&view=diff
==============================================================================
--- myfaces/tomahawk/trunk/src/java/org/apache/myfaces/component/html/util/ReducedHTMLParser.java (original)
+++ myfaces/tomahawk/trunk/src/java/org/apache/myfaces/component/html/util/ReducedHTMLParser.java Thu Nov 10 08:49:23 2005
@@ -27,6 +27,9 @@
* earlier than the tag occurred (particularly into the document HEAD section).
* This can only be implemented by buffering the response and post-processing
* it to find the relevant HTML tags and modifying the buffer as needed.
+ * <p>
+ * This class tries to do the parsing as quickly as possible; many of the
+ * details of HTML are not relevant for the purposes this class is used for.
*
* @version $Revision$ $Date$
*/
@@ -47,6 +50,8 @@
private static final int STATE_READY = 0;
private static final int STATE_IN_COMMENT = 1;
private static final int STATE_IN_TAG = 2;
+ private static final int STATE_IN_MARKED_SECTION = 3;
+ private static final int STATE_EXPECTING_ETAGO = 4;
private int offset;
private int lineNumber;
@@ -211,11 +216,10 @@
// TODO: should we consider a string to be terminated by a newline?
// that would help with runaway strings but I think that multiline
// strings *are* allowed...
-
- //
- // TODO: detect newlines within strings and increment lineNumber.
- // This isn't so important, though; they aren't common and being a
- // few lines out in an error message isn't serious either.
+ //
+ // TODO: detect newlines within strings and increment lineNumber.
+ // This isn't so important, though; they aren't common and being a
+ // few lines out in an error message isn't serious either.
StringBuffer stringBuf = new StringBuffer();
boolean escaping = false;
while (!isFinished()) {
@@ -271,7 +275,7 @@
* @param s is a set of characters that should not be discarded.
*/
void consumeExcept(String s) {
- boolean crSeen = false;
+ boolean crSeen = false;
while (offset < seq.length()) {
char c = seq.charAt(offset);
@@ -290,6 +294,16 @@
crSeen = false;
}
+ // Track line number for error messages.
+ if (c == '\r') {
+ ++lineNumber;
+ crSeen = true;
+ } else if ((c == '\n') && !crSeen) {
+ ++lineNumber;
+ } else {
+ crSeen = false;
+ }
+
++offset;
}
}
@@ -306,8 +320,19 @@
lineNumber = 1;
offset = 0;
+ int lastOffset = offset -1;
while (offset < seq.length())
{
+ // Sanity check; each pass through this loop must increase the offset.
+ // Failure to do this means a hang situation has occurred.
+ if (offset <= lastOffset)
+ {
+ // throw new RuntimeException("Infinite loop detected in ReducedHTMLParser");
+ log.error("Infinite loop detected in ReducedHTMLParser; parsing skipped");
+ //return;
+ }
+ lastOffset = offset;
+
if (state == STATE_READY) {
// in this state, nothing but "<" has any significance
consumeExcept("<");
@@ -316,30 +341,38 @@
}
if (consumeMatch("<!--")) {
- // VERIFY: can "< ! --" start a comment?
+ // Note that whitespace is *not* permitted in <!--
state = STATE_IN_COMMENT;
- } else if (consumeMatch("<!")) {
- // xml processing instruction or <!DOCTYPE> tag
- // we don't need to actually do anything here
- log.debug("PI found at line " + getCurrentLineNumber());
+ } else if (consumeMatch("<![")) {
+ // Start of a "marked section", eg "<![CDATA" or
+ // "<![INCLUDE" or "<![IGNORE". These always terminate
+ // with "]]>"
+ log.debug("Marked section found at line " + getCurrentLineNumber());
+ state = STATE_IN_MARKED_SECTION;
+ } else if (consumeMatch("<!DOCTYPE")) {
+ log.debug("DOCTYPE found at line " + getCurrentLineNumber());
+ // we don't need to actually do anything here; the
+ // tag can't contain a bare "<", so the first "<"
+ // indicates the start of the next real tag.
+ //
+ // TODO: Handle case where the DOCTYPE includes an internal DTD. In
+ // that case there *will* be embedded < chars in the document. However
+ // that's very unlikely to be used in a JSF page, so this is pretty low
+ // priority.
+ } else if (consumeMatch("<?")) {
+ // xml processing instruction or <!DOCTYPE> tag
+ // we don't need to actually do anything here; the
+ // tag can't contain a bare "<", so the first "<"
+ // indicates the start of the next real tag.
+ log.debug("PI found at line " + getCurrentLineNumber());
} else if (consumeMatch("</")) {
- // VERIFY: is "< / foo >" a valid end-tag?
-
- int tagStart = offset - 2;
- String tagName = consumeElementName();
- consumeWhitespace();
- if (!consumeMatch(">")) {
- throw new Error("Malformed end tag");
+ if (!processEndTag()) {
+ // message already logged
+ return;
}
- // We can't verify that the tag names balance because this is HTML
- // we are processing, not XML.
-
// stay in state READY
state = STATE_READY;
-
- // inform user that the tag has been closed
- closedTag(tagStart, offset, tagName);
} else if (consumeMatch("<")) {
// We can't tell the user that the tag has closed until after we have
// processed any attributes and found the real end of the tag. So save
@@ -363,9 +396,10 @@
}
if (state == STATE_IN_COMMENT) {
- // VERIFY: does "-- >" close a comment?
+ // TODO: handle "-- >", which is a valid way to close a
+ // comment according to the specs.
- // in this state, nothing but "-->" has any significance
+ // in this state, nothing but "--" has any significance
consumeExcept("-");
if (isFinished()) {
break;
@@ -393,8 +427,17 @@
currentTagStart = -1;
currentTagName = null;
} else if (consumeMatch(">")) {
+ if (currentTagName.equalsIgnoreCase("script")
+ || currentTagName.equalsIgnoreCase("style")) {
+ // We've just started a special tag which can contain anything except
+ // the ETAGO marker ("</"). See
+ // http://www.w3.org/TR/REC-html40/appendix/notes.html#notes-specifying-data
+ state = STATE_EXPECTING_ETAGO;
+ } else {
+ state = STATE_READY;
+ }
+
// end of open tag, but not end of element
- state = STATE_READY;
openedTag(currentTagStart, offset, currentTagName);
// and reset vars just in case...
@@ -403,17 +446,94 @@
} else {
// xml attribute
String attrName = consumeAttrName();
- consumeWhitespace();
-
- // html can have "stand-alone" attributes with no following equals sign
- if (consumeMatch("=")) {
- String attrValue = consumeAttrValue();
+ if (attrName == null) {
+ // Oops, we found something quite unexpected in this tag.
+ // The best we can do is probably to drop back to looking
+ // for "/>", though that does risk us misinterpreting the
+ // contents of an attribute's associated string value.
+ log.warn("Invalid tag found: unexpected input while looking for attr name or '/>'"
+ + " at line " + getCurrentLineNumber());
+ state = STATE_EXPECTING_ETAGO;
+ // and consume one character
+ ++offset;
+ } else {
+ consumeWhitespace();
+
+ // html can have "stand-alone" attributes with no following equals sign
+ if (consumeMatch("=")) {
+ String attrValue = consumeAttrValue();
+ }
}
}
continue;
}
+
+ if (state == STATE_IN_MARKED_SECTION) {
+ // in this state, nothing but "]]>" has any significance
+ consumeExcept("]");
+ if (isFinished()) {
+ break;
+ }
+
+ if (consumeMatch("]]>")) {
+ state = STATE_READY;
+ } else {
+ // false call; ] is not end of cdata section
+ consumeMatch("]");
+ }
+
+ continue;
+ }
+
+ if (state == STATE_EXPECTING_ETAGO) {
+ // The term "ETAGO" is the official spec term for "</".
+ consumeExcept("<");
+ if (isFinished()) {
+ log.debug("Malformed input page; input terminated while tag not closed.");
+ break;
+ }
+
+ if (consumeMatch("</")) {
+ if (!processEndTag()) {
+ return;
+ }
+ state = STATE_READY;
+ } else {
+ // false call; < does not start an ETAGO
+ consumeMatch("<");
+ }
+
+ continue;
+ }
+ }
+ }
+
+ /**
+ * Invoked when "</" has been seen in the input, this method
+ * handles the parsing of the end tag and the invocation of the
+ * appropriate callback method.
+ *
+ * @return true if the tag was successfully parsed, and false
+ * if there was a fatal parsing error.
+ */
+ private boolean processEndTag() {
+ int tagStart = offset - 2;
+ String tagName = consumeElementName();
+ consumeWhitespace();
+ if (!consumeMatch(">")) {
+ log.error("Malformed end tag at line " + getCurrentLineNumber()
+ + "; skipping parsing");
+ return false;
}
+
+
+ // inform user that the tag has been closed
+ closedTag(tagStart, offset, tagName);
+
+ // We can't verify that the tag names balance because this is HTML
+ // we are processing, not XML.
+ return true;
}
/**
Modified: myfaces/tomahawk/trunk/src/test/org/apache/myfaces/component/html/util/ReducedHTMLParserTest.java
URL: http://svn.apache.org/viewcvs/myfaces/tomahawk/trunk/src/test/org/apache/myfaces/component/html/util/ReducedHTMLParserTest.java?rev=332326&r1=332325&r2=332326&view=diff
==============================================================================
--- myfaces/tomahawk/trunk/src/test/org/apache/myfaces/component/html/util/ReducedHTMLParserTest.java (original)
+++ myfaces/tomahawk/trunk/src/test/org/apache/myfaces/component/html/util/ReducedHTMLParserTest.java Thu Nov 10 08:49:23 2005
@@ -29,20 +29,24 @@
{
public static class ParseCallbackListener implements CallbackListener
{
- // records the offset immedately after <head>
- int headerInsertPosition = -1;
-
- // records the offset immediately after <body>
- int bodyInsertPosition = -1;
-
- // records the offset immediately before <body>
- int beforeBodyPosition = -1;
+ int beforeHeadStart = -1;
+ int afterHeadStart = -1;
+ int beforeHeadEnd = -1;
+ int afterHeadEnd = -1;
+ int beforeBodyStart = -1;
+ int afterBodyStart = -1;
+ int beforeBodyEnd = -1;
+ int afterBodyEnd = -1;
public void openedStartTag(int charIndex, int tagIdentifier)
{
- if (tagIdentifier == ReducedHTMLParser.BODY_TAG)
+ if (tagIdentifier == ReducedHTMLParser.HEAD_TAG)
+ {
+ beforeHeadStart = charIndex;
+ }
+ else if (tagIdentifier == ReducedHTMLParser.BODY_TAG)
{
- beforeBodyPosition = charIndex;
+ beforeBodyStart = charIndex;
}
}
@@ -50,20 +54,36 @@
{
if (tagIdentifier == ReducedHTMLParser.HEAD_TAG)
{
- headerInsertPosition = charIndex;
+ afterHeadStart = charIndex;
}
else if (tagIdentifier == ReducedHTMLParser.BODY_TAG)
{
- bodyInsertPosition = charIndex;
+ afterBodyStart = charIndex;
}
}
public void openedEndTag(int charIndex, int tagIdentifier)
{
+ if (tagIdentifier == ReducedHTMLParser.HEAD_TAG)
+ {
+ beforeHeadEnd = charIndex;
+ }
+ else if (tagIdentifier == ReducedHTMLParser.BODY_TAG)
+ {
+ beforeBodyEnd = charIndex;
+ }
}
public void closedEndTag(int charIndex, int tagIdentifier)
{
+ if (tagIdentifier == ReducedHTMLParser.HEAD_TAG)
+ {
+ afterHeadEnd = charIndex;
+ }
+ else if (tagIdentifier == ReducedHTMLParser.BODY_TAG)
+ {
+ afterBodyEnd = charIndex;
+ }
}
public void attribute(int charIndex, int tagIdentifier, String key, String value)
@@ -191,7 +211,8 @@
assertFalse("Match non-matching pattern", parser.consumeMatch("aa"));
}
- public void testConsumeElementName() {
+ public void testConsumeElementName()
+ {
CharSequence seq = " foo t:foo t:FooBar t:foo_bar element-name/>";
CallbackListener listener = new ParseCallbackListener();
ReducedHTMLParser parser = new ReducedHTMLParser(seq, listener);
@@ -213,7 +234,8 @@
assertEquals("Element name matched", "element-name", name5);
}
- public void testConsumeStringBasic() {
+ public void testConsumeStringBasic()
+ {
CharSequence s1 = "'string1' \"string2\"";
CallbackListener listener = new ParseCallbackListener();
ReducedHTMLParser parser = new ReducedHTMLParser(s1, listener);
@@ -233,7 +255,8 @@
assertEquals("String correctly parsed", "string2", str2);
}
- public void testConsumeStringEscapedQuote() {
+ public void testConsumeStringEscapedQuote()
+ {
char quoteMark = '\'';
// build literal sequence 'don\'t quote me' not-in-the-string
@@ -254,7 +277,8 @@
assertEquals("String correctly parsed", "don't quote me", str1);
}
- public void testConsumeStringEscapedEscape() {
+ public void testConsumeStringEscapedEscape()
+ {
char quoteMark = '\'';
char backSlash = '\\';
@@ -281,7 +305,8 @@
assertEquals("String correctly parsed", "don" + backSlash, str1);
}
- public void testConsumeAttrValue() {
+ public void testConsumeAttrValue()
+ {
CharSequence seq = " bare 'quoted 1' \"quoted 2\" bare2 ";
CallbackListener listener = new ParseCallbackListener();
ReducedHTMLParser parser = new ReducedHTMLParser(seq, listener);
@@ -299,7 +324,8 @@
assertEquals("Attr value matched", "bare2", val4);
}
- public void testConsumeExcept() {
+ public void testConsumeExcept()
+ {
CharSequence seq = "abc$$#dd ee#ff-gghh ii";
CallbackListener listener = new ParseCallbackListener();
ReducedHTMLParser parser = new ReducedHTMLParser(seq, listener);
@@ -322,8 +348,104 @@
parser.consumeExcept("z");
}
+ // test parsing completes when a lessthan is not followed by an element name,
+ // and there is just whitespace up to end of the input.
+ public void testParseBadTagNoElementName1()
+ {
+ String s = "xxxx \n\n <# \n\n";
+ CallbackListener listener = new ParseCallbackListener();
+ ReducedHTMLParser parser = new ReducedHTMLParser(s, listener);
+
+ parser.parse();
+ assertTrue(parser.isFinished());
+ }
+
+ // test parsing completes when a lessthan is not followed by an element name,
+ public void testParseBadTagNoElementName2()
+ {
+ String s = "xxxx \n\n <# \n\n hi there";
+ CallbackListener listener = new ParseCallbackListener();
+ ReducedHTMLParser parser = new ReducedHTMLParser(s, listener);
+
+ parser.parse();
+ assertTrue(parser.isFinished());
+ }
+
+ // test parsing completes when an invalid char is found where an attribute name
+ // is expected.
+ public void testParseBadTagInvalidAttributeName()
+ {
+ String s = "<foo )/>";
+ CallbackListener listener = new ParseCallbackListener();
+ ReducedHTMLParser parser = new ReducedHTMLParser(s, listener);
+
+ parser.parse();
+ assertTrue(parser.isFinished());
+ }
+
+ // test CDATA sections are handled
+ public void testParseCDATA()
+ {
+ String s = "xx<head> <![CDATA[ <head> ]]> <body>";
+ ParseCallbackListener listener = new ParseCallbackListener();
+ ReducedHTMLParser parser = new ReducedHTMLParser(s, listener);
+
+ parser.parse();
+ assertTrue(parser.isFinished());
+ assertEquals("CDATA works", 8, listener.afterHeadStart);
+ assertEquals("CDATA works", 30, listener.beforeBodyStart);
+ }
+
+ // test PI sections are handled
+ public void testParsePI()
+ {
+ String s = "<?xml version=\"1.0\"?> xx<head> ";
+ ParseCallbackListener listener = new ParseCallbackListener();
+ ReducedHTMLParser parser = new ReducedHTMLParser(s, listener);
+
+ parser.parse();
+ assertTrue(parser.isFinished());
+ assertEquals("PI works", 30, listener.afterHeadStart);
+ }
+
+ // Test script element support; the spec states that a <script> or
+ // <style> tag can contain anything except "/>"
+ public void testScript()
+ {
+ String s1 = "<head>";
+ String s2 = "<script type='text/javascript'>"
+ + "if (1<2) alert('foo');\n"
+ + "if (1>2) alert('bar');\n"
+ + "</script>";
+ String s3 = "</head>";
+ String s4 = "<body>";
+ String s5 = "</body>";
+
+ StringBuffer buf = new StringBuffer();
+ buf.append(s1);
+ buf.append(s2);
+ buf.append(s3);
+ buf.append(s4);
+ buf.append(s5);
+
+ ParseCallbackListener listener = new ParseCallbackListener();
+ ReducedHTMLParser parser = new ReducedHTMLParser(buf.toString(), listener);
+
+ parser.parse();
+ assertTrue(parser.isFinished());
+ assertEquals("Script works", s1.length(), listener.afterHeadStart);
+ int beforeHeadEnd = s1.length() + s2.length();
+ assertEquals("Script works", beforeHeadEnd, listener.beforeHeadEnd);
+ int beforeBodyStart = beforeHeadEnd + s3.length();
+ assertEquals("Script works", beforeBodyStart, listener.beforeBodyStart);
+ int beforeBodyEnd = beforeBodyStart + s4.length();
+ assertEquals("Script works", beforeBodyEnd, listener.beforeBodyEnd);
+ }
+
// test the full parse method
- public void testParse() {
+ public void testParse()
+ {
+ String s0 = "<!DOCTYPE PUBLIC \"sss\" \"http:foo\">\n";
String s1 = "<html><head>";
String s2 = "\n<!-- a comment --><title>foo</title>";
String s3 = "</head>";
@@ -338,6 +460,7 @@
String s8 = "</body> </html>";
StringBuffer buf = new StringBuffer();
+ buf.append(s0);
buf.append(s1);
buf.append(s2);
buf.append(s3);
@@ -354,13 +477,13 @@
// check that listener has correctly computed the offset to the char just
// before the </head> tag starts.
- int afterHeadPos = s1.length();
- assertEquals("Pos after <head> tag ", afterHeadPos, listener.headerInsertPosition);
+ int afterHeadStart = s0.length() + s1.length();
+ assertEquals("Pos after <head> tag ", afterHeadStart, listener.afterHeadStart);
- int beforeBodyPos = s1.length() + s2.length() + s3.length();
- assertEquals("Pos before <body> tag", beforeBodyPos, listener.beforeBodyPosition);
+ int beforeBodyStart = afterHeadStart + s2.length() + s3.length();
+ assertEquals("Pos before <body> tag", beforeBodyStart, listener.beforeBodyStart);
- int afterBodyPos = s1.length() + s2.length() + s3.length() + s4.length();
- assertEquals("Pos after <body> tag", afterBodyPos, listener.bodyInsertPosition);
+ int afterBodyStart = beforeBodyStart + s4.length();
+ assertEquals("Pos after <body> tag", afterBodyStart, listener.afterBodyStart);
}
}