You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@myfaces.apache.org by mm...@apache.org on 2005/11/10 17:49:37 UTC

svn commit: r332326 - in /myfaces/tomahawk/trunk/src: java/org/apache/myfaces/component/html/util/ReducedHTMLParser.java test/org/apache/myfaces/component/html/util/ReducedHTMLParserTest.java

Author: mmarinschek
Date: Thu Nov 10 08:49:23 2005
New Revision: 332326

URL: http://svn.apache.org/viewcvs?rev=332326&view=rev
Log:
fix for MYFACES-768. Thanks to Simon Kitching.

Modified:
    myfaces/tomahawk/trunk/src/java/org/apache/myfaces/component/html/util/ReducedHTMLParser.java
    myfaces/tomahawk/trunk/src/test/org/apache/myfaces/component/html/util/ReducedHTMLParserTest.java

Modified: myfaces/tomahawk/trunk/src/java/org/apache/myfaces/component/html/util/ReducedHTMLParser.java
URL: http://svn.apache.org/viewcvs/myfaces/tomahawk/trunk/src/java/org/apache/myfaces/component/html/util/ReducedHTMLParser.java?rev=332326&r1=332325&r2=332326&view=diff
==============================================================================
--- myfaces/tomahawk/trunk/src/java/org/apache/myfaces/component/html/util/ReducedHTMLParser.java (original)
+++ myfaces/tomahawk/trunk/src/java/org/apache/myfaces/component/html/util/ReducedHTMLParser.java Thu Nov 10 08:49:23 2005
@@ -27,6 +27,9 @@
  * earlier than the tag occurred (particularly into the document HEAD section).
  * This can only be implemented by buffering the response and post-processing
  * it to find the relevant HTML tags and modifying the buffer as needed.
+ * <p>
+ * This class tries to do the parsing as quickly as possible; many of the
+ * details of HTML are not relevant for the purposes this class is used for.
  * 
  * @version $Revision$ $Date$
  */
@@ -47,6 +50,8 @@
     private static final int STATE_READY = 0;
     private static final int STATE_IN_COMMENT = 1;
     private static final int STATE_IN_TAG = 2;
+    private static final int STATE_IN_MARKED_SECTION = 3;
+    private static final int STATE_EXPECTING_ETAGO = 4;
     
     private int offset;
     private int lineNumber;
@@ -211,11 +216,10 @@
         // TODO: should we consider a string to be terminated by a newline?
         // that would help with runaway strings but I think that multiline
         // strings *are* allowed...
-
-         //
-         // TODO: detect newlines within strings and increment lineNumber.
-         // This isn't so important, though; they aren't common and being a
-         // few lines out in an error message isn't serious either.
+        //
+        // TODO: detect newlines within strings and increment lineNumber.
+        // This isn't so important, though; they aren't common and being a
+        // few lines out in an error message isn't serious either.
         StringBuffer stringBuf = new StringBuffer();
         boolean escaping = false;
         while (!isFinished()) {
@@ -271,7 +275,7 @@
      * @param s is a set of characters that should not be discarded.
      */
     void consumeExcept(String s) {
-         boolean crSeen = false;
+        boolean crSeen = false;
 
         while (offset < seq.length()) {
             char c = seq.charAt(offset);
@@ -290,6 +294,16 @@
                  crSeen = false;
              }
             
+            // Track line number for error messages.
+            if (c == '\r') {
+                ++lineNumber;
+                crSeen = true;
+            } else if ((c == '\n') && !crSeen) {
+                ++lineNumber;
+            } else {
+                crSeen = false;
+            }
+
             ++offset;
         }
     }
@@ -306,8 +320,19 @@
 
         lineNumber = 1;
         offset = 0;
+        int lastOffset = offset -1;
         while (offset < seq.length())
         {
+            // Sanity check; each pass through this loop must increase the offset.
+            // Failure to do this means a hang situation has occurred.
+            if (offset <= lastOffset)
+            {
+                // throw new RuntimeException("Infinite loop detected in ReducedHTMLParser");
+                log.error("Infinite loop detected in ReducedHTMLParser; parsing skipped");
+                //return;
+            }
+            lastOffset = offset;
+                
             if (state == STATE_READY) {
                 // in this state, nothing but "<" has any significance
                 consumeExcept("<");
@@ -316,30 +341,38 @@
                 }
 
                 if (consumeMatch("<!--")) {
-                    // VERIFY: can "< ! --" start a comment?
+                    // Note that whitespace is *not* permitted in <!--
                     state = STATE_IN_COMMENT;
-                 } else if (consumeMatch("<!")) {
-                     // xml processing instruction or <!DOCTYPE> tag
-                     // we don't need to actually do anything here
-                     log.debug("PI found at line " + getCurrentLineNumber());
+                } else if (consumeMatch("<![")) {
+                    // Start of a "marked section", eg "<![CDATA" or 
+                    // "<![INCLUDE" or "<![IGNORE". These always terminate 
+                    // with "]]>"
+                    log.debug("Marked section found at line " + getCurrentLineNumber());
+                    state = STATE_IN_MARKED_SECTION;
+                } else if (consumeMatch("<!DOCTYPE")) {
+                    log.debug("DOCTYPE found at line " + getCurrentLineNumber());
+                    // we don't need to actually do anything here; the
+                    // tag can't contain a bare "<", so the first "<"
+                    // indicates the start of the next real tag.
+                    //
+                    // TODO: Handle case where the DOCTYPE includes an internal DTD. In
+                    // that case there *will* be embedded < chars in the document. However
+                    // that's very unlikely to be used in a JSF page, so this is pretty low
+                    // priority.
+                } else if (consumeMatch("<?")) {
+                    // xml processing instruction or <!DOCTYPE> tag
+                    // we don't need to actually do anything here; the
+                    // tag can't contain a bare "<", so the first "<"
+                    // indicates the start of the next real tag.
+                    log.debug("PI found at line " + getCurrentLineNumber());
                 } else if (consumeMatch("</")) {
-                    // VERIFY: is "< / foo >" a valid end-tag?
-
-                    int tagStart = offset - 2;
-                    String tagName = consumeElementName();
-                    consumeWhitespace();
-                    if (!consumeMatch(">")) {
-                        throw new Error("Malformed end tag");
+                    if (!processEndTag()) {
+                        // message already logged
+                        return;
                     }
 
-                    // We can't verify that the tag names balance because this is HTML
-                    // we are processing, not XML.
-
                     // stay in state READY
                     state = STATE_READY;
-
-                    // inform user that the tag has been closed
-                    closedTag(tagStart, offset, tagName);
                 } else if (consumeMatch("<")) {
                     // We can't tell the user that the tag has closed until after we have
                     // processed any attributes and found the real end of the tag. So save
@@ -363,9 +396,10 @@
             }
 
             if (state == STATE_IN_COMMENT) {
-                // VERIFY: does "-- >" close a comment?
+                // TODO: handle "--  >", which is a valid way to close a
+                // comment according to the specs.
 
-                // in this state, nothing but "-->" has any significance
+                // in this state, nothing but "--" has any significance
                 consumeExcept("-");
                 if (isFinished()) {
                     break;
@@ -393,8 +427,17 @@
                     currentTagStart = -1;
                     currentTagName = null;
                 } else if (consumeMatch(">")) {
+                    if (currentTagName.equalsIgnoreCase("script") 
+                        || currentTagName.equalsIgnoreCase("style")) {
+                        // We've just started a special tag which can contain anything except
+                        // the ETAGO marker ("</"). See
+                        // http://www.w3.org/TR/REC-html40/appendix/notes.html#notes-specifying-data
+                        state = STATE_EXPECTING_ETAGO;
+                    } else {
+                        state = STATE_READY;
+                    }
+
                     // end of open tag, but not end of element
-                    state = STATE_READY;
                     openedTag(currentTagStart, offset, currentTagName);
                     
                     // and reset vars just in case...
@@ -403,17 +446,94 @@
                 } else {
                     // xml attribute
                     String attrName = consumeAttrName();
-                    consumeWhitespace();
-                    
-                    // html can have "stand-alone" attributes with no following equals sign
-                    if (consumeMatch("=")) {
-                        String attrValue = consumeAttrValue();
+                    if (attrName == null) {
+                        // Oops, we found something quite unexpected in this tag.
+                        // The best we can do is probably to drop back to looking
+                        // for "/>", though that does risk us misinterpreting the
+                        // contents of an attribute's associated string value.
+                        log.warn("Invalid tag found: unexpected input while looking for attr name or '/>'"
+                                + " at line " + getCurrentLineNumber());
+                        state = STATE_EXPECTING_ETAGO;
+                        // and consume one character
+                        ++offset;
+                    } else {
+                        consumeWhitespace();
+                        
+                        // html can have "stand-alone" attributes with no following equals sign
+                        if (consumeMatch("=")) {
+                            String attrValue = consumeAttrValue();
+                        }
                     }
                 }
                 
                 continue;
             }
+
+            if (state == STATE_IN_MARKED_SECTION) {
+                // in this state, nothing but "]]>" has any significance
+                consumeExcept("]");
+                if (isFinished()) {
+                    break;
+                }
+
+                if (consumeMatch("]]>")) {
+                    state = STATE_READY;
+                } else  {
+                    // false call; ] is not end of cdata section
+                    consumeMatch("]");
+                }
+                
+                continue;
+            }
+
+            if (state == STATE_EXPECTING_ETAGO) {
+                // The term "ETAGO" is the official spec term for "</".
+                consumeExcept("<");
+                if (isFinished()) {
+                    log.debug("Malformed input page; input terminated while tag not closed.");
+                    break;
+                }
+
+                if (consumeMatch("</")) {
+                    if (!processEndTag()) {
+                        return;
+                    }
+                    state = STATE_READY;
+                } else  {
+                    // false call; < does not start an ETAGO
+                    consumeMatch("<");
+                }
+                
+                continue;
+            }
+        }
+    }
+
+    /**
+     * Invoked when "&lt;/" has been seen in the input, this method
+     * handles the parsing of the end tag and the invocation of the
+     * appropriate callback method.
+     *  
+     * @return true if the tag was successfully parsed, and false
+     * if there was a fatal parsing error.
+     */
+    private boolean processEndTag() {
+        int tagStart = offset - 2;
+        String tagName = consumeElementName();
+        consumeWhitespace();
+        if (!consumeMatch(">")) {
+            log.error("Malformed end tag at line " + getCurrentLineNumber()
+                    + "; skipping parsing");
+            return false;
         }
+
+
+        // inform user that the tag has been closed
+        closedTag(tagStart, offset, tagName);
+
+        // We can't verify that the tag names balance because this is HTML
+        // we are processing, not XML.
+        return true;
     }
 
     /**

Modified: myfaces/tomahawk/trunk/src/test/org/apache/myfaces/component/html/util/ReducedHTMLParserTest.java
URL: http://svn.apache.org/viewcvs/myfaces/tomahawk/trunk/src/test/org/apache/myfaces/component/html/util/ReducedHTMLParserTest.java?rev=332326&r1=332325&r2=332326&view=diff
==============================================================================
--- myfaces/tomahawk/trunk/src/test/org/apache/myfaces/component/html/util/ReducedHTMLParserTest.java (original)
+++ myfaces/tomahawk/trunk/src/test/org/apache/myfaces/component/html/util/ReducedHTMLParserTest.java Thu Nov 10 08:49:23 2005
@@ -29,20 +29,24 @@
 {
     public static class ParseCallbackListener implements CallbackListener
     {
-        // records the offset immedately after <head>
-        int headerInsertPosition = -1;
-        
-        // records the offset immediately after <body>
-        int bodyInsertPosition = -1;
-        
-        // records the offset immediately before <body>
-        int beforeBodyPosition = -1;
+        int beforeHeadStart = -1;
+        int afterHeadStart = -1;
+        int beforeHeadEnd = -1;
+        int afterHeadEnd = -1;
+        int beforeBodyStart = -1;
+        int afterBodyStart = -1;
+        int beforeBodyEnd = -1;
+        int afterBodyEnd = -1;
 
         public void openedStartTag(int charIndex, int tagIdentifier)
         {
-            if (tagIdentifier == ReducedHTMLParser.BODY_TAG)
+            if (tagIdentifier == ReducedHTMLParser.HEAD_TAG)
+            {
+                beforeHeadStart = charIndex;
+            } 
+            else if (tagIdentifier == ReducedHTMLParser.BODY_TAG)
             {
-                beforeBodyPosition = charIndex;
+                beforeBodyStart = charIndex;
             }
         }
 
@@ -50,20 +54,36 @@
         {
             if (tagIdentifier == ReducedHTMLParser.HEAD_TAG)
             {
-                headerInsertPosition = charIndex;
+                afterHeadStart = charIndex;
             }
             else if (tagIdentifier == ReducedHTMLParser.BODY_TAG)
             {
-                bodyInsertPosition = charIndex;
+                afterBodyStart = charIndex;
             }
         }
 
         public void openedEndTag(int charIndex, int tagIdentifier)
         {
+            if (tagIdentifier == ReducedHTMLParser.HEAD_TAG)
+            {
+                beforeHeadEnd = charIndex;
+            } 
+            else if (tagIdentifier == ReducedHTMLParser.BODY_TAG)
+            {
+                beforeBodyEnd = charIndex;
+            }
         }
 
         public void closedEndTag(int charIndex, int tagIdentifier)
         {
+            if (tagIdentifier == ReducedHTMLParser.HEAD_TAG)
+            {
+                afterHeadEnd = charIndex;
+            }
+            else if (tagIdentifier == ReducedHTMLParser.BODY_TAG)
+            {
+                afterBodyEnd = charIndex;
+            }
         }
 
         public void attribute(int charIndex, int tagIdentifier, String key, String value)
@@ -191,7 +211,8 @@
         assertFalse("Match non-matching pattern", parser.consumeMatch("aa"));
     }
     
-    public void testConsumeElementName() {
+    public void testConsumeElementName() 
+    {
         CharSequence seq = "  foo  t:foo t:FooBar t:foo_bar element-name/>";
         CallbackListener listener = new ParseCallbackListener();
         ReducedHTMLParser parser = new ReducedHTMLParser(seq, listener);
@@ -213,7 +234,8 @@
         assertEquals("Element name matched", "element-name", name5);
     }
     
-    public void testConsumeStringBasic() {
+    public void testConsumeStringBasic() 
+    {
         CharSequence s1 = "'string1' \"string2\"";
         CallbackListener listener = new ParseCallbackListener();
         ReducedHTMLParser parser = new ReducedHTMLParser(s1, listener);
@@ -233,7 +255,8 @@
         assertEquals("String correctly parsed", "string2", str2);
     }
     
-    public void testConsumeStringEscapedQuote() {
+    public void testConsumeStringEscapedQuote() 
+    {
         char quoteMark = '\'';
         
         // build literal sequence 'don\'t quote me' not-in-the-string
@@ -254,7 +277,8 @@
         assertEquals("String correctly parsed", "don't quote me", str1);
     }
     
-    public void testConsumeStringEscapedEscape() {
+    public void testConsumeStringEscapedEscape() 
+    {
         char quoteMark = '\'';
         char backSlash = '\\';
         
@@ -281,7 +305,8 @@
         assertEquals("String correctly parsed", "don" + backSlash, str1);
     }
 
-    public void testConsumeAttrValue() {
+    public void testConsumeAttrValue() 
+    {
         CharSequence seq = "  bare 'quoted 1' \"quoted 2\" bare2 ";
         CallbackListener listener = new ParseCallbackListener();
         ReducedHTMLParser parser = new ReducedHTMLParser(seq, listener);
@@ -299,7 +324,8 @@
         assertEquals("Attr value matched", "bare2", val4);
     }
     
-    public void testConsumeExcept() {
+    public void testConsumeExcept() 
+    {
         CharSequence seq = "abc$$#dd  ee#ff-gghh ii";
         CallbackListener listener = new ParseCallbackListener();
         ReducedHTMLParser parser = new ReducedHTMLParser(seq, listener);
@@ -322,8 +348,104 @@
         parser.consumeExcept("z");
     }
 
+    // test parsing completes when a lessthan is not followed by an element name,
+    // and there is just whitespace up to end of the input.
+    public void testParseBadTagNoElementName1() 
+    {
+        String s = "xxxx \n\n <# \n\n";
+        CallbackListener listener = new ParseCallbackListener();
+        ReducedHTMLParser parser = new ReducedHTMLParser(s, listener);
+        
+        parser.parse();
+        assertTrue(parser.isFinished());
+    }
+
+    // test parsing completes when a lessthan is not followed by an element name,
+    public void testParseBadTagNoElementName2() 
+    {
+        String s = "xxxx \n\n <# \n\n hi there";
+        CallbackListener listener = new ParseCallbackListener();
+        ReducedHTMLParser parser = new ReducedHTMLParser(s, listener);
+        
+        parser.parse();
+        assertTrue(parser.isFinished());
+    }
+
+    // test parsing completes when an invalid char is found where an attribute name
+    // is expected.
+    public void testParseBadTagInvalidAttributeName() 
+    {
+        String s = "<foo )/>";
+        CallbackListener listener = new ParseCallbackListener();
+        ReducedHTMLParser parser = new ReducedHTMLParser(s, listener);
+        
+        parser.parse();
+        assertTrue(parser.isFinished());
+    }
+
+    // test CDATA sections are handled
+    public void testParseCDATA() 
+    {
+        String s = "xx<head> <![CDATA[ <head> ]]> <body>";
+        ParseCallbackListener listener = new ParseCallbackListener();
+        ReducedHTMLParser parser = new ReducedHTMLParser(s, listener);
+        
+        parser.parse();
+        assertTrue(parser.isFinished());
+        assertEquals("CDATA works", 8, listener.afterHeadStart);
+        assertEquals("CDATA works", 30, listener.beforeBodyStart);
+    }
+
+    // test PI sections are handled
+    public void testParsePI() 
+    {
+        String s = "<?xml version=\"1.0\"?> xx<head> ";
+        ParseCallbackListener listener = new ParseCallbackListener();
+        ReducedHTMLParser parser = new ReducedHTMLParser(s, listener);
+        
+        parser.parse();
+        assertTrue(parser.isFinished());
+        assertEquals("PI works", 30, listener.afterHeadStart);
+    }
+
+    // Test script element support; the spec states that a <script> or
+    // <style> tag can contain anything except "/>"
+    public void testScript() 
+    {
+        String s1 = "<head>";
+        String s2 = "<script type='text/javascript'>"
+            + "if (1<2) alert('foo');\n"
+            + "if (1>2) alert('bar');\n"
+            + "</script>";
+        String s3 = "</head>";
+        String s4 = "<body>";
+        String s5 = "</body>";
+
+        StringBuffer buf = new StringBuffer();
+        buf.append(s1);
+        buf.append(s2);
+        buf.append(s3);
+        buf.append(s4);
+        buf.append(s5);
+
+        ParseCallbackListener listener = new ParseCallbackListener();
+        ReducedHTMLParser parser = new ReducedHTMLParser(buf.toString(), listener);
+        
+        parser.parse();
+        assertTrue(parser.isFinished());
+        assertEquals("Script works", s1.length(), listener.afterHeadStart);
+        int beforeHeadEnd = s1.length() + s2.length();
+        assertEquals("Script works", beforeHeadEnd, listener.beforeHeadEnd);
+        int beforeBodyStart = beforeHeadEnd + s3.length();
+        assertEquals("Script works", beforeBodyStart, listener.beforeBodyStart);
+        int beforeBodyEnd = beforeBodyStart + s4.length();
+        assertEquals("Script works", beforeBodyEnd, listener.beforeBodyEnd);
+    }
+
     // test the full parse method
-    public void testParse() {
+    public void testParse() 
+    {
+        String s0 = "<!DOCTYPE PUBLIC \"sss\" \"http:foo\">\n";
         String s1 = "<html><head>";
         String s2 = "\n<!-- a comment --><title>foo</title>";
         String s3 = "</head>";
@@ -338,6 +460,7 @@
         String s8 = "</body> </html>";
 
         StringBuffer buf = new StringBuffer();
+        buf.append(s0);
         buf.append(s1);
         buf.append(s2);
         buf.append(s3);
@@ -354,13 +477,13 @@
         
         // check that listener has correctly computed the offset to the char just
         // before the </head> tag starts.
-        int afterHeadPos = s1.length();
-        assertEquals("Pos after <head> tag ", afterHeadPos, listener.headerInsertPosition);
+        int afterHeadStart = s0.length() + s1.length();
+        assertEquals("Pos after <head> tag ", afterHeadStart, listener.afterHeadStart);
         
-        int beforeBodyPos = s1.length() + s2.length() + s3.length();
-        assertEquals("Pos before <body> tag", beforeBodyPos, listener.beforeBodyPosition);
+        int beforeBodyStart = afterHeadStart + s2.length() + s3.length();
+        assertEquals("Pos before <body> tag", beforeBodyStart, listener.beforeBodyStart);
         
-        int afterBodyPos = s1.length() + s2.length() + s3.length() + s4.length();
-        assertEquals("Pos after <body> tag", afterBodyPos, listener.bodyInsertPosition);
+        int afterBodyStart = beforeBodyStart + s4.length();
+        assertEquals("Pos after <body> tag", afterBodyStart, listener.afterBodyStart);
     }
 }