You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@myfaces.apache.org by Simon Kitching <sk...@obsidium.com> on 2005/11/01 00:08:34 UTC

Re: ReducedHTMLParser issues

Martin Marinschek wrote:
> Don't stress yourself - it's just the nightly build, so not too big of a problem.

Thanks, but it's hopefully done anyway.

changes:
  * Handle DOCTYPE and Processing Instruction commands in input HTML
  * Track line# of input for error messages
  * Remove some debugging printlns

I can also provide a patch soon to format the code to the MyFaces 
convention rather than the Sun convention if you wish. Sorry, my Eclipse 
is set up to format stuff that way automatically and I forgot to 
reformat before posting.

Regards,

Simon

Re: ReducedHTMLParser issues

Posted by Martin Marinschek <ma...@gmail.com>.
Ok,

even though the patch didn't work - I applied it line by line for the
parser as I needed this thing working right now.

Can you redo the test patch in any case?

regards,

Martin

On 11/1/05, Martin Marinschek <ma...@gmail.com> wrote:
> Simon,
>
> I don't seem to be able to apply your patch again - an 'unknown line
> type was found in line 12'.
>
> Can you do it again and attach it to our old JIRA issue? I have
> reopened it for this purpose.
>
> regards,
>
> Martin
>
> On 11/1/05, Simon Kitching <sk...@obsidium.com> wrote:
> > Martin Marinschek wrote:
> > > Don't stress yourself - it's just the nightly build, so not too big of a problem.
> >
> > Thanks, but it's hopefully done anyway.
> >
> > changes:
> >   * Handle DOCTYPE and Processing Instruction commands in input HTML
> >   * Track line# of input for error messages
> >   * Remove some debugging printlns
> >
> > I can also provide a patch soon to format the code to the MyFaces
> > convention rather than the Sun convention if you wish. Sorry, my Eclipse
> > is set up to format stuff that way automatically and I forgot to
> > reformat before posting.
> >
> > Regards,
> >
> > Simon
> >
> >
> > Index: ReducedHTMLParser.java
> > ===================================================================
> > --- ReducedHTMLParser.java      (revision 329922)
> > +++ ReducedHTMLParser.java      (working copy)
> > @@ -49,6 +49,7 @@
> >      private static final int STATE_IN_TAG = 2;
> >
> >      private int offset;
> > +    private int lineNumber;
> >      private CharSequence seq;
> >      private CallbackListener listener;
> >
> > @@ -75,15 +76,32 @@
> >          return offset >= seq.length();
> >      }
> >
> > +    int getCurrentLineNumber() {
> > +        return lineNumber;
> > +    }
> > +
> >      /**
> >       * Advance the current parse position over any whitespace characters.
> >       */
> >      void consumeWhitespace() {
> > +        boolean crSeen = false;
> > +
> >          while (offset < seq.length()) {
> >              char c = seq.charAt(offset);
> >              if (!Character.isWhitespace(c)) {
> >                  break;
> >              }
> > +
> > +            // Track line number for error messages.
> > +            if (c == '\r') {
> > +                ++lineNumber;
> > +                crSeen = true;
> > +            } else if ((c == '\n') && !crSeen) {
> > +                ++lineNumber;
> > +            } else {
> > +                crSeen = false;
> > +            }
> > +
> >              ++offset;
> >          }
> >      }
> > @@ -193,6 +211,10 @@
> >          // TODO: should we consider a string to be terminated by a newline?
> >          // that would help with runaway strings but I think that multiline
> >          // strings *are* allowed...
> > +        //
> > +        // TODO: detect newlines within strings and increment lineNumber.
> > +        // This isn't so important, though; they aren't common and being a
> > +        // few lines out in an error message isn't serious either.
> >          StringBuffer stringBuf = new StringBuffer();
> >          boolean escaping = false;
> >          while (!isFinished()) {
> > @@ -248,6 +270,8 @@
> >       * @param s is a set of characters that should not be discarded.
> >       */
> >      void consumeExcept(String s) {
> > +        boolean crSeen = false;
> > +
> >          while (offset < seq.length()) {
> >              char c = seq.charAt(offset);
> >              if (s.indexOf(c) >= 0) {
> > @@ -255,6 +279,16 @@
> >                  return;
> >              }
> >
> > +            // Track line number for error messages.
> > +            if (c == '\r') {
> > +                ++lineNumber;
> > +                crSeen = true;
> > +            } else if ((c == '\n') && !crSeen) {
> > +                ++lineNumber;
> > +            } else {
> > +                crSeen = false;
> > +            }
> > +
> >              ++offset;
> >          }
> >      }
> > @@ -269,6 +303,7 @@
> >          int currentTagStart = -1;
> >          String currentTagName = null;
> >
> > +        lineNumber = 1;
> >          offset = 0;
> >          while (offset < seq.length())
> >          {
> > @@ -282,6 +317,10 @@
> >                  if (consumeMatch("<!--")) {
> >                      // VERIFY: can "< ! --" start a comment?
> >                      state = STATE_IN_COMMENT;
> > +                } else if (consumeMatch("<!")) {
> > +                    // xml processing instruction or <!DOCTYPE> tag
> > +                    // we don't need to actually do anything here
> > +                    log.debug("PI found at line " + getCurrentLineNumber());
> >                  } else if (consumeMatch("</")) {
> >                      // VERIFY: is "< / foo >" a valid end-tag?
> >
> > @@ -306,10 +345,17 @@
> >                      // the current info until the end of this tag.
> >                      currentTagStart = offset - 1;
> >                      currentTagName = consumeElementName();
> > -                    state = STATE_IN_TAG;
> > +                    if (currentTagName == null) {
> > +                        log.warn("Invalid HTML; bare lessthan sign found at line "
> > +                            + getCurrentLineNumber());
> > +                        // remain in STATE_READY; this isn't really the start of
> > +                        // an xml element.
> > +                    } else {
> > +                        state = STATE_IN_TAG;
> > +                    }
> >                  } else {
> >                      // should never get here
> > -                    throw new Error("Internal error");
> > +                    throw new Error("Internal error at line " + getCurrentLineNumber());
> >                  }
> >
> >                  continue;
> > @@ -378,7 +424,6 @@
> >       */
> >      void openedTag(int startOffset, int endOffset, String tagName) {
> >          log.debug("Found open tag at " + startOffset + ":" + endOffset + ":" + tagName);
> > -        System.out.println("Found open tag at " + startOffset + ":" + endOffset + ":" + tagName);
> >
> >          if ("head".equalsIgnoreCase(tagName)) {
> >              listener.openedStartTag(startOffset, HEAD_TAG);
> > @@ -394,7 +439,6 @@
> >
> >      void closedTag(int startOffset, int endOffset, String tagName) {
> >          log.debug("Found close tag at " + startOffset + ":" + endOffset + ":" + tagName);
> > -        System.out.println("Found close tag at " + startOffset + ":" + endOffset + ":" + tagName);
> >
> >          if ("head".equalsIgnoreCase(tagName)) {
> >              listener.openedEndTag(startOffset, HEAD_TAG);
> >
> >
> > Index: ReducedHTMLParserTest.java
> > ===================================================================
> > --- ReducedHTMLParserTest.java  (revision 329925)
> > +++ ReducedHTMLParserTest.java  (working copy)
> > @@ -322,8 +322,19 @@
> >          parser.consumeExcept("z");
> >      }
> >
> > +    // test parsing completes when invalid tag found.
> > +    public void testParseBadTag() {
> > +        String s = "xxxx \n\n <# \n\n";
> > +        CallbackListener listener = new ParseCallbackListener();
> > +        ReducedHTMLParser parser = new ReducedHTMLParser(s, listener);
> > +
> > +        parser.parse();
> > +        assertTrue(parser.isFinished());
> > +    }
> > +
> >      // test the full parse method
> >      public void testParse() {
> > +        String s0 = "<!DOCTYPE PUBLIC \"sss\" \"http:foo\">\n";
> >          String s1 = "<html><head>";
> >          String s2 = "\n<!-- a comment --><title>foo</title>";
> >          String s3 = "</head>";
> > @@ -338,6 +349,7 @@
> >          String s8 = "</body> </html>";
> >
> >          StringBuffer buf = new StringBuffer();
> > +        buf.append(s0);
> >          buf.append(s1);
> >          buf.append(s2);
> >          buf.append(s3);
> > @@ -354,13 +366,13 @@
> >
> >          // check that listener has correctly computed the offset to the char just
> >          // before the </head> tag starts.
> > -        int afterHeadPos = s1.length();
> > +        int afterHeadPos = s0.length() + s1.length();
> >          assertEquals("Pos after <head> tag ", afterHeadPos, listener.headerInsertPosition);
> >
> > -        int beforeBodyPos = s1.length() + s2.length() + s3.length();
> > +        int beforeBodyPos = afterHeadPos + s2.length() + s3.length();
> >          assertEquals("Pos before <body> tag", beforeBodyPos, listener.beforeBodyPosition);
> >
> > -        int afterBodyPos = s1.length() + s2.length() + s3.length() + s4.length();
> > +        int afterBodyPos = beforeBodyPos + s4.length();
> >          assertEquals("Pos after <body> tag", afterBodyPos, listener.bodyInsertPosition);
> >      }
> >  }
> >
> >
> >
>
>
> --
>
> http://www.irian.at
> Your JSF powerhouse -
> JSF Trainings in English and German
>


--

http://www.irian.at
Your JSF powerhouse -
JSF Trainings in English and German

Re: ReducedHTMLParser issues

Posted by Martin Marinschek <ma...@gmail.com>.
Simon,

I don't seem to be able to apply your patch again - an 'unknown line
type was found in line 12'.

Can you do it again and attach it to our old JIRA issue? I have
reopened it for this purpose.

regards,

Martin

On 11/1/05, Simon Kitching <sk...@obsidium.com> wrote:
> Martin Marinschek wrote:
> > Don't stress yourself - it's just the nightly build, so not too big of a problem.
>
> Thanks, but it's hopefully done anyway.
>
> changes:
>   * Handle DOCTYPE and Processing Instruction commands in input HTML
>   * Track line# of input for error messages
>   * Remove some debugging printlns
>
> I can also provide a patch soon to format the code to the MyFaces
> convention rather than the Sun convention if you wish. Sorry, my Eclipse
> is set up to format stuff that way automatically and I forgot to
> reformat before posting.
>
> Regards,
>
> Simon
>
>
> Index: ReducedHTMLParser.java
> ===================================================================
> --- ReducedHTMLParser.java      (revision 329922)
> +++ ReducedHTMLParser.java      (working copy)
> @@ -49,6 +49,7 @@
>      private static final int STATE_IN_TAG = 2;
>
>      private int offset;
> +    private int lineNumber;
>      private CharSequence seq;
>      private CallbackListener listener;
>
> @@ -75,15 +76,32 @@
>          return offset >= seq.length();
>      }
>
> +    int getCurrentLineNumber() {
> +        return lineNumber;
> +    }
> +
>      /**
>       * Advance the current parse position over any whitespace characters.
>       */
>      void consumeWhitespace() {
> +        boolean crSeen = false;
> +
>          while (offset < seq.length()) {
>              char c = seq.charAt(offset);
>              if (!Character.isWhitespace(c)) {
>                  break;
>              }
> +
> +            // Track line number for error messages.
> +            if (c == '\r') {
> +                ++lineNumber;
> +                crSeen = true;
> +            } else if ((c == '\n') && !crSeen) {
> +                ++lineNumber;
> +            } else {
> +                crSeen = false;
> +            }
> +
>              ++offset;
>          }
>      }
> @@ -193,6 +211,10 @@
>          // TODO: should we consider a string to be terminated by a newline?
>          // that would help with runaway strings but I think that multiline
>          // strings *are* allowed...
> +        //
> +        // TODO: detect newlines within strings and increment lineNumber.
> +        // This isn't so important, though; they aren't common and being a
> +        // few lines out in an error message isn't serious either.
>          StringBuffer stringBuf = new StringBuffer();
>          boolean escaping = false;
>          while (!isFinished()) {
> @@ -248,6 +270,8 @@
>       * @param s is a set of characters that should not be discarded.
>       */
>      void consumeExcept(String s) {
> +        boolean crSeen = false;
> +
>          while (offset < seq.length()) {
>              char c = seq.charAt(offset);
>              if (s.indexOf(c) >= 0) {
> @@ -255,6 +279,16 @@
>                  return;
>              }
>
> +            // Track line number for error messages.
> +            if (c == '\r') {
> +                ++lineNumber;
> +                crSeen = true;
> +            } else if ((c == '\n') && !crSeen) {
> +                ++lineNumber;
> +            } else {
> +                crSeen = false;
> +            }
> +
>              ++offset;
>          }
>      }
> @@ -269,6 +303,7 @@
>          int currentTagStart = -1;
>          String currentTagName = null;
>
> +        lineNumber = 1;
>          offset = 0;
>          while (offset < seq.length())
>          {
> @@ -282,6 +317,10 @@
>                  if (consumeMatch("<!--")) {
>                      // VERIFY: can "< ! --" start a comment?
>                      state = STATE_IN_COMMENT;
> +                } else if (consumeMatch("<!")) {
> +                    // xml processing instruction or <!DOCTYPE> tag
> +                    // we don't need to actually do anything here
> +                    log.debug("PI found at line " + getCurrentLineNumber());
>                  } else if (consumeMatch("</")) {
>                      // VERIFY: is "< / foo >" a valid end-tag?
>
> @@ -306,10 +345,17 @@
>                      // the current info until the end of this tag.
>                      currentTagStart = offset - 1;
>                      currentTagName = consumeElementName();
> -                    state = STATE_IN_TAG;
> +                    if (currentTagName == null) {
> +                        log.warn("Invalid HTML; bare lessthan sign found at line "
> +                            + getCurrentLineNumber());
> +                        // remain in STATE_READY; this isn't really the start of
> +                        // an xml element.
> +                    } else {
> +                        state = STATE_IN_TAG;
> +                    }
>                  } else {
>                      // should never get here
> -                    throw new Error("Internal error");
> +                    throw new Error("Internal error at line " + getCurrentLineNumber());
>                  }
>
>                  continue;
> @@ -378,7 +424,6 @@
>       */
>      void openedTag(int startOffset, int endOffset, String tagName) {
>          log.debug("Found open tag at " + startOffset + ":" + endOffset + ":" + tagName);
> -        System.out.println("Found open tag at " + startOffset + ":" + endOffset + ":" + tagName);
>
>          if ("head".equalsIgnoreCase(tagName)) {
>              listener.openedStartTag(startOffset, HEAD_TAG);
> @@ -394,7 +439,6 @@
>
>      void closedTag(int startOffset, int endOffset, String tagName) {
>          log.debug("Found close tag at " + startOffset + ":" + endOffset + ":" + tagName);
> -        System.out.println("Found close tag at " + startOffset + ":" + endOffset + ":" + tagName);
>
>          if ("head".equalsIgnoreCase(tagName)) {
>              listener.openedEndTag(startOffset, HEAD_TAG);
>
>
> Index: ReducedHTMLParserTest.java
> ===================================================================
> --- ReducedHTMLParserTest.java  (revision 329925)
> +++ ReducedHTMLParserTest.java  (working copy)
> @@ -322,8 +322,19 @@
>          parser.consumeExcept("z");
>      }
>
> +    // test parsing completes when invalid tag found.
> +    public void testParseBadTag() {
> +        String s = "xxxx \n\n <# \n\n";
> +        CallbackListener listener = new ParseCallbackListener();
> +        ReducedHTMLParser parser = new ReducedHTMLParser(s, listener);
> +
> +        parser.parse();
> +        assertTrue(parser.isFinished());
> +    }
> +
>      // test the full parse method
>      public void testParse() {
> +        String s0 = "<!DOCTYPE PUBLIC \"sss\" \"http:foo\">\n";
>          String s1 = "<html><head>";
>          String s2 = "\n<!-- a comment --><title>foo</title>";
>          String s3 = "</head>";
> @@ -338,6 +349,7 @@
>          String s8 = "</body> </html>";
>
>          StringBuffer buf = new StringBuffer();
> +        buf.append(s0);
>          buf.append(s1);
>          buf.append(s2);
>          buf.append(s3);
> @@ -354,13 +366,13 @@
>
>          // check that listener has correctly computed the offset to the char just
>          // before the </head> tag starts.
> -        int afterHeadPos = s1.length();
> +        int afterHeadPos = s0.length() + s1.length();
>          assertEquals("Pos after <head> tag ", afterHeadPos, listener.headerInsertPosition);
>
> -        int beforeBodyPos = s1.length() + s2.length() + s3.length();
> +        int beforeBodyPos = afterHeadPos + s2.length() + s3.length();
>          assertEquals("Pos before <body> tag", beforeBodyPos, listener.beforeBodyPosition);
>
> -        int afterBodyPos = s1.length() + s2.length() + s3.length() + s4.length();
> +        int afterBodyPos = beforeBodyPos + s4.length();
>          assertEquals("Pos after <body> tag", afterBodyPos, listener.bodyInsertPosition);
>      }
>  }
>
>
>


--

http://www.irian.at
Your JSF powerhouse -
JSF Trainings in English and German