You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@lucene.apache.org by ot...@apache.org on 2002/06/30 00:08:27 UTC

cvs commit: jakarta-lucene/src/demo/org/apache/lucene/demo/html HTMLParser.jj

otis        2002/06/29 15:08:27

  Modified:    src/demo/org/apache/lucene/demo/html HTMLParser.jj
  Log:
  - Improved HTML parser that allows one to get HTML document's meta tags' values.
  Submitted by:	Mark Harwood
  Reviewed by:	otis
  
  Revision  Changes    Path
  1.2       +48 -5     jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.jj
  
  Index: HTMLParser.jj
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.jj,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- HTMLParser.jj	26 Jan 2002 15:01:31 -0000	1.1
  +++ HTMLParser.jj	29 Jun 2002 22:08:26 -0000	1.2
  @@ -66,15 +66,20 @@
   package org.apache.lucene.demo.html;
   
   import java.io.*;
  +import java.util.Properties;
   
   public class HTMLParser {
     public static int SUMMARY_LENGTH = 200;
  -  
  +
     StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
     StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
  +  Properties metaTags=new Properties();
  +  String currentMetaTag="";
     int length = 0;
     boolean titleComplete = false;
     boolean inTitle = false;
  +  boolean inMetaTag = false;
  +  boolean inStyle = false;
     boolean inScript = false;
     boolean afterTag = false;
     boolean afterSpace = false;
  @@ -99,6 +104,21 @@
       return title.toString().trim();
     }
   
  +  public Properties getMetaTags() throws IOException,
  +InterruptedException {
  +    if (pipeIn == null)
  +      getReader();				  // spawn parsing thread
  +    while (true) {
  +      synchronized(this) {
  +	if (titleComplete || (length > SUMMARY_LENGTH))
  +	  break;
  +	wait(10);
  +      }
  +    }
  +    return metaTags;
  +  }
  +
  +
     public String getSummary() throws IOException, InterruptedException {
       if (pipeIn == null)
         getReader();				  // spawn parsing thread
  @@ -124,7 +144,7 @@
       if (pipeIn == null) {
         pipeIn = new PipedReader();
         pipeOut = new PipedWriter(pipeIn);
  -      
  +
         Thread thread = new ParserThread(this);
         thread.start();				  // start parsing
       }
  @@ -146,6 +166,13 @@
     void addText(String text) throws IOException {
       if (inScript)
         return;
  +    if (inStyle)
  +      return;
  +    if (inMetaTag)
  +    {
  +	metaTags.setProperty(currentMetaTag, text);
  +      	return;
  +    }
       if (inTitle)
         title.append(text);
       else {
  @@ -163,7 +190,7 @@
   
       afterSpace = false;
     }
  -  
  +
     void addSpace() throws IOException {
       if (inScript)
         return;
  @@ -172,7 +199,7 @@
   	title.append(" ");
         else
   	addToSummary(" ");
  -      
  +
         String space = afterTag ? eol : " ";
         length += space.length();
         pipeOut.write(space);
  @@ -220,6 +247,8 @@
   {
     t1=<TagName> {
       inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
  +    inMetaTag = t1.image.equalsIgnoreCase("<META"); // keep track if in <META>
  +    inStyle = t1.image.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
       inImg = t1.image.equalsIgnoreCase("<img");	  // keep track if in <IMG>
       if (inScript) {				  // keep track if in <SCRIPT>
         inScript = !t1.image.equalsIgnoreCase("</script");
  @@ -233,6 +262,20 @@
        {
          if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
            addText("[" + t2.image + "]");
  +
  +    	if(inMetaTag &&
  +			(  t1.image.equalsIgnoreCase("name") ||
  +			   t1.image.equalsIgnoreCase("HTTP-EQUIV")
  +			)
  +	   && t2 != null)
  +	{
  +		currentMetaTag=t2.image.toLowerCase();
  +	}
  +    	if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
  +null)
  +	{
  +		addText(t2.image);
  +	}
        }
       )?
      )?
  @@ -272,7 +315,7 @@
    |
     (<Comment2> ( <CommentText2> )* <CommentEnd2>)
   }
  -  
  +
   
   TOKEN :
   {
  
  
  

--
To unsubscribe, e-mail:   <ma...@jakarta.apache.org>
For additional commands, e-mail: <ma...@jakarta.apache.org>