You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-user@lucene.apache.org by Jan Francsi <vi...@gmx.de> on 2006/12/21 10:57:55 UTC

How to skip menu structure while parsing HTML sites?

Hello!

I'm programming a small search engine using apache lucene.
While indexing I've noticed that the menu has to be removed from the
index, because it influences the search result (searching terms that are in the menu gives all pages of the web directory as result).

Now, I wan't to put the menu into some custom tags like <!--beginM-->
... <!--endM-->.
One of Lucenes demos uses a javacc generated parser so extract Text out of HTML.
How can I modify the source of the *.jj file to skip everything between
the too custom tags?
I've tryed

SKIP:
{
        "<!--beginM-->" (~["\n", "\r"])* "<!--endM-->"

}

but it doesn't have any effect.

Here is the source of the *.jj file given by apache and modifyed by me
to process UTF-8:

// HTMLParser.jj

options {
  STATIC = false;
  OPTIMIZE_TOKEN_MANAGER = true;
  //DEBUG_LOOKAHEAD = true;
  //DEBUG_TOKEN_MANAGER = true;
  JAVA_UNICODE_ESCAPE = false;
  UNICODE_INPUT = true;

}

PARSER_BEGIN(HTMLParser)

package org.apache.lucene.demo.html;

import java.io.*;
import java.util.Properties;

public class HTMLParser {
  public static int SUMMARY_LENGTH = 200;

  StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
  StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
  Properties metaTags=new Properties();
  String currentMetaTag=null;
  String currentMetaContent=null;
  int length = 0;
  boolean titleComplete = false;
  boolean inTitle = false;
  boolean inMetaTag = false;
  boolean inStyle = false;
  boolean afterTag = false;
  boolean afterSpace = false;
  String eol = System.getProperty("line.separator");
  Reader pipeIn = null;
  Writer pipeOut;
  private MyPipedInputStream pipeInStream = null;
  private PipedOutputStream pipeOutStream = null;

  private class MyPipedInputStream extends PipedInputStream{

    public MyPipedInputStream(){
      super();
    }

    public MyPipedInputStream(PipedOutputStream src) throws
IOException{
      super(src);
    }

    public boolean full() throws IOException{
      return this.available() >= PipedInputStream.PIPE_SIZE;
    }
  }

  /**
   * @deprecated Use HTMLParser(FileInputStream) instead
   */
  public HTMLParser(File file) throws FileNotFoundException {
    this(new FileInputStream(file));
  }

  public String getTitle() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();                                // spawn parsing thread
    while (true) {
      synchronized(this) {
        if (titleComplete || pipeInStream.full())
          break;
        wait(10);
      }
    }
    return title.toString().trim();
  }

  public Properties getMetaTags() throws IOException,
InterruptedException {
    if (pipeIn == null)
      getReader();                                // spawn parsing thread
    while (true) {
      synchronized(this) {
        if (titleComplete || pipeInStream.full())
          break;
        wait(10);
      }
    }
    return metaTags;
  }

  public String getSummary() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();                                // spawn parsing thread
    while (true) {
      synchronized(this) {
        if (summary.length() >= SUMMARY_LENGTH || pipeInStream.full())
          break;
        wait(10);
      }
    }
    if (summary.length() > SUMMARY_LENGTH)
      summary.setLength(SUMMARY_LENGTH);

    String sum = summary.toString().trim();
    String tit = getTitle();
    if (sum.startsWith(tit) || sum.equals(""))
      return tit;
    else
      return sum;
  }

  public Reader getReader() throws IOException {
    if (pipeIn == null) {
      pipeInStream = new MyPipedInputStream();
      pipeOutStream = new PipedOutputStream(pipeInStream);
      pipeIn = new InputStreamReader(pipeInStream, "UTF-8");
      pipeOut = new OutputStreamWriter(pipeOutStream, "UTF-8");

      Thread thread = new ParserThread(this);
      thread.start();                             // start parsing
    }

    return pipeIn;
  }

  void addToSummary(String text) {
    if (summary.length() < SUMMARY_LENGTH) {
      summary.append(text);
      if (summary.length() >= SUMMARY_LENGTH) {
        synchronized(this) {
          notifyAll();
        }
      }
    }
  }

  void addText(String text) throws IOException {
    if (inStyle)
      return;
    if (inTitle)
      title.append(text);
    else {
      addToSummary(text);
      if (!titleComplete && !title.equals("")) {  // finished title
        synchronized(this) {
          titleComplete = true;                   // tell waiting threads
          notifyAll();
        }
      }
    }

    length += text.length();
    pipeOut.write(text);

    afterSpace = false;
  }

  void addMetaTag() {
      metaTags.setProperty(currentMetaTag, currentMetaContent);
      currentMetaTag = null;
      currentMetaContent = null;
      return;
  }

  void addSpace() throws IOException {
    if (!afterSpace) {
      if (inTitle)
        title.append(" ");
      else
        addToSummary(" ");

      String space = afterTag ? eol : " ";
      length += space.length();
      pipeOut.write(space);
      afterSpace = true;
    }
  }

//    void handleException(Exception e) {
//      System.out.println(e.toString());  // print the error message
//      System.out.println("Skipping...");
//      Token t;
//      do {
//        t = getNextToken();
//      } while (t.kind != TagEnd);
//    }

}

PARSER_END(HTMLParser)

void HTMLDocument() throws IOException :
{
  Token t;
}

{
//  try {
    ( Tag()         { afterTag = true; }
    | t=Decl()      { afterTag = true; }
    | CommentTag()  { afterTag = true; }
    | ScriptTag()  { afterTag = true; }
    | t=<Word>      { addText(t.image); afterTag = false; }
    | t=<Entity>    { addText(Entities.decode(t.image)); afterTag =
false; }
    | t=<Punct>     { addText(t.image); afterTag = false; }
    | <Space>       { addSpace(); afterTag = false; }
    )* <EOF>
//  } catch (ParseException e) {
//    handleException(e);
//  }

}

void Tag() throws IOException :
{
  Token t1, t2;
  boolean inImg = false;
}

{
  t1=<TagName> {
   String tagName = t1.image.toLowerCase();
   if(Tags.WS_ELEMS.contains(tagName) ) {
      addSpace();
    }
    inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in
<TITLE>
    inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in
<META>
    inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in
<STYLE>
    inImg = tagName.equalsIgnoreCase("<img");        // keep track if in
<IMG>
  }
  (t1=<ArgName>
   (<ArgEquals>
    (t2=ArgValue()                                // save ALT text in IMG tag
     {
       if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
         addText("[" + t2.image + "]");

        if(inMetaTag &&
                        (  t1.image.equalsIgnoreCase("name") ||
                           t1.image.equalsIgnoreCase("HTTP-EQUIV")
                        )
           && t2 != null)
        {
                currentMetaTag=t2.image.toLowerCase();
                if(currentMetaTag != null && currentMetaContent != null) {
                addMetaTag();
                }
        }
        if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 !=
null)
        {
                currentMetaContent=t2.image.toLowerCase();
                if(currentMetaTag != null && currentMetaContent != null) {
                addMetaTag();
                }
        }
     }
    )?
   )?
  )*
  <TagEnd>

}

Token ArgValue() :
{
  Token t = null;
}

{
  t=<ArgValue>                              { return t; }
| LOOKAHEAD(2)
  <ArgQuote1> <CloseQuote1>                 { return t; }
| <ArgQuote1> t=<Quote1Text> <CloseQuote1>  { return t; }
| LOOKAHEAD(2)
  <ArgQuote2> <CloseQuote2>                 { return t; }
| <ArgQuote2> t=<Quote2Text> <CloseQuote2>  { return t; }

}

Token Decl() :
{
  Token t;
}

{
  t=<DeclName> ( <ArgName> | ArgValue() | <ArgEquals> )* <TagEnd>
  { return t; }

}

void CommentTag() :
{}
{
  (<Comment1> ( <CommentText1> )* <CommentEnd1>)
 |
  (<Comment2> ( <CommentText2> )* <CommentEnd2>)

}

void ScriptTag() :
{}
{
  <ScriptStart> ( <ScriptText> )* <ScriptEnd>

}

TOKEN :
{
  < ScriptStart: "<script" > : WithinScript
| < TagName:  "<" ("/")? ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
| < DeclName: "<"  "!"   ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag

| < Comment1:  "<!--" > : WithinComment1
| < Comment2:  "<!" >   : WithinComment2

| < Word:     ( <LET> | <LET> (["+","/"])+ | <NUM> ["\""] |
                <LET> ["-","'"] <LET> | ("$")? <NUM> [",","."] <NUM> )+

| < #LET:     ["A"-"Z","a"-"z","0"-"9"] >
| < #NUM:     ["0"-"9"] >
| < #HEX:     ["0"-"9","A"-"F","a"-"f"] >

| < Entity:   ( "&" (["A"-"Z","a"-"z"])+ (";")? | "&" "#" (<NUM>)+
(";")? | "&" "#" ["X","x"] (<HEX>)+ (";")? ) >

| < Space:    (<SP>)+ >
| < #SP:      [" ","\t","\r","\n"] >

| < Punct:    ~[] > // Keep this last.  It is a catch-all.

}

<WithinScript> TOKEN:
{
  < ScriptText:  (~["<",">"])+ | "<" | ">" >
| < ScriptEnd: "</script" (~["<",">"])* ">" > : DEFAULT

}

<WithinTag> TOKEN:
{
  < ArgName:   (~[" ","\t","\r","\n","=",">","'","\""])
               (~[" ","\t","\r","\n","=",">"])* >
| < ArgEquals: "=" >  : AfterEquals
| < TagEnd:    ">" | "=>" >  : DEFAULT

}

<AfterEquals> TOKEN:
{
  < ArgValue:  (~[" ","\t","\r","\n","=",">","'","\""])
               (~[" ","\t","\r","\n",">"])* > : WithinTag

}

<WithinTag, AfterEquals> TOKEN:
{
  < ArgQuote1: "'"  > : WithinQuote1
| < ArgQuote2: "\"" > : WithinQuote2

}

<WithinTag, AfterEquals> SKIP:
{
  < <Space> >

}

<WithinQuote1> TOKEN:
{
  < Quote1Text:  (~["'"])+ >
| < CloseQuote1: <ArgQuote1> > : WithinTag

}

<WithinQuote2> TOKEN:
{
  < Quote2Text:  (~["\""])+ >
| < CloseQuote2: <ArgQuote2> > : WithinTag

}

<WithinComment1> TOKEN :
{
  < CommentText1:  (~["-"])+ | "-" >
| < CommentEnd1:   "-->" > : DEFAULT

}

<WithinComment2> TOKEN :
{
  < CommentText2:  (~[">"])+ >
| < CommentEnd2:   ">" > : DEFAULT

Please help!
Seasons greetings and happy hollydays,
Jan
-- 
Jan Francsi
virtualj@gmx.de
www.francsi.de


Der GMX SmartSurfer hilft bis zu 70% Ihrer Onlinekosten zu sparen! 
Ideal für Modem und ISDN: http://www.gmx.net/de/go/smartsurfer

---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org