You are viewing a plain text version of this content. The canonical link for it is here.
Posted to cvs@cocoon.apache.org by st...@apache.org on 2001/12/12 00:00:25 UTC

cvs commit: xml-cocoon2/src/org/apache/cocoon/components/lucene CocoonCrawler.java CocoonErrorHandler.java IndexHelperField.java LuceneCocoonHelper.java LuceneCocoonIndexer.java LuceneCocoonPager.java LuceneCocoonSearcher.java LuceneIndexContentHandler.java LuceneXMLIndexer.java SimpleCocoonCrawlerImpl.java SimpleLuceneCocoonIndexerImpl.java SimpleLuceneCocoonSearcherImpl.java SimpleLuceneXMLIndexerImpl.java

stefano     01/12/11 15:00:25

  Added:       src/org/apache/cocoon/components/lucene CocoonCrawler.java
                        CocoonErrorHandler.java IndexHelperField.java
                        LuceneCocoonHelper.java LuceneCocoonIndexer.java
                        LuceneCocoonPager.java LuceneCocoonSearcher.java
                        LuceneIndexContentHandler.java
                        LuceneXMLIndexer.java SimpleCocoonCrawlerImpl.java
                        SimpleLuceneCocoonIndexerImpl.java
                        SimpleLuceneCocoonSearcherImpl.java
                        SimpleLuceneXMLIndexerImpl.java
  Log:
  adding the search components
  
  Revision  Changes    Path
  1.1                  xml-cocoon2/src/org/apache/cocoon/components/lucene/CocoonCrawler.java
  
  Index: CocoonCrawler.java
  ===================================================================
  /*****************************************************************************
   * Copyright (C) The Apache Software Foundation. All rights reserved.        *
   * ------------------------------------------------------------------------- *
   * This software is published under the terms of the Apache Software License *
   * version 1.1, a copy of which has been included  with this distribution in *
   * the LICENSE file.                                                         *
   *****************************************************************************/
  package org.apache.cocoon.components.lucene;
  
  import org.apache.avalon.framework.component.Component;
  
  import java.util.*;
  import java.net.*;
  
  /**
   * A cocoon crawler component: it is started on one URL, and the crawled
   * URLs are made available through an iterator.
   */
  public interface CocoonCrawler extends Component {

    /**
     * Role string of this component, the fully qualified name of this
     * interface.
     */
    String ROLE = "org.apache.cocoon.components.lucene.CocoonCrawler";

    /**
     * Start crawling the URL.
     *
     * @param url the URL crawling starts from
     */
    void crawl( URL url );

    /**
     * Iterate over the crawled URLs.
     *
     * @return Iterator iterator over the crawled URLs
     */
    Iterator iterator();
  }
  
  
  
  
  1.1                  xml-cocoon2/src/org/apache/cocoon/components/lucene/CocoonErrorHandler.java
  
  Index: CocoonErrorHandler.java
  ===================================================================
  /*****************************************************************************
   * Copyright (C) The Apache Software Foundation. All rights reserved.        *
   * ------------------------------------------------------------------------- *
   * This software is published under the terms of the Apache Software License *
   * version 1.1, a copy of which has been included  with this distribution in *
   * the LICENSE file.                                                         *
   *****************************************************************************/
  package org.apache.cocoon.components.lucene;
  
  /**
   * Title:        lucene
   * Description:  Demo files using lucene indexer&searcher
   * Copyright:    Copyright (c) 2001
   * Company:
   * @author Bernhard Huber
   * @version 1.0
   */
  
  import org.xml.sax.SAXException;
  import org.xml.sax.ErrorHandler;
  import org.xml.sax.SAXParseException;
  
  /**
   * SAX error handler that writes one line per reported problem to
   * <code>System.err</code>; none of the callbacks throws, so the
   * decision whether to go on parsing is left to the parser.
   */
  public class CocoonErrorHandler  implements ErrorHandler
  {
    /**
     * Formats public id, system id, column and line of the exception.
     *
     * @param exception the reported parse problem
     * @return the location text, ending with a single blank
     */
    private String getExceptionInfo(SAXParseException exception) {
      return "pubId: " + exception.getPublicId()
        + ", "
        + "sysId: " + exception.getSystemId()
        + ", "
        + "col: " + exception.getColumnNumber()
        + ", "
        + "line: " + exception.getLineNumber()
        + " ";
    }

    /**
     * Prints a single report line to <code>System.err</code>.
     *
     * @param label severity label, including the trailing ": "
     * @param exception the reported parse problem
     */
    private void report(String label, SAXParseException exception) {
      String location = getExceptionInfo(exception);
      String message = exception.getMessage();
      System.err.println("CocoonErrorHandler : " + location + label + message);
    }

    /** Reports a recoverable error. */
    public void error(SAXParseException exception) {
      report("error: ", exception);
    }

    /** Reports a non-recoverable error. */
    public void fatalError(SAXParseException exception) {
      report("fatalError: ", exception);
    }

    /** Reports a warning. */
    public void warning(SAXParseException exception) {
      report("warning: ", exception);
    }
  }
  
  
  
  1.1                  xml-cocoon2/src/org/apache/cocoon/components/lucene/IndexHelperField.java
  
  Index: IndexHelperField.java
  ===================================================================
  /*****************************************************************************
   * Copyright (C) The Apache Software Foundation. All rights reserved.        *
   * ------------------------------------------------------------------------- *
   * This software is published under the terms of the Apache Software License *
   * version 1.1, a copy of which has been included  with this distribution in *
   * the LICENSE file.                                                         *
   *****************************************************************************/
  package org.apache.cocoon.components.lucene;
  
  /**
   * Title:        lucene
   * Description:  Demo files using lucene indexer&searcher
   * Copyright:    Copyright (c) 2001
   * Company:
   * @author Bernhard Huber
   * @version 1.0
   */
  
  import org.xml.sax.ContentHandler;
  import org.xml.sax.ErrorHandler;
  import org.xml.sax.Locator;
  import org.xml.sax.InputSource;
  import org.xml.sax.SAXException;
  import org.xml.sax.SAXParseException;
  import org.xml.sax.XMLReader;
  import org.xml.sax.Attributes;
  
  import org.apache.lucene.document.Document;
  import org.apache.lucene.document.Field;
  import org.apache.lucene.document.DateField;
  
  /**
   * Holder for one XML element while it is being processed: its local
   * and qualified name, its attributes, and the text collected for it.
   */
  class IndexHelperField {
    String localFieldName;
    String qualifiedFieldName;
    StringBuffer text;
    Attributes attributes;

    /**
     * Creates a holder with an empty text buffer.
     *
     * @param lfn local field name
     * @param qfn qualified field name
     * @param atts attributes; the reference is stored as given
     */
    IndexHelperField(String lfn, String qfn, Attributes atts) {
      text = new StringBuffer();
      attributes = atts;
      qualifiedFieldName = qfn;
      localFieldName = lfn;
    }

    /** @return the local field name passed to the constructor */
    public String getLocalFieldName() {
      return this.localFieldName;
    }

    /** @return the qualified field name passed to the constructor */
    public String getQualifiedFieldName() {
      return this.qualifiedFieldName;
    }

    /** @return the attributes passed to the constructor */
    public Attributes getAttributes() {
      return this.attributes;
    }

    /** @return all text appended so far, as one string */
    public String getText() {
      StringBuffer collected = this.text;
      return collected.toString();
    }

    /**
     * Appends a string to the collected text.
     *
     * @param text the text to append
     */
    public void appendText(String text) {
      StringBuffer collected = this.text;
      collected.append(text);
    }

    /**
     * Appends a slice of a character array to the collected text.
     *
     * @param str the characters
     * @param offset index of the first character to append
     * @param length number of characters to append
     */
    public void appendText(char[] str, int offset, int length) {
      StringBuffer collected = this.text;
      collected.append(str, offset, length);
    }
  }
  
  
  
  1.1                  xml-cocoon2/src/org/apache/cocoon/components/lucene/LuceneCocoonHelper.java
  
  Index: LuceneCocoonHelper.java
  ===================================================================
  /*****************************************************************************
   * Copyright (C) The Apache Software Foundation. All rights reserved.        *
   * ------------------------------------------------------------------------- *
   * This software is published under the terms of the Apache Software License *
   * version 1.1, a copy of which has been included  with this distribution in *
   * the LICENSE file.                                                         *
   *****************************************************************************/
  package org.apache.cocoon.components.lucene;
  
  import org.apache.lucene.store.*;
  import org.apache.lucene.index.*;
  import org.apache.lucene.analysis.Analyzer;
  
  import java.io.File;
  import java.io.IOException;
  
  /**
   * Static helper methods for obtaining the lucene objects used by the
   * indexing and searching components.
   */
  public class LuceneCocoonHelper {

    /**
     * Obtains the file system directory of an index by delegating to
     * <code>FSDirectory.getDirectory()</code>.
     *
     * @param directory file system location of the index
     * @param create passed through to <code>FSDirectory.getDirectory()</code>
     * @return the lucene directory
     * @throws IOException as thrown by <code>FSDirectory.getDirectory()</code>
     */
    public static Directory getDirectory( File directory, boolean create ) throws IOException {
      return FSDirectory.getDirectory( directory, create );
    }

    /**
     * Instantiates an analyzer by class name, using its no-argument
     * constructor.
     *
     * @param analyzer_class_name name of the analyzer class
     * @return the analyzer, or <code>null</code> if the class cannot be
     *   loaded, instantiated, or is not an <code>Analyzer</code>
     */
    public static Analyzer getAnalyzer( String analyzer_class_name ) {
      try {
        return (Analyzer) Class.forName( analyzer_class_name ).newInstance();
      } catch (Exception e) {
        // every failure is reported to the caller as a null analyzer
        return null;
      }
    }

    /**
     * Opens an index reader on a directory.
     *
     * @param directory the index to read
     * @return the reader returned by <code>IndexReader.open()</code>
     * @throws IOException as thrown by <code>IndexReader.open()</code>
     */
    public static IndexReader getIndexReader( Directory directory ) throws IOException {
      return IndexReader.open( directory );
    }

    /**
     * Creates an index writer.
     *
     * @param index the index to write
     * @param analyzer the analyzer handed to the writer
     * @param create passed through to the <code>IndexWriter</code> constructor
     * @return a new index writer
     * @throws IOException as thrown by the <code>IndexWriter</code> constructor
     */
    public static IndexWriter getIndexWriter( Directory index, Analyzer analyzer, boolean create ) throws IOException {
      return new IndexWriter( index, analyzer, create );
    }

  }
  
  
  
  
  1.1                  xml-cocoon2/src/org/apache/cocoon/components/lucene/LuceneCocoonIndexer.java
  
  Index: LuceneCocoonIndexer.java
  ===================================================================
  /*****************************************************************************
   * Copyright (C) The Apache Software Foundation. All rights reserved.        *
   * ------------------------------------------------------------------------- *
   * This software is published under the terms of the Apache Software License *
   * version 1.1, a copy of which has been included  with this distribution in *
   * the LICENSE file.                                                         *
   *****************************************************************************/
  package org.apache.cocoon.components.lucene;
  
  import org.apache.avalon.framework.component.Component;
  import org.apache.cocoon.ProcessingException;
  
  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.store.Directory;
  
  import java.net.URL;
  
  /**
   * Component that indexes into a lucene directory, starting from a base
   * URL.
   */
  public interface LuceneCocoonIndexer extends Component {

    /**
     * Role string of this component, the fully qualified name of this
     * interface.
     */
    String ROLE = "org.apache.cocoon.components.lucene.LuceneCocoonIndexer";

    /**
     * Sets the analyzer to use.
     *
     * @param analyzer the lucene analyzer
     */
    void setAnalyzer( Analyzer analyzer );

    /**
     * Runs the indexing.
     *
     * @param index the lucene directory of the index
     * @param create NOTE(review): presumably selects creating a new index
     *   over updating an existing one - confirm against the implementation
     * @param base_url the URL indexing starts from
     * @throws ProcessingException if indexing fails
     */
    void index( Directory index, boolean create, URL base_url )
      throws ProcessingException;
  }
  
  
  
  
  1.1                  xml-cocoon2/src/org/apache/cocoon/components/lucene/LuceneCocoonPager.java
  
  Index: LuceneCocoonPager.java
  ===================================================================
  /*****************************************************************************
   * Copyright (C) The Apache Software Foundation. All rights reserved.        *
   * ------------------------------------------------------------------------- *
   * This software is published under the terms of the Apache Software License *
   * version 1.1, a copy of which has been included  with this distribution in *
   * the LICENSE file.                                                         *
   *****************************************************************************/
  package org.apache.cocoon.components.lucene;
  
  import org.apache.lucene.store.*;
  import org.apache.lucene.index.*;
  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.search.Hits;
  import org.apache.lucene.document.Document;
  
  import java.io.File;
  import java.io.IOException;
  
  import java.util.*;
  
  // implementation of ListIterator
  /**
   * This class should help you to manage paging of hits.
   *
   * <p>
   *  Although it implements <code>ListIterator</code>, one "element" of
   *  the iteration is a whole page: <code>next()</code> and
   *  <code>previous()</code> return a <code>List</code> of
   *  <code>HitWrapper</code> objects holding at most
   *  <code>getCountOfHitsPerPage()</code> entries, and the index methods
   *  return hit indexes, not page numbers.
   * </p>
   * <p>
   *  <code>previous()</code> returns the page in front of the page that
   *  was returned last, so calling <code>next()</code> afterwards
   *  returns that last page again.
   * </p>
   * <p>
   *  A pager without hits (no-argument constructor) behaves like a pager
   *  over an empty result. The count of hits per page is expected to be
   *  positive.
   * </p>
   */
  public class LuceneCocoonPager implements ListIterator {

    public final static int COUNT_OF_HITS_PER_PAGE_DEFAULT = 5;
    public final static int HITS_INDEX_START_DEFAULT = 0;

    /**
     * hits to iterate upon, may be null as long as setHits() was not called
     */
    private Hits hits;

    /**
     * index of the first hit of the page returned by the following next()
     */
    int hitsIndex = HITS_INDEX_START_DEFAULT;

    /**
     * maximum count of hits to return by next(), and previous()
     */
    int countOfHitsPerPage = COUNT_OF_HITS_PER_PAGE_DEFAULT;

    /**
     * index of the first hit of the page returned last, or -1 if unknown
     */
    private int lastPageStart = -1;

    /**
     * index following the last hit of the page returned last, or -1 if
     * unknown; the remembered page is only trusted while this value still
     * equals hitsIndex
     */
    private int lastPageEnd = -1;

    /**
     * Creates a pager positioned at the first hit.
     *
     * @param hits the hits to page through
     */
    public LuceneCocoonPager( Hits hits ) {
      setHits( hits );
    }

    /**
     * Creates a pager without hits; use setHits() to supply them.
     */
    public LuceneCocoonPager() {
    }

    /**
     * Set the hits to page through, and go back to the first hit.
     */
    public void setHits( Hits hits ) {
      this.hits = hits;
      this.hitsIndex = HITS_INDEX_START_DEFAULT;
      forgetLastPage();
    }

    /**
     * Set count of hits displayed per single page
     *
     * @param countOfHitsPerPage count of hits per page, expected to be
     *   positive
     */
    public void setCountOfHitsPerPage( int countOfHitsPerPage ) {
      this.countOfHitsPerPage = countOfHitsPerPage;
    }

    /**
     * Get count of hits displayed per single page
     */
    public int getCountOfHitsPerPage() {
      return this.countOfHitsPerPage;
    }

    /**
     * Calculate count of pages for displaying all hits
     *
     * @return count of pages, a partially filled last page counts as a
     *   page
     * @throws ArithmeticException if the count of hits per page is 0
     */
    public int getCountOfPages() {
      int total = hitCount();
      int count_of_pages = total / this.countOfHitsPerPage;
      if (total % this.countOfHitsPerPage != 0) {
        count_of_pages += 1;
      }
      return count_of_pages;
    }

    /**
     * Get starting index for retrieving hits
     */
    public int getStartIndex() {
      return this.hitsIndex;
    }

    /**
     * Set starting index for retrieving hits
     *
     * @param start_index index of the first hit of the page returned by
     *   the following next()
     */
    public void setStartIndex( int start_index ) {
      this.hitsIndex = start_index;
      // the position was chosen by the caller, no page was returned for it
      forgetLastPage();
    }

    /**
     * Inserts the specified element into the list (optional operation),
     * not supported.
     */
    public void add(Object o) throws UnsupportedOperationException {
      throw new UnsupportedOperationException();
    }

    /**
     * Returns true if there is at least one more hit for a following page.
     */
    public boolean hasNext() {
      return hitsIndex < hitCount();
    }

    /**
     * Returns true if there is a page in front of the page returned last.
     */
    public boolean hasPrevious() {
      return previousPageEnd() > 0;
    }

    /**
     * Returns the next page of hits.
     *
     * @return a List of HitWrapper objects
     * @throws NoSuchElementException if there are no more hits, or if a
     *   hit cannot be read; in the latter case the IOException is the cause
     */
    public Object next() {
      int startIndex = hitsIndex;
      int endIndex = Math.min( hitCount(), hitsIndex + countOfHitsPerPage );
      if (hitsIndex >= endIndex) {
        throw new NoSuchElementException();
      }

      ArrayList hitsPerPageList = new ArrayList();
      while (hitsIndex < endIndex) {
        hitsPerPageList.add( wrapHit( hitsIndex ) );
        hitsIndex++;
      }
      rememberLastPage( startIndex, endIndex );
      return hitsPerPageList;
    }

    /**
     * Returns the index of the first hit that would be returned by a
     * subsequent call to next.
     */
    public int nextIndex() {
      return Math.min( hitsIndex, hitCount() );
    }

    /**
     * Returns the page in front of the page returned last, and positions
     * the pager at the end of the returned page.
     *
     * @return a List of HitWrapper objects
     * @throws NoSuchElementException if there is no such page, or if a
     *   hit cannot be read; in the latter case the IOException is the cause
     */
    public Object previous() {
      int endIndex = previousPageEnd();
      int startIndex = Math.max( 0, endIndex - countOfHitsPerPage );
      if (startIndex >= endIndex) {
        throw new NoSuchElementException();
      }

      ArrayList hitsPerPageList = new ArrayList();
      for (int i = startIndex; i < endIndex; i++) {
        hitsPerPageList.add( wrapHit( i ) );
      }
      // move only after all hits of the page have been read
      hitsIndex = endIndex;
      rememberLastPage( startIndex, endIndex );
      return hitsPerPageList;
    }

    /**
     * Returns the index of the first hit that would be returned by a
     * subsequent call to previous, 0 if there is no such page.
     */
    public int previousIndex() {
      return Math.max( 0, previousPageEnd() - countOfHitsPerPage );
    }

    /**
     * Removes from the list the last element that was returned by next or
     * previous (optional operation), not supported.
     */
    public void remove() {
      throw new UnsupportedOperationException();
    }

    /**
     * Replaces the last element returned by next or previous with the
     * specified element (optional operation), not supported.
     */
    public void set(Object o) {
      throw new UnsupportedOperationException();
    }

    /**
     * Count of available hits, 0 as long as no hits have been set.
     */
    private int hitCount() {
      return hits == null ? 0 : hits.length();
    }

    /**
     * Index of the first hit of the page returned last. A partially
     * filled page is shorter than countOfHitsPerPage, hence its start is
     * remembered; without a remembered page a full page is assumed.
     */
    private int currentPageStart() {
      if (lastPageStart >= 0 && lastPageEnd == hitsIndex) {
        return lastPageStart;
      }
      return hitsIndex - countOfHitsPerPage;
    }

    /**
     * Index following the last hit of the page previous() would return;
     * a value not greater than 0 means there is no such page.
     */
    private int previousPageEnd() {
      return Math.min( hitCount(), currentPageStart() );
    }

    /**
     * Remember the range of the page handed out last.
     */
    private void rememberLastPage( int startIndex, int endIndex ) {
      lastPageStart = startIndex;
      lastPageEnd = endIndex;
    }

    /**
     * Forget the range of the page handed out last.
     */
    private void forgetLastPage() {
      lastPageStart = -1;
      lastPageEnd = -1;
    }

    /**
     * Wrap score, and document of a single hit.
     *
     * @throws NoSuchElementException if the hit cannot be read
     */
    private HitWrapper wrapHit( int index ) {
      try {
        return new HitWrapper( hits.score( index ), hits.doc( index ) );
      } catch (IOException ioe) {
        NoSuchElementException nsee =
          new NoSuchElementException( "no more hits: " + ioe.getMessage() );
        nsee.initCause( ioe );
        throw nsee;
      }
    }

    /**
     * A helper class encapsulating found document, and its score
     */
    public static class HitWrapper {
      float score;
      Document document;

      public HitWrapper( float score, Document document ) {
        this.document = document;
        this.score = score;
      }

      /** @return the found document */
      public Document getDocument() {
        return document;
      }

      /** @return the score of the found document */
      public float getScore() {
        return score;
      }

      /**
       * @param field name of a field of the found document
       * @return the value returned by the document for this field
       */
      public String getField(String field) {
        return document.get(field);
      }
    }

  }
  
  
  
  
  1.1                  xml-cocoon2/src/org/apache/cocoon/components/lucene/LuceneCocoonSearcher.java
  
  Index: LuceneCocoonSearcher.java
  ===================================================================
  /*****************************************************************************
   * Copyright (C) The Apache Software Foundation. All rights reserved.        *
   * ------------------------------------------------------------------------- *
   * This software is published under the terms of the Apache Software License *
   * version 1.1, a copy of which has been included  with this distribution in *
   * the LICENSE file.                                                         *
   *****************************************************************************/
  package org.apache.cocoon.components.lucene;
  
  import org.apache.avalon.framework.component.Component;
  import org.apache.cocoon.ProcessingException;
  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.store.Directory;
  import org.apache.lucene.search.Hits;
  
  /**
   * Component that searches a lucene index.
   */
  public interface LuceneCocoonSearcher extends Component {

    /**
     * Role string of this component, the fully qualified name of this
     * interface.
     */
    String ROLE = "org.apache.cocoon.components.lucene.LuceneCocoonSearcher";

    /**
     * Sets the analyzer to use.
     *
     * @param analyzer the lucene analyzer
     */
    void setAnalyzer( Analyzer analyzer );

    /**
     * Sets the index to search.
     *
     * @param directory the lucene directory of the index
     */
    void setDirectory( Directory directory );

    /**
     * Searches the index.
     *
     * @param query_string the query
     * @param default_field NOTE(review): presumably the field searched by
     *   query terms naming no field - confirm against the implementation
     * @return the hits of the search
     * @throws ProcessingException if searching fails
     */
    Hits search( String query_string, String default_field ) throws ProcessingException;
  }
  
  
  
  
  1.1                  xml-cocoon2/src/org/apache/cocoon/components/lucene/LuceneIndexContentHandler.java
  
  Index: LuceneIndexContentHandler.java
  ===================================================================
  /*****************************************************************************
   * Copyright (C) The Apache Software Foundation. All rights reserved.        *
   * ------------------------------------------------------------------------- *
   * This software is published under the terms of the Apache Software License *
   * version 1.1, a copy of which has been included  with this distribution in *
   * the LICENSE file.                                                         *
   *****************************************************************************/
  package org.apache.cocoon.components.lucene;
  
  /**
   * Title:        lucene
   * Description:  Demo files using lucene indexer&searcher
   * Copyright:    Copyright (c) 2001
   * Company:
   * @author Bernhard Huber
   * @version 1.0
   */
  import org.xml.sax.ContentHandler;
  import org.xml.sax.Locator;
  import org.xml.sax.InputSource;
  import org.xml.sax.SAXException;
  import org.xml.sax.Attributes;
  import org.xml.sax.helpers.AttributesImpl;
  
  import org.apache.lucene.document.Document;
  import org.apache.lucene.document.Field;
  import org.apache.lucene.document.DateField;
  
  import java.util.Stack;
  import java.util.Iterator;
  import java.util.List;
  import java.util.ArrayList;
  
  /**
   * Parse XML and generate lucene document(s)
   *
   * <p>
   *  The first document of <code>allDocuments()</code> is the body
   *  document: <code>endDocument()</code> adds to it the field
   *  <code>LuceneXMLIndexer.BODY_FIELD</code> holding the character data
   *  of the whole XML document, with a blank between the text of
   *  different elements.
   * </p>
   * <p>
   *  Each element having character data of its own, or attributes,
   *  yields one more document: the text is stored in a field named by
   *  the qualified element name, every attribute in a field named
   *  <code>element-qname@attribute-qname</code>.
   * </p>
   * <p>
   *  Several callbacks print a trace line to <code>System.out</code>.
   * </p>
   */
  public class LuceneIndexContentHandler implements ContentHandler {
    private List documents;
    StringBuffer bodyText;
    private Document bodyDocument;

    /** IndexHelperField objects of the currently open elements */
    private Stack elementStack;
    /** count of currently open elements, used for indenting trace lines */
    private int indentPos = 0;

    public LuceneIndexContentHandler() {
      this.bodyText = new StringBuffer();
      this.bodyDocument = new Document();
      this.documents = new ArrayList();
      this.documents.add( this.bodyDocument );
      elementStack = new Stack();
    }

    /**
     * @return List the generated lucene documents, the body document first
     */
    public List allDocuments() {
      return documents;
    }

    /**
     * @return Iterator iterator over the generated lucene documents
     */
    public Iterator iterator() {
      return documents.iterator();
    }

    /**
     * Prefix of a trace line: one blank per open element, limited to the
     * length of LINE_PREFIX, and never less than the empty string.
     */
    private String indent() {
      final String LINE_PREFIX = "                ";
      int width = Math.max( 0, Math.min( indentPos, LINE_PREFIX.length() ) );
      return LINE_PREFIX.substring( 0, width );
    }

    private void incIndent() {
      indentPos++;
    }

    private void decIndent() {
      indentPos--;
    }

    /**
     * Put a blank between the body text of different elements, otherwise
     * the last word of one element and the first word of the following
     * element end up as a single word.
     */
    private void separateBodyText() {
      int length = bodyText.length();
      if (length > 0 && !Character.isWhitespace( bodyText.charAt( length - 1 ) )) {
        bodyText.append( ' ' );
      }
    }

    /**
     * Appends the character data to the text of the innermost open
     * element, and to the body text. A chunk may be as short as a single
     * character. Character data outside of any element is ignored.
     */
    public void characters(char[] ch, int start, int length) {
      if (length <= 0 || start < 0 || elementStack.isEmpty()) {
        return;
      }
      IndexHelperField tos = (IndexHelperField) elementStack.peek();
      tos.appendText( ch, start, length );
      bodyText.append( ch, start, length );
    }

    /**
     * Adds the collected body text as field to the body document.
     */
    public void endDocument() {
      bodyDocument.add( Field.UnStored( LuceneXMLIndexer.BODY_FIELD, bodyText.toString()) );
    }

    /**
     * Closes the innermost open element, and generates a lucene document
     * for it if it has text or attributes.
     */
    public void endElement(String namespaceURI, String localName, String qName) {
      //System.out.println(indent() + "ee: " + "localName " + localName + " " + "qName " + qName);
      IndexHelperField tos = (IndexHelperField) elementStack.pop();
      String text = tos.getText();
      // the name recorded by startElement() names all fields of this element
      String qname = tos.getQualifiedFieldName();

      Document document = new Document();
      boolean add_document = false;
      if (text != null && text.length() > 0) {
        System.out.println( "field qname " + qname );
        document.add( Field.UnStored( qname, text ) );
        add_document = true;
      }

      Attributes atts = tos.getAttributes();
      if (atts != null && atts.getLength() > 0) {
        for (int i = 0; i < atts.getLength(); i++ ) {
          String atts_qname = atts.getQName(i);
          String atts_value = atts.getValue(i);
          System.out.println("attribute field " + qname + "@" + atts_qname + ": " + atts_value );
          document.add( Field.UnStored( qname + "@" + atts_qname, atts_value ) );
          add_document = true;
        }
      }
      if (add_document) {
        documents.add( document );
      }

      separateBodyText();
      decIndent();
    }

    /** Prints a trace line only. */
    public void endPrefixMapping(String prefix) {
      System.out.println(indent() + "endPrefixMapping " + prefix );
    }

    /** Prints a trace line only, ignorable whitespace is not indexed. */
    public void ignorableWhitespace(char[] ch, int start, int length) {
      System.out.println(indent() + "ignorableWhitspace " );
    }

    /** Prints a trace line only. */
    public void processingInstruction(String target, String data) {
      System.out.println(indent() + "processingInstruction " + target + " " + data );
    }

    /** Prints a trace line only, the locator is not kept. */
    public void setDocumentLocator(Locator locator) {
      System.out.println(indent() + "startDocuementLocator " + locator );
    }

    /** Prints a trace line only. */
    public void skippedEntity(String name) {
      System.out.println(indent() + "skippedEntity " + name );
    }

    /** Prints a trace line only. */
    public void startDocument() {
      System.out.println(indent() + "startDocument" );
    }

    /**
     * Opens an element: its names, and a copy of its attributes are kept
     * until the matching endElement().
     */
    public void startElement(String namespaceURI, String localName, String qName, Attributes atts) {
      incIndent();
      //System.out.println(indent() + "se: " + "localName " + localName + " " + "qName " + qName);
      separateBodyText();
      // copy the attributes, the object passed in is only handed to this callback
      IndexHelperField ihf = new IndexHelperField(localName, qName, new AttributesImpl(atts) );
      elementStack.push(ihf);
    }

    /** Prints a trace line only. */
    public void startPrefixMapping(String prefix, String uri) {
      System.out.println(indent() + "startPrefixMapping: " + prefix + " " + uri );
    }
  }
  
  
  
  1.1                  xml-cocoon2/src/org/apache/cocoon/components/lucene/LuceneXMLIndexer.java
  
  Index: LuceneXMLIndexer.java
  ===================================================================
  /*****************************************************************************
   * Copyright (C) The Apache Software Foundation. All rights reserved.        *
   * ------------------------------------------------------------------------- *
   * This software is published under the terms of the Apache Software License *
   * version 1.1, a copy of which has been included  with this distribution in *
   * the LICENSE file.                                                         *
   *****************************************************************************/
  package org.apache.cocoon.components.lucene;
  
  import org.apache.avalon.framework.component.Component;
  import org.apache.cocoon.ProcessingException;
  import java.util.List;
  import java.util.Iterator;
  import java.net.URL;
  
  /**
   * This interface specifies generating lucene documents from an
   * xml content.
   *
   * <p>
   *  The well-known fields of a lucene document are defined as
   *  <code>*_FIELD</code> constants.
   * </p>
   * <p>
   *  Generating the lucene documents is triggered by
   *  <code>build()</code>; afterwards the generated documents are
   *  available via <code>allDocuments()</code>, or
   *  <code>iterator()</code>.
   * </p>
   *
   */
  public interface LuceneXMLIndexer extends Component {

    /**
     * Role string of this component, the fully qualified name of this
     * interface.
     */
    String ROLE = "org.apache.cocoon.components.lucene.LuceneXMLIndexer";

    /** Field of document's body, ie <tt>body</tt> (mandatory).
    */
    String BODY_FIELD = "body";

    /** Field name <tt>url</tt>.
    */
    String URL_FIELD = "url";

    /** Field name <tt>uid</tt>.
    */
    String UID_FIELD = "uid";

    /**
     * Return a list of all lucene documents generated by
     * <code>build()</code>.
     *
     * @return List list of lucene Documents
     */
    List allDocuments();

    /**
     * Return an iterator of all lucene documents generated by
     * <code>build()</code>.
     *
     * @return Iterator iterator of lucene Documents
     */
    Iterator iterator();

    /**
     * Build lucene documents from a URL.
     *
     * @param url the content of this url gets indexed.
     * @throws ProcessingException if building the documents fails
     */
    void build(URL url) throws ProcessingException;
  }
  
  
  
  
  1.1                  xml-cocoon2/src/org/apache/cocoon/components/lucene/SimpleCocoonCrawlerImpl.java
  
  Index: SimpleCocoonCrawlerImpl.java
  ===================================================================
  /*****************************************************************************
   * Copyright (C) The Apache Software Foundation. All rights reserved.        *
   * ------------------------------------------------------------------------- *
   * This software is published under the terms of the Apache Software License *
   * version 1.1, a copy of which has been included  with this distribution in *
   * the LICENSE file.                                                         *
   *****************************************************************************/
  package org.apache.cocoon.components.lucene;
  
  import org.apache.avalon.excalibur.pool.Recyclable;
  import org.apache.avalon.framework.activity.Disposable;
  
  import org.apache.avalon.framework.configuration.Configurable;
  import org.apache.avalon.framework.configuration.Configuration;
  import org.apache.avalon.framework.configuration.ConfigurationException;
  import org.apache.avalon.framework.logger.AbstractLoggable;
  import org.apache.avalon.framework.parameters.Parameters;
  import org.apache.avalon.framework.thread.ThreadSafe;
  
  import org.apache.cocoon.Constants;
  import org.apache.cocoon.util.Tokenizer;
  
  import org.apache.log.Logger;
  
  import org.apache.regexp.RE;
  import org.apache.regexp.RESyntaxException;
  
  import java.io.*;
  import java.util.*;
  import java.net.*;
  
  /**
   * A simple cocoon crawler.
   */
  public class SimpleCocoonCrawlerImpl extends AbstractLoggable 
    implements CocoonCrawler, Configurable, Disposable, Recyclable {
  
    /**
     * Append this query, for querying the link view of an URL.
     * The value is read from the configuration element
     * <code>link-view-query</code>, and defaults to
     * <code>?cocoon-view=links</code>.
     */
    private final static String LINK_VIEW_QUERY_CONFIG = "link-view-query";
    private final static String LINK_VIEW_QUERY_DEFAULT = "?cocoon-view=links";
    private String linkViewQuery = LINK_VIEW_QUERY_DEFAULT;
  
    /**
     * Expected content-type of a link view response.
     * The value is read from the configuration element
     * <code>link-content-type</code>.
     */
    public final static String LINK_CONTENT_TYPE_CONFIG = "link-content-type";
    // NOTE(review): unlike the other *_DEFAULT constants this one is an
    // instance field, it is not declared static - confirm whether intended
    public final String LINK_CONTENT_TYPE_DEFAULT = "application/x-cocoon-links";
    private String linkContentType = LINK_CONTENT_TYPE_DEFAULT;
  
    // name of the configuration element(s) holding exclude patterns
    private final static String EXCLUDE_CONFIG = "exclude";
    // set of RE objects; created by the constructor, set to null by dispose()
    private HashSet excludeCrawlingURL;
  
    // name of the configuration element(s) holding include patterns
    private final static String INCLUDE_CONFIG = "include";
    // set of RE objects; the constructor leaves it null
    private HashSet includeCrawlingURL;
  
    // configuration element user-agent, defaults to Constants.COMPLETE_NAME
    private final static String USER_AGENT_CONFIG = "user-agent";
    private final static String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME;
    private String userAgent = USER_AGENT_DEFAULT;
    
    // configuration element accept
    // NOTE(review): the default is the same value as USER_AGENT_DEFAULT -
    // confirm that this is the intended default
    private final static String ACCEPT_CONFIG = "accept";
    private final static String ACCEPT_DEFAULT = USER_AGENT_DEFAULT;
    private String accept = ACCEPT_DEFAULT;
    
    // state of a crawl: created by crawl(), set to null by recycle(), and dispose()
    private HashSet crawled;
    private HashSet urlsToProcess;
  
    /**
     * configure component
     * allow:
     * <pre><tt>
     * &lt;include&gt;.x&lt;/include&gt; or &lt;include&gt;.xxx, .yyy&lt;/include&gt;
     * &lt;exclude&gt;.x&lt;/exclude&gt; or &lt;exclude&gt;.xxx, .yyy&lt;/exclude&gt;
     * &lt;link-content-type&gt;dfd&lt;/link-content-type&gt;
     * &lt;link-view-query&gt;dfsd&lt;/link-view-query&gt;
     * &lt;user-agent&gt;dfd&lt;/user-agent&gt;
     * &lt;accept&gt;dfd&lt;/accept&gt;
     * </tt></pre>
     *
     * <p>
     *  The value of an include, or exclude element is split at commas,
     *  and blanks; each part is compiled as regular expression. A value
     *  containing a part that cannot be compiled is logged as error, the
     *  parts following the failing one are skipped.
     *  The remaining elements are optional, an empty value keeps the
     *  current setting.
     * </p>
     *
     * @param configuration the configuration of this component
     * @throws ConfigurationException if reading a value fails
     */
    public void configure( Configuration configuration ) 
      throws ConfigurationException {

      this.includeCrawlingURL = addCrawlingPatterns(
        configuration.getChildren( INCLUDE_CONFIG ),
        this.includeCrawlingURL,
        "Cannot create includeing regular-expression for " );

      this.excludeCrawlingURL = addCrawlingPatterns(
        configuration.getChildren( EXCLUDE_CONFIG ),
        this.excludeCrawlingURL,
        "Cannot create excluding regular-expression for " );

      this.linkContentType = configuredValueOrCurrent(
        configuration, LINK_CONTENT_TYPE_CONFIG, this.linkContentType );
      this.linkViewQuery = configuredValueOrCurrent(
        configuration, LINK_VIEW_QUERY_CONFIG, this.linkViewQuery );
      this.userAgent = configuredValueOrCurrent(
        configuration, USER_AGENT_CONFIG, this.userAgent );
      this.accept = configuredValueOrCurrent(
        configuration, ACCEPT_CONFIG, this.accept );
    }

    /**
     * Compile the patterns of the given configuration elements, and add
     * them to a pattern set.
     *
     * @param children the include, or exclude elements, may be null
     * @param patterns the set to add to, may be null
     * @param errorPrefix start of the message logged for a value that
     *   cannot be compiled
     * @return the pattern set; if <code>patterns</code> is null the set is
     *   created as soon as the first pattern is added, otherwise null is
     *   returned
     */
    private HashSet addCrawlingPatterns( Configuration []children, HashSet patterns, String errorPrefix )
      throws ConfigurationException {

      if (children == null) {
        return patterns;
      }
      for (int i = 0; i < children.length; i++) {
        String pattern = children[i].getValue();
        try {
          Tokenizer t = new Tokenizer( pattern, ", " );
          while (t.hasMoreTokens()) {
            RE tokenized_re = new RE( t.nextToken() );
            // the include set is null until the first include pattern is
            // configured, hence create the set on demand
            if (patterns == null) {
              patterns = new HashSet();
            }
            patterns.add( tokenized_re );
          }
        } catch (RESyntaxException rese) {
          getLogger().error( errorPrefix + pattern, rese );
        }
      }
      return patterns;
    }

    /**
     * Read the value of an optional configuration element.
     *
     * @param configuration the configuration of this component
     * @param name name of the optional child element
     * @param current the current setting
     * @return the value of the child element, or <code>current</code> if
     *   the child element is missing, or has no value
     */
    private String configuredValueOrCurrent( Configuration configuration, String name, String current )
      throws ConfigurationException {

      Configuration child = configuration.getChild( name, false );
      if (child != null) {
        String value = child.getValue();
        if (value != null && value.length() > 0) {
          return value;
        }
      }
      return current;
    }
    
    /**
     * End of life cycle: drop the references to the pattern sets and to
     * the crawling state.
     */
    public void dispose() {
      // pattern sets
      this.includeCrawlingURL = null;
      this.excludeCrawlingURL = null;
      // crawling state
      this.urlsToProcess = null;
      this.crawled = null;
    }
  
    /**
     * Recycle this object: drop the crawling state, but keep the
     * include and exclude patterns.
     */
    public void recycle() {
      this.urlsToProcess = null;
      this.crawled = null;
    }
    
    /**
     * Create a crawler: the exclude pattern set is created and filled with
     * the default image patterns, the include pattern set is left unset.
     */
    public SimpleCocoonCrawlerImpl() {
      this.excludeCrawlingURL = new HashSet();
      this.includeCrawlingURL = null;
      setImageExcludeFromCrawling();
    }
  
    /**
     * Add the default exclude patterns, matching URLs ending in
     * <code>.gif</code>, <code>.png</code>, <code>.jpg</code> or
     * <code>.jpeg</code>, to the exclude pattern set.
     */
    private void setImageExcludeFromCrawling() {
      // Every pattern starts with ".*" (any prefix). A leading "*" has no
      // operand to repeat and is not a valid regular expression, so the
      // defaults would never be registered.
      String []EXCLUDE_FROM_CRAWLING_DEFAULT = {
        ".*\\.gif$",
        ".*\\.png$",
        ".*\\.jpe?g$"
      };
      
      for (int i = 0; i < EXCLUDE_FROM_CRAWLING_DEFAULT.length; i++ ) {
        String pattern = EXCLUDE_FROM_CRAWLING_DEFAULT[i];
        try {
          excludeCrawlingURL.add( new RE( pattern ) );
        } catch (RESyntaxException rese) {
          // NOTE(review): this method runs from the constructor; confirm a
          // logger is already available there before relying on this message.
          getLogger().error( "Cannot create excluding regular-expression for " + pattern, rese );
        }
      }
    }
  
    /**
     * Start crawling: reset the set of crawled URLs and queue the given
     * URL as the only URL to process.
     *
     * @param url the URL crawling starts from
     */
    public void crawl( URL url ) {
      HashSet pending = new HashSet();
      pending.add( url );

      this.crawled = new HashSet();
      this.urlsToProcess = pending;
    }
  
    /**
     * Return an iterator handing out the URLs found by this crawler.
     *
     * @return Iterator a new CocoonCrawlerIterator bound to this crawler
     */
    public Iterator iterator() {
      final Iterator crawlerIterator = new CocoonCrawlerIterator( this );
      return crawlerIterator;
    }
  
    /**
     * Iterator over the URLs of a crawler. Each call of next() takes one
     * URL from the crawler's queue and queues the links found for that URL.
     */
    public static class CocoonCrawlerIterator implements Iterator {
      private SimpleCocoonCrawlerImpl cocoonCrawler;

      CocoonCrawlerIterator( SimpleCocoonCrawlerImpl cocoonCrawler ) {
        this.cocoonCrawler = cocoonCrawler;
      }
      
      /**
       * Crawling is finished once no URL is left in the queue.
       */
      public boolean hasNext() {
        return !cocoonCrawler.urlsToProcess.isEmpty();
      }
  
      /**
       * Return the next URL, or null if the queue is empty.
       */
      public Object next() {
        Iterator pending = cocoonCrawler.urlsToProcess.iterator();
        if (!pending.hasNext()) {
          return null;
        }

        URL current = (URL)pending.next();
        // take the URL off the queue before its links get queued
        cocoonCrawler.urlsToProcess.remove( current );

        List links = cocoonCrawler.getLinks( current );
        if (links != null) {
          cocoonCrawler.urlsToProcess.addAll( links );
        }
        return current;
      }
  
      /**
       * Removing is not supported.
       * @exception UnsupportedOperationException is always thrown
       */
      public void remove() {
        throw new UnsupportedOperationException( "remove is not implemented" );
      }
    }
  
    /**
     * Fetch the links of a URL.
     * The link view of the URL (its path plus the link view query) is
     * requested; if the response has the configured link content type, each
     * line of the response is taken as one link relative to the URL.
     *
     * @param url the parent URL
     * @return List of URLs which are not crawled yet, included and not
     *   excluded; null if the URL is not included, is excluded, has been
     *   crawled already, or its response does not have the link content type
     */
    private List getLinks( URL url ) {
      ArrayList url_links = null;
      final String url_string = url.toString();

      if (!isIncludedURL( url_string )) {
        return null;
      }
      // don't try to get links for url which is excluded
      if (isExcludedURL( url_string )) {
        return null;
      }

      // don't try to get links for url which has been crawled already
      if (crawled.contains( url_string )) {
        return null;
      }

      // mark it as crawled
      crawled.add( url_string );

      if (getLogger().isDebugEnabled()) {
        getLogger().debug( "Get links of URL: " + url_string );
      }
      
      // get links of url
      InputStream is = null;
      try {
        URL links_url = new URL( url, url.getPath() +  linkViewQuery );
        URLConnection links_url_connection = links_url.openConnection();
        is = links_url_connection.getInputStream();
        BufferedReader br = new BufferedReader( new InputStreamReader( is ) );

        String content_type = links_url_connection.getContentType();
        if (getLogger().isDebugEnabled()) {
          getLogger().debug( "Content-type: " + content_type );
        }

        // the content type is null if the server does not report one
        if (content_type != null && content_type.equals( linkContentType )) {
          url_links = new ArrayList();

          // content is supposed to be a list of links,
          // relative to current URL
          String line;
          while ((line = br.readLine()) != null) {
            URL new_url = new URL( url, line );
            String new_url_string = new_url.toString();
            boolean add_url = !crawled.contains( new_url_string )
              && isIncludedURL( new_url_string )
              && !isExcludedURL( new_url_string );
            if (add_url) {
              if (getLogger().isDebugEnabled()) {
                getLogger().debug( "Add URL: " + new_url_string );
              }
              url_links.add( new_url );
            }
          }
          // now we have a list of URL which should be examined
        }
      } catch (IOException ioe) {
        getLogger().warn( "Problems get links of " + url, ioe );
      } finally {
        // always give the connection's stream back, even after a failure
        if (is != null) {
          try {
            is.close();
          } catch (IOException ignored) {
            // nothing more can be done for a stream that fails to close
          }
        }
      }
      return url_links;
    }
  
    /**
     * Check if a URL matches one of the exclude patterns.
     *
     * @param url the URL to check
     * @return true if at least one exclude pattern matches the URL;
     *   false if none matches, or if there is no exclude pattern set
     */
    private boolean isExcludedURL( String url ) {
      boolean excluded = false;

      // without an exclude pattern set nothing is excluded
      if (excludeCrawlingURL != null) {
        final String candidate = url.toString();
        Iterator patterns = excludeCrawlingURL.iterator();
        while (!excluded && patterns.hasNext()) {
          RE pattern = (RE)patterns.next();
          excluded = pattern.match( candidate );
        }
      }
      return excluded;
    }
    
    /**
     * Check if a URL matches one of the include patterns.
     *
     * @param url the URL to check
     * @return true if there is no include pattern set, or if at least one
     *   include pattern matches the URL; false otherwise
     */
    private boolean isIncludedURL( String url ) {
      // by default include URL for crawling; the guard has to look at the
      // include set, as this is the set iterated below
      if (includeCrawlingURL == null) {
        return true;
      }
      
      Iterator i = includeCrawlingURL.iterator();
      while (i.hasNext()) {
        RE pattern = (RE)i.next();
        if (pattern.match( url )) {
          return true;
        }
      }
      return false;
    }
  }
  
  
  
  
  1.1                  xml-cocoon2/src/org/apache/cocoon/components/lucene/SimpleLuceneCocoonIndexerImpl.java
  
  Index: SimpleLuceneCocoonIndexerImpl.java
  ===================================================================
  /*****************************************************************************
   * Copyright (C) The Apache Software Foundation. All rights reserved.        *
   * ------------------------------------------------------------------------- *
   * This software is published under the terms of the Apache Software License *
   * version 1.1, a copy of which has been included  with this distribution in *
   * the LICENSE file.                                                         *
   *****************************************************************************/
  package org.apache.cocoon.components.lucene;
  
  import org.apache.avalon.excalibur.pool.Recyclable;
  import org.apache.avalon.framework.activity.Disposable;
  
  import org.apache.avalon.framework.component.ComponentException;
  import org.apache.avalon.framework.component.ComponentManager;
  import org.apache.avalon.framework.component.Composable;
  
  import org.apache.avalon.framework.configuration.Configurable;
  import org.apache.avalon.framework.configuration.Configuration;
  import org.apache.avalon.framework.configuration.ConfigurationException;
  
  import org.apache.cocoon.ProcessingException;
  
  import org.apache.avalon.framework.logger.AbstractLoggable;
  
  import org.apache.lucene.document.Document;
  import org.apache.lucene.document.Field;
  import org.apache.lucene.document.DateField;
  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.store.Directory;
  import org.apache.lucene.index.*;
  
  import java.io.IOException;
  import java.util.Iterator;
  import java.net.URL;
  
  /**
   * A lucene indexer: crawls the links of a base URL by means of a
   * CocoonCrawler, builds lucene documents for each crawled URL by means of
   * a LuceneXMLIndexer, and writes these documents into a lucene index.
   */
  public class SimpleLuceneCocoonIndexerImpl extends AbstractLoggable
    implements LuceneCocoonIndexer, Configurable, Composable, Disposable {
    
    protected final static String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname";
    protected final static String ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer";
    private String analyzerClassnameDefault = ANALYZER_CLASSNAME_DEFAULT;
  
    protected final static String DIRECTORY_CONFIG = "directory";
    protected final static String DIRECTORY_DEFAULT = null;
    private String directoryDefault = DIRECTORY_DEFAULT;
  
    private final static String MERGE_FACTOR_CONFIG = "merge-factor";
    private final static int MERGE_FACTOR_DEFAULT = 20;
    private int mergeFactor = MERGE_FACTOR_DEFAULT;
    
    /** The component manager instance */
    protected ComponentManager manager = null;
  
    /** The analyzer handed to the index writer, see setAnalyzer */
    Analyzer analyzer;
  
    /**
     * Configure this component, reading the optional
     * <code>analyzer-classname</code> and <code>merge-factor</code> child
     * elements.
     *
     * @param conf the configuration of this component
     * @exception ConfigurationException if a configuration value cannot be read
     */
    public void configure( Configuration conf ) throws ConfigurationException {
      Configuration child;
      String value;
      
      // the values have to be read from the child element which has been
      // looked up, not from the parent configuration
      child = conf.getChild( ANALYZER_CLASSNAME_CONFIG, false );
      if (child != null) {
        value = child.getValue( ANALYZER_CLASSNAME_DEFAULT );
        if (value != null) {
          analyzerClassnameDefault = value;
        }
      }
      child = conf.getChild( MERGE_FACTOR_CONFIG, false );
      if (child != null) {
        mergeFactor = child.getValueAsInteger( MERGE_FACTOR_DEFAULT );
      }
    }
    
    /**
     * Set the current <code>ComponentManager</code> instance used by this
     * <code>Composable</code>.
     */
    public void compose(ComponentManager manager) throws ComponentException {
      this.manager=manager;
    }
    
    /**
     * Nothing to dispose.
     */
    public void dispose() {
    }
    
    /**
     * Set the analyzer used when writing the index.
     *
     * @param analyzer the analyzer handed to the index writer
     */
    public void setAnalyzer( Analyzer analyzer ) {
      this.analyzer = analyzer;
    }
  
    /**
     * index content of base_url, index content of links from base_url.
     * URLs using a different host or port than base_url are skipped.
     *
     * @param index the lucene store to write the index to
     * @param create iff true create, or overwrite existing index, else
     *   update existing index.
     * @param base_url index content of base_url, and crawl through all its
     *   links recursively.
     * @exception ProcessingException if accessing the index fails, or if a
     *   needed component cannot be looked up
     */
    public void index( Directory index, boolean create, URL base_url  ) 
      throws ProcessingException {
      
      IndexWriter writer = null;
      LuceneXMLIndexer lxi = null;
      CocoonCrawler cocoonCrawler = null;
      
      try {
        lxi = (LuceneXMLIndexer)manager.lookup( LuceneXMLIndexer.ROLE );
  
        writer = new IndexWriter( index, analyzer, create );
        writer.mergeFactor = this.mergeFactor;
    
        cocoonCrawler = (CocoonCrawler)manager.lookup( CocoonCrawler.ROLE );
        cocoonCrawler.crawl( base_url );
        
        Iterator cocoonCrawlerIterator = cocoonCrawler.iterator();
        while (cocoonCrawlerIterator.hasNext()) {
          URL crawl_url = (URL)cocoonCrawlerIterator.next();
  
          if (!crawl_url.getHost().equals( base_url.getHost() ) ||
            crawl_url.getPort() != base_url.getPort() ) {
  
            // skip urls using different host, or port than host,
            // or port of base url; report it via the component logger
            // instead of writing to standard output
            if (getLogger().isDebugEnabled()) {
              getLogger().debug( "Skipping carwling URL " + crawl_url.toString() +
                " as base_url is " + base_url.toString() );
            }
            continue;
          }
  
          // build lucene documents from the content of the crawl_url
          lxi.build( crawl_url );
          Iterator i = lxi.iterator();
          
          // add all built lucene documents
          while (i.hasNext()) {
            Document document = (Document)i.next();
            writer.addDocument( document );
          }
        }
        // optimize it
        writer.optimize();
      } catch (IOException ioe) {
        throw new ProcessingException( "IOException in index()", ioe );
      } catch (ComponentException ce) {
        throw new ProcessingException( "ComponentException in index()", ce );
      } finally {
        if (writer != null) {
          try {
            writer.close();
          } catch (IOException ioe) {
            // closing is best effort, but a failure should not pass unnoticed
            getLogger().warn( "Cannot close index writer", ioe );
          }
          writer = null;
        }
        
        if (lxi != null) {
          manager.release( lxi );
          lxi = null;
        }
        if (cocoonCrawler != null) {
          manager.release( cocoonCrawler );
          cocoonCrawler = null;
        }
      }
    }
  
    /**
     * Walks the <code>uid</code> terms of an existing index, and deletes
     * the documents of the terms walked over.
     */
    class DocumentDeletableIterator {
      private IndexReader reader;       // existing index
      private TermEnum uidIter;         // document id iterator
      
      /**
       * Open the index, and position the uid iterator on the first
       * term at or after the empty <code>uid</code> term.
       *
       * @param directory the lucene store holding the existing index
       * @exception IOException if the index cannot be opened
       */
      public DocumentDeletableIterator( Directory directory ) throws IOException {
        reader = IndexReader.open( directory );       // open existing index
        uidIter = reader.terms( new Term("uid", "")); // init uid iterator
      }
      
      /**
       * Close the uid iterator and the index reader. The reader is closed
       * even if closing the uid iterator fails.
       */
      protected void finalize() throws Throwable {
        try {
          if (uidIter != null) {
            uidIter.close();          // close uid iterator
            uidIter = null;
          }
        } finally {
          try {
            if (reader != null) {
              reader.close();         // close existing index
              reader = null;
            }
          } finally {
            super.finalize();
          }
        }
      }
  
      /**
       * Delete the documents of all remaining uid terms.
       */
      public void deleteAllStaleDocuments() throws IOException {
        while (documentIsDeletable( uidIter.term() )) {
          reader.delete(uidIter.term());
          uidIter.next();
        }
      }
      
      /**
       * Delete the documents of all uid terms sorting before the given
       * uid, then step over the term equal to the given uid, if present.
       *
       * @param uid the uid of the document currently being indexed
       */
      public void deleteModifiedDocuments( String uid ) throws IOException {
        while (documentHasBeenModified( uidIter.term(), uid )) {
          reader.delete( uidIter.term() );
          uidIter.next();
        }
        if (documentHasNotBeenModified( uidIter.term(), uid )) {
          uidIter.next();
        }
      }
  
      /**
       * @return true if the term is a <code>uid</code> term
       */
      boolean documentIsDeletable( Term term ) {
        // compare the field name by value, not by reference
        return term != null && "uid".equals( term.field() );
      }
      
      /**
       * @return true if the term is a uid term sorting before the given uid
       */
      boolean documentHasBeenModified( Term term, String uid ) {
        return documentIsDeletable( term ) &&
               term.text().compareTo(uid) < 0;
      }

      /**
       * @return true if the term is a uid term equal to the given uid
       */
      boolean documentHasNotBeenModified( Term term, String uid ) {
        return documentIsDeletable( term ) &&
               term.text().compareTo(uid) == 0;
      }
    }
  }
  
  
  
  
  1.1                  xml-cocoon2/src/org/apache/cocoon/components/lucene/SimpleLuceneCocoonSearcherImpl.java
  
  Index: SimpleLuceneCocoonSearcherImpl.java
  ===================================================================
  /*****************************************************************************
   * Copyright (C) The Apache Software Foundation. All rights reserved.        *
   * ------------------------------------------------------------------------- *
   * This software is published under the terms of the Apache Software License *
   * version 1.1, a copy of which has been included  with this distribution in *
   * the LICENSE file.                                                         *
   *****************************************************************************/
  package org.apache.cocoon.components.lucene;
  
  import org.apache.avalon.excalibur.pool.Recyclable;
  import org.apache.avalon.framework.activity.Disposable;
  
  import org.apache.avalon.framework.component.ComponentException;
  import org.apache.avalon.framework.component.ComponentManager;
  import org.apache.avalon.framework.component.Composable;
  
  import org.apache.avalon.framework.configuration.Configurable;
  import org.apache.avalon.framework.configuration.Configuration;
  import org.apache.avalon.framework.configuration.ConfigurationException;
  
  import org.apache.cocoon.ProcessingException;
  
  import org.apache.avalon.framework.logger.AbstractLoggable;
  
  import org.apache.lucene.document.Document;
  import org.apache.lucene.document.Field;
  import org.apache.lucene.document.DateField;
  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.store.Directory;
  import org.apache.lucene.index.*;
  import org.apache.lucene.search.*;
  import org.apache.lucene.queryParser.*;
  
  import java.io.IOException;
  import java.util.Iterator;
  import java.net.URL;
  
  /**
   * A lucene searcher: parses a query string, and searches the index of
   * the directory set via setDirectory, reusing an index reader as long as
   * the index has not been modified.
   */
  public class SimpleLuceneCocoonSearcherImpl extends AbstractLoggable
    implements LuceneCocoonSearcher, Configurable, Composable, Disposable, Recyclable {
  
    protected final static String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname";
    protected final static String ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer";
    private String analyzerClassnameDefault = ANALYZER_CLASSNAME_DEFAULT;
    
    protected final static String DEFAULT_SEARCH_FIELD_CONFIG = "default-search-field";
    protected final static String DEFAULT_SEARCH_FIELD_DEFAULT = "body";
    private String defaultSearchFieldDefault = DEFAULT_SEARCH_FIELD_DEFAULT;
    
    protected final static String DEFAULT_QUERY_CONFIG = "default-query";
    protected final static String DEFAULT_QUERY_DEFAULT = null;
    private String defaultQueryDefault = DEFAULT_QUERY_DEFAULT;
    
    protected final static String QUERYPARSER_CLASSNAME_CONFIG = "queryparser-classname";
    protected final static String QUERYPARSER_CLASSNAME_DEFAULT = "org.apache.lucene.queryParser.QueryParser";
    private String queryparserClassnameDefault = QUERYPARSER_CLASSNAME_DEFAULT;
  
    protected final static String DIRECTORY_CONFIG = "directory";
    protected final static String DIRECTORY_DEFAULT = null;
    private String directoryDefault = DIRECTORY_DEFAULT;
  
    
    /** The component manager instance */
    protected ComponentManager manager=null;
  
    private Analyzer analyzer;
    private Directory directory;
    private IndexSearcher indexSearcher;
  
    private IndexReaderCache indexReaderCache;
    
    /**
     * Configure this component, reading the optional
     * <code>analyzer-classname</code>, <code>default-search-field</code>,
     * <code>default-query</code>, <code>queryparser-classname</code> and
     * <code>directory</code> child elements.
     *
     * @param conf the configuration of this component
     * @exception ConfigurationException if a configuration value cannot be read
     */
    public void configure( Configuration conf ) throws ConfigurationException {
      Configuration child;
      String value;
      
      // the values have to be read from the child element which has been
      // looked up, not from the parent configuration
      child = conf.getChild( ANALYZER_CLASSNAME_CONFIG, false );
      if (child != null) {
        value = child.getValue( ANALYZER_CLASSNAME_DEFAULT );
        if (value != null) {
          analyzerClassnameDefault = value;
        }
      }
      
      child = conf.getChild( DEFAULT_SEARCH_FIELD_CONFIG, false );
      if (child != null) {
        value = child.getValue( DEFAULT_SEARCH_FIELD_DEFAULT );
        if (value != null) {
          defaultSearchFieldDefault = value;
        }
      }
      
      child = conf.getChild( DEFAULT_QUERY_CONFIG, false );
      if (child != null) {
        value = child.getValue( DEFAULT_QUERY_DEFAULT );
        if (value != null) {
          defaultQueryDefault = value;
        }
      }
  
      child = conf.getChild( QUERYPARSER_CLASSNAME_CONFIG, false );
      if (child != null) {
        value = child.getValue( QUERYPARSER_CLASSNAME_DEFAULT );
        if (value != null) {
          queryparserClassnameDefault = value;
        }
      }
  
      child = conf.getChild( DIRECTORY_CONFIG, false );
      if (child != null) {
        value = child.getValue( DIRECTORY_DEFAULT );
        if (value != null) {
          directoryDefault = value;
        }
      }
    }
    
    /**
     * Set the current <code>ComponentManager</code> instance used by this
     * <code>Composable</code>.
     */
    public void compose(ComponentManager manager) throws ComponentException {
      this.manager=manager;
    }
    
    /**
     * Release the index searcher, and the index reader cache.
     */
    public void dispose() {
      releaseIndexSearcher();
      releaseIndexReaderCache();
    }
    
    /**
     * Release the index searcher; the index reader cache is kept.
     */
    public void recycle() {
      releaseIndexSearcher();
    }
    
    /**
     * Close the index searcher held by this component, if any.
     */
    private void releaseIndexSearcher() {
      if (indexSearcher != null) {
        try {
          indexSearcher.close();
        } catch (IOException ioe) {
          // ignore it
        }
        indexSearcher = null;
      }
    }
    
    /**
     * Drop the reference to the index reader cache.
     */
    private void releaseIndexReaderCache() {
      indexReaderCache = null;
    }

    /**
     * set an analyzer, overriding the analyzerClassnameDefault.
     */
    public void setAnalyzer( Analyzer analyzer ) {
      this.analyzer = analyzer;
    }
    
    /**
     * Set the directory holding the index to search; the index reader
     * cache of the previous directory is dropped.
     *
     * @param directory the lucene store holding the index
     */
    public void setDirectory( Directory directory ) {
      this.directory = directory;
      indexReaderCache = null;
    }
    
    /**
     * Search the index.
     *
     * @param query_string the query to parse; if null the configured
     *   default query is used
     * @param default_field the field searched by query terms naming no
     *   field; if null the configured default search field is used
     * @return Hits the hits of the query
     * @exception ProcessingException if the query cannot be parsed, or
     *   the index cannot be accessed
     */
    public Hits search( String query_string, String default_field ) throws ProcessingException {
      // fall back to the configured defaults
      if (query_string == null) {
        query_string = defaultQueryDefault;
      }
      if (default_field == null) {
        default_field = defaultSearchFieldDefault;
      }

      Hits hits = null;
      try {
        Query query = QueryParser.parse( query_string, default_field, analyzer);
        
        // release index searcher for each new search
        releaseIndexSearcher();
        
        // NOTE(review): this searcher is kept local on purpose, it is not
        // stored in the indexSearcher field. Presumably closing a searcher
        // also closes the reader it wraps, which is the cached reader
        // handed out by getReader() - confirm against the lucene version
        // in use before storing it in the field.
        IndexSearcher searcher = new IndexSearcher( getReader() );
        hits = searcher.search( query );
        // do not close the searcher now, as using hits needs an
        // opened searcher
      } catch (ParseException pe) {
        throw new ProcessingException( "Cannot parse query " + query_string, pe );
      } catch (IOException ioe) {
        throw new ProcessingException( "Cannot access hits", ioe );
      }
      return hits;
    }
  
    /**
     * This class should help to minimise usage of IndexReaders.
     * It keeps one index reader together with the modification time of
     * the index at the moment the reader has been set.
     */
    static class IndexReaderCache {  
      private Directory directory;
      private IndexReader indexReader;
      private long lastModified;
      
      IndexReaderCache( Directory directory ) {
        this.directory = directory;
      }
      
      /**
       * Close the cached index reader, if any.
       */
      protected void finalize() throws Throwable {
        try {
          if (indexReader != null) {
            indexReader.close();
            indexReader = null;
          }
        } finally {
          super.finalize();
        }
      }
      
      /**
       * Replace the cached index reader, closing the previous one, and
       * remember the current modification time of the index.
       */
      void setIndexReader( IndexReader reader ) throws IOException {
        if (indexReader != null) {
          indexReader.close();
        }
        indexReader = reader;
        lastModified = IndexReader.lastModified( this.directory );
      }
      
      /**
       * @return the cached index reader, may be null
       */
      IndexReader getIndexReader() {
        return indexReader;
      }
      
      /**
       * @return true if a reader is cached, and the index has not been
       *   modified since the reader has been set
       */
      boolean indexReaderIsValid() throws IOException {
        return indexReader != null &&
          IndexReader.lastModified( this.directory ) == lastModified;
      }
    }
    
    /**
     * Get an index reader of the current directory, reusing the cached
     * reader as long as it is valid.
     *
     * @return IndexReader a reader of the current index
     * @exception IOException if the index cannot be opened
     */
    public IndexReader getReader() throws IOException {
      if (indexReaderCache == null) {
        indexReaderCache = new IndexReaderCache( directory );
      }
      IndexReader indexReader = null;
      if (indexReaderCache.indexReaderIsValid()) {
        indexReader = indexReaderCache.getIndexReader();
      } else {
        indexReader = IndexReader.open( this.directory );
        indexReaderCache.setIndexReader( indexReader );
      }
      return indexReader;
    }
  }
  
  
  
  
  1.1                  xml-cocoon2/src/org/apache/cocoon/components/lucene/SimpleLuceneXMLIndexerImpl.java
  
  Index: SimpleLuceneXMLIndexerImpl.java
  ===================================================================
  /*****************************************************************************
   * Copyright (C) The Apache Software Foundation. All rights reserved.        *
   * ------------------------------------------------------------------------- *
   * This software is published under the terms of the Apache Software License *
   * version 1.1, a copy of which has been included  with this distribution in *
   * the LICENSE file.                                                         *
   *****************************************************************************/
  package org.apache.cocoon.components.lucene;
  
  import org.apache.avalon.framework.activity.Disposable;
  import org.apache.avalon.framework.configuration.Configurable;
  import org.apache.avalon.framework.configuration.Configuration;
  import org.apache.avalon.framework.configuration.ConfigurationException;
  import org.apache.avalon.framework.component.ComponentException;
  import org.apache.avalon.framework.component.ComponentManager;
  import org.apache.avalon.framework.component.Composable;
  import org.apache.avalon.framework.logger.AbstractLoggable;
  import org.apache.avalon.framework.parameters.Parameters;
  import org.apache.avalon.framework.thread.ThreadSafe;
  
  import org.apache.avalon.framework.logger.AbstractLoggable;
  
  import org.apache.cocoon.components.parser.Parser;
  import org.apache.cocoon.ProcessingException;
  import org.apache.cocoon.environment.Source;
  import org.apache.cocoon.environment.SourceResolver;
  
  import java.io.*;
  
  import java.util.HashSet;
  import java.util.Map;
  import java.util.Iterator;
  import java.util.List;
  import java.util.ArrayList;
  
  import java.net.URL;
  import java.net.URLConnection;
  
  import javax.xml.parsers.*;
  import org.xml.sax.ContentHandler;
  import org.xml.sax.ErrorHandler;
  import org.xml.sax.Locator;
  import org.xml.sax.InputSource;
  import org.xml.sax.SAXException;
  import org.xml.sax.SAXParseException;
  import org.xml.sax.XMLReader;
  import org.xml.sax.Attributes;
  
  import org.apache.lucene.document.Document;
  import org.apache.lucene.document.Field;
  import org.apache.lucene.document.DateField;
  
  /**
   * A simple class building lucene documents from xml content.
   */
  public class SimpleLuceneXMLIndexerImpl extends AbstractLoggable 
    implements LuceneXMLIndexer, Configurable, Composable {
  
    /**
     * list of lucene Document objects of the last build,
     * null if the last build produced no documents
     */
    List documents;
    
    /**
     * The component manager instance 
     */
    protected ComponentManager manager=null;
  
    /**
     * append this string to the url in order to get the 
     * content view of the url
     */
    final String CONTENT_QUERY = "?cocoon-view=content";
    
    /**
     * set of allowed content types
     */
    final HashSet allowedContentType;
  
    /**
     * Create an indexer accepting the content types
     * <code>text/xml</code> and <code>text/xhtml</code>.
     */
    public SimpleLuceneXMLIndexerImpl() {
      documents = null;
  
      allowedContentType = new HashSet();
      allowedContentType.add( "text/xml" );
      allowedContentType.add( "text/xhtml" );
    }
  
    /**
     * configure; no configuration is evaluated
     */
    public void configure(Configuration conf) throws ConfigurationException {
    }
  
    /**
     * Set the current <code>ComponentManager</code> instance used by this
     * <code>Composable</code>.
     */
    public void compose(ComponentManager manager) throws ComponentException {
      this.manager = manager;
    }
  
  
    /**
     * return a list of all lucene documents generated by @see build
     *
     * @return List list of lucene Documents, may be null
     */
    public List allDocuments() {
      return documents;
    }
  
    /**
     * return an iterator of all lucene documents generated by @see build
     *
     * @return Iterator iterator of lucene Documents
     */
    public Iterator iterator() {
      if (documents == null) {
        return new ArrayList().iterator();
      }
      return documents.iterator();
    }
  
    /**
     * Build lucene documents from a URL.
     * The content view of the URL is requested; if its content type is one
     * of the allowed content types, the content is parsed, and a url field
     * and a uid field are added to each resulting document.
     *
     * @param url the content of this url gets indexed.
     * @exception ProcessingException if the URL cannot be read or parsed
     */
    public void build(URL url)
    throws ProcessingException {
  
      // forget the documents of the previous build, otherwise they would be
      // handed out again for a url whose content is not indexed
      documents = null;

      try {
        URL contentURL = new URL(url, url.getPath() + CONTENT_QUERY );
        URLConnection contentURLConnection = contentURL.openConnection();
        String contentType = contentURLConnection.getContentType();
        if (contentType != null) {
          // strip parameters like "; charset=UTF-8" before the lookup
          int parameterStart = contentType.indexOf( ';' );
          if (parameterStart != -1) {
            contentType = contentType.substring( 0, parameterStart );
          }
          contentType = contentType.trim();
        }

        if (contentType != null &&
          allowedContentType.contains( contentType )) {
    
          LuceneIndexContentHandler luceneIndexContentHandler = new LuceneIndexContentHandler();
          indexDocument( contentURLConnection, luceneIndexContentHandler );
          //
          // document is parsed
          //
          final String documentUid = uid( contentURLConnection );
          Iterator it = luceneIndexContentHandler.iterator();
          while (it.hasNext()) {
            Document d = (Document)it.next();
            d.add(Field.UnIndexed( URL_FIELD, url.toString()));
            // store ... false, index ... true, token ... false
            d.add(new Field( UID_FIELD, documentUid, false, true, false));
          }
          documents = luceneIndexContentHandler.allDocuments();
        }
      } catch (IOException ioe) {
        throw new ProcessingException( "Cannot read URL " + url, ioe );
      }
    }
  
    /**
     * index input stream producing lucene Documents
     *
     * @param contentURLConnection the xml content which should get indexed.
     * @param luceneIndexContentHandler ContentHandler for generating 
     *   a lucene Document from XML content.
     * @exception ProcessingException if the content cannot be read or
     *   parsed, or if no xml parser can be looked up
     */
    private void indexDocument( URLConnection contentURLConnection,
      LuceneIndexContentHandler luceneIndexContentHandler )
    throws ProcessingException {
      
      InputStream is = null;
      InputSource in = null;
      Parser parser = null;
      
      try {
        is = contentURLConnection.getInputStream();
        in = new InputSource(is);
        
        // get an XML parser
        parser = (Parser)this.manager.lookup(Parser.ROLE);
        //reader.setErrorHandler(new CocoonErrorHandler());
        parser.setContentHandler( luceneIndexContentHandler );
        parser.parse(in);
        //
        // document is parsed
        //
      } catch (IOException ioe) {
        throw new ProcessingException( "Cannot read!", ioe );
      } catch (SAXException saxe) {
        throw new ProcessingException("Cannot parse!", saxe);
      } catch (ComponentException ce) {
        throw new ProcessingException( "Cannot lookup xml parser!", ce );
      } finally {
        if (parser != null) {
          this.manager.release(parser);
        }
        // always give the connection's stream back, even after a failure
        if (is != null) {
          try {
            is.close();
          } catch (IOException ignored) {
            // nothing more can be done for a stream that fails to close
          }
        }
      }
    }
    
    /**
     * return a unique uid of a url connection
     *
     * @return String unique uid of a urlConnection
     */
    private String uid( URLConnection urlConnection ) {
      // Append path and date into a string in such a way that lexicographic
      // sorting gives the same results as a walk of the file hierarchy.  Thus
      // null (\u0000) is used both to separate directory components and to
      // separate the path from the date.
      // NOTE(review): urlConnection.toString() is used as the path part;
      // confirm its format is what the uid is expected to contain.
      return urlConnection.toString().replace('/', '\u0000') +
        "\u0000" +
        DateField.timeToString(urlConnection.getLastModified());
    }
  }
  
  
  
  

----------------------------------------------------------------------
In case of troubles, e-mail:     webmaster@xml.apache.org
To unsubscribe, e-mail:          cocoon-cvs-unsubscribe@xml.apache.org
For additional commands, e-mail: cocoon-cvs-help@xml.apache.org