You are viewing a plain text version of this content. The canonical link for it is here.
Posted to cvs@cocoon.apache.org by cr...@apache.org on 2003/07/20 05:33:43 UTC

cvs commit: cocoon-2.1/src/blocks/lucene/java/org/apache/cocoon/transformation LuceneIndexTransformer.java

crossley    2003/07/19 20:33:42

  Modified:    .        status.xml
               src/blocks/lucene/java/org/apache/cocoon/transformation
                        LuceneIndexTransformer.java
  Log:
  - Propagate the lucene:* elements to the result tree
  - Added an "elapsed-time" attribute to the lucene:document elements output.
  - It is now possible to transform the results into a useful report about the
  indexing operation.
  - Added constants for the 3 states of the parser:
  STATE_GROUND=0, STATE_QUERY=1, STATE_DOCUMENT=2
  - Allow "incremental" indexing, fixing a bug which added multiple versions
  of the same document (with the same UID) to the index, if the document was
  indexed more than once.
  - Changed the semantics of the "create" flag slightly, in that an index is
  always created if it is missing, even if the "create" flag is false.
  
  PR: 21557
  Submitted by: Conal Tuohy conal<AT>nzetc.org
  
  Revision  Changes    Path
  1.95      +12 -1     cocoon-2.1/status.xml
  
  Index: status.xml
  ===================================================================
  RCS file: /home/cvs/cocoon-2.1/status.xml,v
  retrieving revision 1.94
  retrieving revision 1.95
  diff -u -r1.94 -r1.95
  --- status.xml	18 Jul 2003 14:46:30 -0000	1.94
  +++ status.xml	20 Jul 2003 03:33:42 -0000	1.95
  @@ -163,6 +163,11 @@
       </action>
   
       <action context="code">
  +      Lucene is writing info to stdout when searching.
  +    </action>
  +
  +    <action context="code">
  +<!-- FIXME: remove ... this already above -->
         For 2.1: Make a guide on how to upgrade Cocoon, and see how this can be eased.
       </action>
   
  @@ -184,6 +189,12 @@
     <changes>
   
    <release version="@version@" date="@date@">
  +  <action dev="DC" type="fix" fixes-bug="21557" due-to="Conal Tuohy" due-to-email="conal@nzetc.org">
  +    Various fixes and enhancements to Lucene search. Propagate the lucene:*
  +    elements to the result tree and add "elapsed-time" attribute, so now
  +    possible to transform the results into a useful report about indexing.
  +    Fix "incremental" indexing.
  +  </action>
     <action dev="GR" type="add">
       Added CLOB support in SQLTransformer.
     </action>
  
  
  
  1.5       +135 -44   cocoon-2.1/src/blocks/lucene/java/org/apache/cocoon/transformation/LuceneIndexTransformer.java
  
  Index: LuceneIndexTransformer.java
  ===================================================================
  RCS file: /home/cvs/cocoon-2.1/src/blocks/lucene/java/org/apache/cocoon/transformation/LuceneIndexTransformer.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- LuceneIndexTransformer.java	19 Mar 2003 15:42:17 -0000	1.4
  +++ LuceneIndexTransformer.java	20 Jul 2003 03:33:42 -0000	1.5
  @@ -82,6 +82,8 @@
   import org.apache.lucene.document.Document;
   import org.apache.lucene.document.Field;
   import org.apache.lucene.index.IndexWriter;
  +import org.apache.lucene.index.IndexReader;
  +import org.apache.lucene.index.Term;
   import org.apache.lucene.store.Directory;
   import org.xml.sax.Attributes;
   import org.xml.sax.SAXException;
  @@ -89,9 +91,12 @@
   
   /**
    * A lucene index creation transformer.
  - * <p>FIXME: Write Documentation.</p>
  + * <p>See <a href="http://wiki.cocoondev.org/Wiki.jsp?page=LuceneIndexTransformer">LuceneIndexTransformer</a>
  + * documentation on the Cocoon Wiki.</p>
  + * <p>FIXME: Write more documentation.</p>
    *
    * @author <a href="mailto:vgritsenko@apache.org">Vadim Gritsenko</a>
  + * @author <a href="mailto:conal@nzetc.org">Conal Tuohy</a>
    * @version CVS $Id$
    */
   public class LuceneIndexTransformer extends AbstractTransformer
  @@ -117,6 +122,13 @@
       public static final String LUCENE_DOCUMENT_URL_ATTRIBUTE = "url";
       public static final String LUCENE_ELEMENT_ATTR_TO_TEXT_ATTRIBUTE = "text-attr";
       public static final String LUCENE_ELEMENT_ATTR_STORE_VALUE = "store";
  +    public static final String LUCENE_ELAPSED_TIME_ATTRIBUTE = "elapsed-time";
  +    public static final String CDATA = "CDATA";
  +    
  +    // The 3 states of the state machine
  +    private static final int STATE_GROUND = 0; // initial or "ground" state
  +    private static final int STATE_QUERY = 1; // processing a lucene:index (Query) element
  +    private static final int STATE_DOCUMENT = 2; // processing a lucene:document element
   
       // Initialization time variables
       protected ComponentManager manager = null;
  @@ -129,17 +141,25 @@
   
       // Invocation time parameters values
       private String analyzerClassname;
  -    private String directory;
  +    private String directoryName;
       private int mergeFactor;
   
   
       // Runtime variables
       private int processing;
  +    private boolean createIndex = false;
       private IndexWriter writer;
       private StringBuffer bodyText;
       private Document bodyDocument;
       private String bodyDocumentURL;
       private Stack elementStack = new Stack();
  +    /**
  +     * Storage for the document element's attributes until the document
  +     * has been indexed, so that they can be copied to the output
  +     * along with a boolean <code>indexed</code> attribute.
  +     */
  +    private AttributesImpl documentAttributes; 
  +    private long documentStartTime;
   
   
       private static String uid(String url) {
  @@ -163,7 +183,7 @@
       throws ProcessingException, SAXException, IOException {
           // We don't need all this stuff
           this.analyzerClassname = parameters.getParameter(ANALYZER_CLASSNAME_PARAMETER, analyzerClassnameDefault);
  -        this.directory = parameters.getParameter(DIRECTORY_PARAMETER, directoryDefault);
  +        this.directoryName = parameters.getParameter(DIRECTORY_PARAMETER, directoryDefault);
           this.mergeFactor = parameters.getParameterAsInteger(MERGE_FACTOR_PARAMETER, mergeFactorDefault);
       }
   
  @@ -179,7 +199,7 @@
       }
   
       public void recycle() {
  -        this.processing = 0;
  +        this.processing = STATE_GROUND;
           if (this.writer != null) {
               try { this.writer.close(); } catch (IOException ioe) { }
               this.writer = null;
  @@ -230,7 +250,7 @@
        * @param uri The Namespace URI the prefix is mapped to.
        */
       public void startPrefixMapping(String prefix, String uri) throws SAXException {
  -        if (processing == 0) {
  +        if (processing == STATE_GROUND) {
               super.startPrefixMapping(prefix,uri);
           }
       }
  @@ -241,7 +261,7 @@
        * @param prefix The prefix that was being mapping.
        */
       public void endPrefixMapping(String prefix) throws SAXException {
  -        if (processing == 0) {
  +        if (processing == STATE_GROUND) {
               super.endPrefixMapping(prefix);
           }
       }
  @@ -249,59 +269,68 @@
       public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
           throws SAXException {
   
  -        if (processing == 0) {
  +        if (processing == STATE_GROUND) {
               if (LUCENE_URI.equals(namespaceURI) && LUCENE_QUERY_ELEMENT.equals(localName)){
                   String sCreate = atts.getValue(LUCENE_QUERY_CREATE_ATTRIBUTE);
  -                boolean bCreate = sCreate != null &&
  +                createIndex = sCreate != null &&
                       (sCreate.equalsIgnoreCase("yes") || sCreate.equalsIgnoreCase("true"));
   
  -                String analyzerClassname =
  +                analyzerClassname =
                       atts.getValue(LUCENE_QUERY_ANALYZER_ATTRIBUTE);
                   if (analyzerClassname == null)
                       analyzerClassname = this.analyzerClassname;
  -                Analyzer analyzer = LuceneCocoonHelper.getAnalyzer(analyzerClassname);
   
                   String sMergeFactor =
                       atts.getValue(LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE);
  -                int mergeFactor = this.mergeFactor;
  +                mergeFactor = this.mergeFactor;
                   if (sMergeFactor != null)
                       mergeFactor = Integer.parseInt(sMergeFactor);
   
  -                String directoryName =
  +                String attributeDirectoryName =
                       atts.getValue(LUCENE_QUERY_DIRECTORY_ATTRIBUTE);
  -                if (directoryName == null)
  -                    directoryName = this.directory;
  +                if (attributeDirectoryName != null)
  +                    this.directoryName = attributeDirectoryName;
   
                   // System.out.println("QUERY Create=" + bCreate + ", Directory=" + directoryName + ", Analyzer=" + analyzerClassname);
  -                try {
  -                    Directory directory = LuceneCocoonHelper.getDirectory(
  -                        new File(workDir, directoryName), bCreate);
  -
  -                    writer = new IndexWriter(directory, analyzer, bCreate);
  -                    writer.mergeFactor = mergeFactor;
  -                } catch (IOException e) {
  -                    throw new SAXException(e);
  +                if (!createIndex) {
  +                    // Not asked to create the index - but check if this is necessary anyway:
  +                    try {
  +                        IndexReader reader = openReader();
  +                        reader.close();
  +                    } catch (IOException ioe) {
  +                        // couldn't open the index - so recreate it
  +                        createIndex = true;
  +                    }
                   }
  -
  -                processing = 1;
  +                // propagate the lucene:index to the next stage in the pipeline
  +                super.startElement(namespaceURI, localName, qName, atts);
  +                processing = STATE_QUERY;
               } else {
                   super.startElement(namespaceURI, localName, qName, atts);
               }
  -        } else if (processing == 1) {
  +        } else if (processing == STATE_QUERY) {
  +            // processing a lucene:index - expecting a lucene:document
               if (LUCENE_URI.equals(namespaceURI) && LUCENE_DOCUMENT_ELEMENT.equals(localName)){
                   this.bodyDocumentURL = atts.getValue(LUCENE_DOCUMENT_URL_ATTRIBUTE);
                   if (this.bodyDocumentURL == null)
                       throw new SAXException("<lucene:document> must have @url attribute");
   
                   // System.out.println("  DOCUMENT URL=" + bodyDocumentURL);
  +                
  +                // Remember the time the document indexing began
  +                this.documentStartTime = System.currentTimeMillis();
  +                // remember these attributes so they can be passed on to the next stage in the pipeline,
  +                // when this document element is ended.
  +                //System.out.println("lucene:document startElement: " + namespaceURI + ", " + localName + ", " + qName);
  +                this.documentAttributes = new AttributesImpl(atts);
                   this.bodyText = new StringBuffer();
                   this.bodyDocument = new Document();
                   this.elementStack.clear();
  -                processing = 2;
  +                processing = STATE_DOCUMENT;
               } else {
  -                throw new SAXException("<lucene:query> element can contain only <lucene:document> elements!");
  +                throw new SAXException("<lucene:index> element can contain only <lucene:document> elements!");
               }
  -        } else if (processing == 2) {
  +        } else if (processing == STATE_DOCUMENT) {
               elementStack.push(new IndexHelperField(localName, new AttributesImpl(atts)));
           }
       }
  @@ -309,44 +338,59 @@
       public void endElement(String namespaceURI, String localName, String qName)
           throws SAXException {
   
  -        if (processing == 1) {
  +        if (processing == STATE_QUERY) {
               if (LUCENE_URI.equals(namespaceURI) && LUCENE_QUERY_ELEMENT.equals(localName)) {
                   // End query processing
                   // System.out.println("QUERY END!");
                   try {
  +                    if (this.writer == null)
  +                        openWriter();
                       this.writer.optimize();
                       this.writer.close();
                       this.writer = null;
                   } catch (IOException e) {
                       throw new SAXException(e);
                   }
  -
  -                this.processing = 0;
  +                // propagate the query element to the next stage in the pipeline
  +                super.endElement(namespaceURI, localName, qName);
  +                this.processing = STATE_GROUND;
               } else {
  -                throw new SAXException("</lucene:query> was expected!");
  +                throw new SAXException("</lucene:index> was expected!");
               }
  -        } else if (processing == 2) {
  +        } else if (processing == STATE_DOCUMENT) {
               if (LUCENE_URI.equals(namespaceURI) && LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
                   // End document processing
  +                // System.out.println("  DOCUMENT END!");
                   this.bodyDocument.add(Field.UnStored(LuceneXMLIndexer.BODY_FIELD, this.bodyText.toString()));
  -                System.out.println("    DOCUMENT BODY=" + this.bodyText);
  +                //System.out.println("    DOCUMENT BODY=" + this.bodyText);
                   this.bodyText = null;
   
                   this.bodyDocument.add(Field.UnIndexed(LuceneXMLIndexer.URL_FIELD, this.bodyDocumentURL));
                   // store: false, index: true, tokenize: false
                   this.bodyDocument.add(new Field(LuceneXMLIndexer.UID_FIELD, uid(this.bodyDocumentURL), false, true, false));
                   // System.out.println("    DOCUMENT UID=" + uid(this.bodyDocumentURL));
  -                this.bodyDocumentURL = null;
  -                // System.out.println("  DOCUMENT END!");
                   try {
  -                    this.writer.addDocument(this.bodyDocument);
  -                    this.bodyDocument = null;
  +                    reindexDocument();
                   } catch (IOException e) {
                       throw new SAXException(e);
                   }
  +                this.bodyDocumentURL = null;
   
  -                this.processing = 1;
  -            } else {
  +                // propagate the lucene:document element to the next stage in the pipeline
  +                //System.out.println("lucene:document endElement: " + namespaceURI + ", " + localName + ", " + qName);
  +                long elapsedTime = System.currentTimeMillis() - this.documentStartTime;
  +                //documentAttributes = new AttributesImpl();
  +                this.documentAttributes.addAttribute(
  +                    "", 
  +                    LUCENE_ELAPSED_TIME_ATTRIBUTE, 
  +                    LUCENE_ELAPSED_TIME_ATTRIBUTE, 
  +                    CDATA, 
  +                    String.valueOf(elapsedTime)
  +                );
  +                super.startElement(namespaceURI, localName, qName, this.documentAttributes);
  +                super.endElement(namespaceURI, localName, qName);
  +                this.processing = STATE_QUERY;
  +            } else {                
                   // End element processing
                   IndexHelperField tos = (IndexHelperField) elementStack.pop();
                   StringBuffer text = tos.getText();
  @@ -389,16 +433,63 @@
       public void characters(char[] ch, int start, int length)
           throws SAXException {
   
  -        if (processing == 2 && ch.length > 0 && start >= 0 && length > 1 && elementStack.size() > 0) {
  +        if (processing == STATE_DOCUMENT && ch.length > 0 && start >= 0 && length > 1 && elementStack.size() > 0) {
               String text = new String(ch, start, length);
               ((IndexHelperField) elementStack.peek()).append(text);
               bodyText.append(text);
               bodyText.append(' ');
  -        } else if (processing == 0) {
  +        } else if (processing == STATE_GROUND) {
               super.characters(ch, start, length);
           }
       }
   
  +    private void openWriter() throws IOException {
  +        File indexDirectory = new File(workDir, directoryName);
  +        // If the index directory doesn't exist, then always create it.
  +        boolean indexExists = IndexReader.indexExists(indexDirectory);
  +        if (! IndexReader.indexExists(indexDirectory)) 
  +            createIndex = true;
  +        
  +        // Get the index directory, creating it if necessary
  +        Directory directory = LuceneCocoonHelper.getDirectory(
  +            indexDirectory, 
  +            createIndex
  +        );
  +        Analyzer analyzer = LuceneCocoonHelper.getAnalyzer(analyzerClassname);
  +        this.writer = new IndexWriter(directory, analyzer, createIndex);
  +        this.writer.mergeFactor = mergeFactor; 
  +    }    
  +    
  +    private IndexReader openReader() throws IOException {
  +        Directory directory = LuceneCocoonHelper.getDirectory(
  +            new File(workDir, directoryName), 
  +            createIndex
  +        );
  +        IndexReader reader = IndexReader.open(directory);
  +        return reader;
  +    }    
  +
  +     private void reindexDocument() throws IOException {
  +        if (this.createIndex) {
  +            // The index is being created, so there's no need to delete the doc from an existing index.
  +            // This means we can keep a single IndexWriter open throughout the process.
  +            if (this.writer == null)
  +                openWriter();
  +            this.writer.addDocument(this.bodyDocument);
  +        } else {
  +            // This is an incremental reindex, so the document should be removed from the index before adding it
  +            try {
  +                IndexReader reader = openReader();
  +                reader.delete(new Term(LuceneXMLIndexer.UID_FIELD, uid(this.bodyDocumentURL)));
  +                reader.close();
  +            } catch (IOException e) { /* ignore */ }
  +            openWriter();
  +            this.writer.addDocument(this.bodyDocument);
  +            this.writer.close();
  +            this.writer = null;
  +        }
  +        this.bodyDocument = null;
  +     }
   
       class IndexHelperField
       {
  @@ -427,5 +518,5 @@
           public void append(char[] str, int offset, int length) {
               this.text.append(str, offset, length);
           }
  -    }
  +    }   
   }