You are viewing a plain text version of this content. The canonical link for it is here.
Posted to slide-dev@jakarta.apache.org by un...@apache.org on 2004/08/03 17:01:47 UTC

cvs commit: jakarta-slide/src/stores/org/apache/slide/index XMLContentIndexer.java TextContentIndexer.java

unico       2004/08/03 08:01:47

  Modified:    src/stores/org/apache/slide/index TextContentIndexer.java
  Added:       src/stores/org/apache/slide/index XMLContentIndexer.java
  Log:
  - allow subclasses to override read content behavior
  - add option to configure resource paths to be indexed
  - add some javadoc
  - add XMLContentIndexer that indexes only the character data of an xml document
  
  Revision  Changes    Path
  1.5       +64 -29    jakarta-slide/src/stores/org/apache/slide/index/TextContentIndexer.java
  
  Index: TextContentIndexer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-slide/src/stores/org/apache/slide/index/TextContentIndexer.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- TextContentIndexer.java	20 Jul 2004 07:38:21 -0000	1.4
  +++ TextContentIndexer.java	3 Aug 2004 15:01:46 -0000	1.5
  @@ -45,21 +45,27 @@
   import java.io.CharArrayReader;
   import java.io.ByteArrayInputStream;
   import java.io.Reader;
  +import java.util.ArrayList;
  +import java.util.Collection;
   import java.util.Hashtable;
  +import java.util.Iterator;
  +import java.util.StringTokenizer;
   
   /**
  - * Author: Ryan Rhodes
  - * Date: Jun 24, 2004
  - * Time: 10:34:45 PM
  + * Lucene based IndexStore for indexing content. 
  + * Apart from indexing the content as text field it adds
  + * indexes using the registered content extractor.
    */
   public class TextContentIndexer extends XAServiceBase implements IndexStore {
   
       private static final String INDEX_PATH = "indexpath";
  -
  +    private static final String INCLUDES = "includes";
  +    
       public static final String URI_FIELD = "uri";
       public static final String CONTENT_TEXT = "content";
   
       private String indexpath = "";
  +    private Collection includes;
       private boolean started = false;
   
     /**
  @@ -83,15 +89,15 @@
         {
            try
            {
  -         // create index
  +            // create index
               indexWriter = new IndexWriter(indexpath, new StandardAnalyzer(), true);
            }
            catch (IOException ex)
            {
               getLogger().log("Error while initializing the Lucene index " + e.getMessage(), LOG_CHANNEL, Logger.ERROR);
  -               throw new ServiceInitializationFailedException(this, ex);
  -            }
  -        }
  +            throw new ServiceInitializationFailedException(this, ex);
  +         }
  +      }
   
         try
         {
  @@ -117,7 +123,8 @@
                                             NodeRevisionContent revisionContent)
           throws IndexException
       {
  -        IndexWriter indexWriter = null;
  +      if (!isIncluded(uri.toString())) return;
  +      IndexWriter indexWriter = null;
         try
         {
            indexWriter = new IndexWriter(indexpath, new StandardAnalyzer(), false);
  @@ -126,8 +133,7 @@
            Document doc = new Document();
   
            doc.add(Field.Keyword(URI_FIELD, uri.toString()));
  -         doc.add(Field.Text(CONTENT_TEXT,
  -            new CharArrayReader (revisionContent.getContent())));
  +         doc.add(Field.Text(CONTENT_TEXT, readContent(revisionDescriptor, revisionContent)));
   
            if ( revisionContent != null && revisionDescriptor != null ) {
               ContentExtractor[] extractor = ExtractorManager.getInstance().getContentExtractors(null, revisionDescriptor);
  @@ -158,8 +164,8 @@
               "Error extracting content from " + uri.toString() + " - " + revisionDescriptor.getRevisionNumber(),
               LOG_CHANNEL,
               Logger.ERROR);
  -        }
  -       finally
  +      }
  +      finally
         {
             try
             {
  @@ -185,6 +191,7 @@
                                            NodeRevisionContent revisionContent)
         throws IndexException
       {
  +        if (!isIncluded(uri.toString())) return;
           IndexWriter indexWriter = null;
           try
           {
  @@ -201,8 +208,7 @@
               Document doc = new Document();
   
               doc.add(Field.Keyword(URI_FIELD, uri.toString()));
  -            doc.add(Field.Text(CONTENT_TEXT,
  -                 new CharArrayReader (revisionContent.getContent())));
  +            doc.add(Field.Text(CONTENT_TEXT, readContent(revisionDescriptor, revisionContent)));
   
               if ( revisionContent != null && revisionDescriptor != null ) {
                    ContentExtractor[] extractor = ExtractorManager.getInstance().getContentExtractors(null, revisionDescriptor);
  @@ -256,6 +262,7 @@
       synchronized public void dropIndex(Uri uri, NodeRevisionNumber number)
         throws IndexException
       {
  +        if (!isIncluded(uri.toString())) return;
           if (number == NodeRevisionNumber.HIDDEN_0_0) return;
   
           IndexWriter indexWriter = null;
  @@ -282,14 +289,14 @@
               getLogger().log("Impossible to delete " + uri + " - " + number + " from the Lucene index");
           }
           finally
  -       {
  -           try
  -           {
  -               if(indexWriter != null)
  -                  indexWriter.close();
  -           }
  -           catch(IOException ioe )  {}
  -       }
  +        {
  +            try
  +            {
  +                if(indexWriter != null)
  +                   indexWriter.close();
  +            }
  +            catch(IOException ioe )  {}
  +        }
       }
   
   
  @@ -331,8 +338,12 @@
      }
   
       /**
  -     * Parametrize the service. This index store expects one parameter
  +     * Parametrize the service. This index store expects a parameter
        * "indexpath" to contain the path to the directory to store the index.
  +     * Another optional parameter "includes" lists the paths of resources 
  +     * that are to be indexed in a comma-separated format. 
  +     * Everything under an included path is indexed. If not specified all 
  +     * resources will be indexed.
        * 
        * @param parameters Hashtable containing the parameters' names
        * and associated values
  @@ -342,8 +353,17 @@
      public void setParameters (Hashtable parameters) throws ServiceParameterErrorException, ServiceParameterMissingException
      {
           indexpath = (String)parameters.get (INDEX_PATH);
  -        if (indexpath == null || indexpath.length() == 0)
  -          throw new ServiceParameterMissingException (this, INDEX_PATH);
  +        if (indexpath == null || indexpath.length() == 0) {
  +            throw new ServiceParameterMissingException (this, INDEX_PATH);
  +        }
  +        String includes = (String) parameters.get(INCLUDES);
  +        if (includes != null && includes.length() > 0) {
  +            StringTokenizer tokenizer = new StringTokenizer(includes, ",");
  +            this.includes = new ArrayList(tokenizer.countTokens());
  +            while (tokenizer.hasMoreTokens()) {
  +                this.includes.add(tokenizer.nextToken());
  +            }
  +        }
      }
   
       /**
  @@ -373,4 +393,19 @@
                Logger.INFO);
      }
   
  +    protected Reader readContent(NodeRevisionDescriptor revisionDescriptor, 
  +                                 NodeRevisionContent revisionContent) throws IOException {
  +        return new CharArrayReader (revisionContent.getContent());
  +    }
  +    
  +    protected boolean isIncluded(String uri) {
  +        if (includes == null) return true;
  +        Iterator iter = includes.iterator();
  +        while (iter.hasNext()) {
  +            if (uri.startsWith((String) iter.next())) {
  +                return true;
  +            }
  +        }
  +        return false;
  +    }
   }
  
  
  
  1.1                  jakarta-slide/src/stores/org/apache/slide/index/XMLContentIndexer.java
  
  Index: XMLContentIndexer.java
  ===================================================================
  /*
   * $Header: /home/cvs/jakarta-slide/src/stores/org/apache/slide/index/XMLContentIndexer.java,v 1.1 2004/08/03 15:01:46 unico Exp $
   * $Revision: 1.1 $
   * $Date: 2004/08/03 15:01:46 $
   *
   * ====================================================================
   *
   * Copyright 2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   *
   */
  package org.apache.slide.index;
  
  import java.io.ByteArrayInputStream;
  import java.io.IOException;
  import java.io.Reader;
  import java.io.StringReader;
  
  import javax.xml.parsers.ParserConfigurationException;
  import javax.xml.parsers.SAXParser;
  import javax.xml.parsers.SAXParserFactory;
  
  import org.apache.slide.common.NamespaceAccessToken;
  import org.apache.slide.common.ServiceInitializationFailedException;
  import org.apache.slide.content.NodeRevisionContent;
  import org.apache.slide.content.NodeRevisionDescriptor;
  import org.apache.slide.util.logger.Logger;
  import org.xml.sax.SAXException;
  import org.xml.sax.helpers.DefaultHandler;
  
  /**
   * Extends TextContentIndexer for handling XML content
   * by only indexing the actual character data.
   */
  public class XMLContentIndexer extends TextContentIndexer {
      
      private SAXParser m_parser;
      
      public void initialize(NamespaceAccessToken token) throws ServiceInitializationFailedException {
          super.initialize(token);
          try {
              m_parser = SAXParserFactory.newInstance().newSAXParser();            
          }
          catch (ParserConfigurationException e) {
              getLogger().log("Error creating parser for indexer", LOG_CHANNEL, Logger.ERROR);
              throw new ServiceInitializationFailedException(this, e);
          }
          catch (SAXException e) {
              getLogger().log("Error creating parser for indexer", LOG_CHANNEL, Logger.ERROR);
              throw new ServiceInitializationFailedException(this, e);
          }
      }
  
      protected synchronized Reader readContent(NodeRevisionDescriptor revisionDescriptor, 
                                   NodeRevisionContent revisionContent) throws IOException {
          if (revisionDescriptor.getContentType().equals("text/xml")) {
              try {
                  final XMLContentIndexerHandler handler = new XMLContentIndexerHandler();
                  m_parser.parse(new ByteArrayInputStream(revisionContent.getContentBytes()), handler);
                  return new StringReader(handler.getText());
              } catch (SAXException e) {
                  getLogger().log("Error parsing xml content for indexer", LOG_CHANNEL, Logger.ERROR);
              }
          }
          return super.readContent(revisionDescriptor, revisionContent);
      }
      
      private static final class XMLContentIndexerHandler extends DefaultHandler {
  
          private final StringBuffer m_text = new StringBuffer();
  
          public void characters(char[] ch, int start, int length) throws SAXException {
              m_text.append(ch, start, length);
          } 
  
          public void endElement(String uri, String localName, String qName) throws SAXException {
              super.endElement(uri, localName, qName);
              m_text.append(' ');
          }
  
          public String getText() {
              return m_text.toString();
          }
  
      }
  
      
  }
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: slide-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: slide-dev-help@jakarta.apache.org