You are viewing a plain text version of this content. The canonical link for it is here.
Posted to slide-dev@jakarta.apache.org by un...@apache.org on 2004/08/03 17:01:47 UTC
cvs commit: jakarta-slide/src/stores/org/apache/slide/index XMLContentIndexer.java TextContentIndexer.java
unico 2004/08/03 08:01:47
Modified: src/stores/org/apache/slide/index TextContentIndexer.java
Added: src/stores/org/apache/slide/index XMLContentIndexer.java
Log:
- allow subclasses to override read content behavior
- add option to configure resource paths to be indexed
- add some javadoc
- add XMLContentIndexer that indexes only the character data of an xml document
Revision Changes Path
1.5 +64 -29 jakarta-slide/src/stores/org/apache/slide/index/TextContentIndexer.java
Index: TextContentIndexer.java
===================================================================
RCS file: /home/cvs/jakarta-slide/src/stores/org/apache/slide/index/TextContentIndexer.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -r1.4 -r1.5
--- TextContentIndexer.java 20 Jul 2004 07:38:21 -0000 1.4
+++ TextContentIndexer.java 3 Aug 2004 15:01:46 -0000 1.5
@@ -45,21 +45,27 @@
import java.io.CharArrayReader;
import java.io.ByteArrayInputStream;
import java.io.Reader;
+import java.util.ArrayList;
+import java.util.Collection;
import java.util.Hashtable;
+import java.util.Iterator;
+import java.util.StringTokenizer;
/**
- * Author: Ryan Rhodes
- * Date: Jun 24, 2004
- * Time: 10:34:45 PM
+ * Lucene based IndexStore for indexing content.
+ * Apart from indexing the content as text field it adds
+ * indexes using the registered content extractor.
*/
public class TextContentIndexer extends XAServiceBase implements IndexStore {
private static final String INDEX_PATH = "indexpath";
-
+ private static final String INCLUDES = "includes";
+
public static final String URI_FIELD = "uri";
public static final String CONTENT_TEXT = "content";
private String indexpath = "";
+ private Collection includes;
private boolean started = false;
/**
@@ -83,15 +89,15 @@
{
try
{
- // create index
+ // create index
indexWriter = new IndexWriter(indexpath, new StandardAnalyzer(), true);
}
catch (IOException ex)
{
getLogger().log("Error while initializing the Lucene index " + e.getMessage(), LOG_CHANNEL, Logger.ERROR);
- throw new ServiceInitializationFailedException(this, ex);
- }
- }
+ throw new ServiceInitializationFailedException(this, ex);
+ }
+ }
try
{
@@ -117,7 +123,8 @@
NodeRevisionContent revisionContent)
throws IndexException
{
- IndexWriter indexWriter = null;
+ if (!isIncluded(uri.toString())) return;
+ IndexWriter indexWriter = null;
try
{
indexWriter = new IndexWriter(indexpath, new StandardAnalyzer(), false);
@@ -126,8 +133,7 @@
Document doc = new Document();
doc.add(Field.Keyword(URI_FIELD, uri.toString()));
- doc.add(Field.Text(CONTENT_TEXT,
- new CharArrayReader (revisionContent.getContent())));
+ doc.add(Field.Text(CONTENT_TEXT, readContent(revisionDescriptor, revisionContent)));
if ( revisionContent != null && revisionDescriptor != null ) {
ContentExtractor[] extractor = ExtractorManager.getInstance().getContentExtractors(null, revisionDescriptor);
@@ -158,8 +164,8 @@
"Error extracting content from " + uri.toString() + " - " + revisionDescriptor.getRevisionNumber(),
LOG_CHANNEL,
Logger.ERROR);
- }
- finally
+ }
+ finally
{
try
{
@@ -185,6 +191,7 @@
NodeRevisionContent revisionContent)
throws IndexException
{
+ if (!isIncluded(uri.toString())) return;
IndexWriter indexWriter = null;
try
{
@@ -201,8 +208,7 @@
Document doc = new Document();
doc.add(Field.Keyword(URI_FIELD, uri.toString()));
- doc.add(Field.Text(CONTENT_TEXT,
- new CharArrayReader (revisionContent.getContent())));
+ doc.add(Field.Text(CONTENT_TEXT, readContent(revisionDescriptor, revisionContent)));
if ( revisionContent != null && revisionDescriptor != null ) {
ContentExtractor[] extractor = ExtractorManager.getInstance().getContentExtractors(null, revisionDescriptor);
@@ -256,6 +262,7 @@
synchronized public void dropIndex(Uri uri, NodeRevisionNumber number)
throws IndexException
{
+ if (!isIncluded(uri.toString())) return;
if (number == NodeRevisionNumber.HIDDEN_0_0) return;
IndexWriter indexWriter = null;
@@ -282,14 +289,14 @@
getLogger().log("Impossible to delete " + uri + " - " + number + " from the Lucene index");
}
finally
- {
- try
- {
- if(indexWriter != null)
- indexWriter.close();
- }
- catch(IOException ioe ) {}
- }
+ {
+ try
+ {
+ if(indexWriter != null)
+ indexWriter.close();
+ }
+ catch(IOException ioe ) {}
+ }
}
@@ -331,8 +338,12 @@
}
/**
- * Parametrize the service. This index store expects one parameter
+ * Parametrize the service. This index store expects a parameter
* "indexpath" to contain the path to the directory to store the index.
+ * Another optional parameter "includes" lists the paths of resources
+ * that are to be indexed in a comma-separated format.
+ * Everything under an included path is indexed. If not specified all
+ * resources will be indexed.
*
* @param parameters Hashtable containing the parameters' names
* and associated values
@@ -342,8 +353,17 @@
public void setParameters (Hashtable parameters) throws ServiceParameterErrorException, ServiceParameterMissingException
{
indexpath = (String)parameters.get (INDEX_PATH);
- if (indexpath == null || indexpath.length() == 0)
- throw new ServiceParameterMissingException (this, INDEX_PATH);
+ if (indexpath == null || indexpath.length() == 0) {
+ throw new ServiceParameterMissingException (this, INDEX_PATH);
+ }
+ String includes = (String) parameters.get(INCLUDES);
+ if (includes != null && includes.length() > 0) {
+ StringTokenizer tokenizer = new StringTokenizer(includes, ",");
+ this.includes = new ArrayList(tokenizer.countTokens());
+ while (tokenizer.hasMoreTokens()) {
+ this.includes.add(tokenizer.nextToken());
+ }
+ }
}
/**
@@ -373,4 +393,19 @@
Logger.INFO);
}
+ protected Reader readContent(NodeRevisionDescriptor revisionDescriptor,
+ NodeRevisionContent revisionContent) throws IOException {
+ return new CharArrayReader (revisionContent.getContent());
+ }
+
+ protected boolean isIncluded(String uri) {
+ if (includes == null) return true;
+ Iterator iter = includes.iterator();
+ while (iter.hasNext()) {
+ if (uri.startsWith((String) iter.next())) {
+ return true;
+ }
+ }
+ return false;
+ }
}
1.1 jakarta-slide/src/stores/org/apache/slide/index/XMLContentIndexer.java
Index: XMLContentIndexer.java
===================================================================
/*
* $Header: /home/cvs/jakarta-slide/src/stores/org/apache/slide/index/XMLContentIndexer.java,v 1.1 2004/08/03 15:01:46 unico Exp $
* $Revision: 1.1 $
* $Date: 2004/08/03 15:01:46 $
*
* ====================================================================
*
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.slide.index;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.slide.common.NamespaceAccessToken;
import org.apache.slide.common.ServiceInitializationFailedException;
import org.apache.slide.content.NodeRevisionContent;
import org.apache.slide.content.NodeRevisionDescriptor;
import org.apache.slide.util.logger.Logger;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
* Extends TextContentIndexer for handling XML content
* by only indexing the actual character data.
*/
public class XMLContentIndexer extends TextContentIndexer {
private SAXParser m_parser;
public void initialize(NamespaceAccessToken token) throws ServiceInitializationFailedException {
super.initialize(token);
try {
m_parser = SAXParserFactory.newInstance().newSAXParser();
}
catch (ParserConfigurationException e) {
getLogger().log("Error creating parser for indexer", LOG_CHANNEL, Logger.ERROR);
throw new ServiceInitializationFailedException(this, e);
}
catch (SAXException e) {
getLogger().log("Error creating parser for indexer", LOG_CHANNEL, Logger.ERROR);
throw new ServiceInitializationFailedException(this, e);
}
}
protected synchronized Reader readContent(NodeRevisionDescriptor revisionDescriptor,
NodeRevisionContent revisionContent) throws IOException {
if (revisionDescriptor.getContentType().equals("text/xml")) {
try {
final XMLContentIndexerHandler handler = new XMLContentIndexerHandler();
m_parser.parse(new ByteArrayInputStream(revisionContent.getContentBytes()), handler);
return new StringReader(handler.getText());
} catch (SAXException e) {
getLogger().log("Error parsing xml content for indexer", LOG_CHANNEL, Logger.ERROR);
}
}
return super.readContent(revisionDescriptor, revisionContent);
}
private static final class XMLContentIndexerHandler extends DefaultHandler {
private final StringBuffer m_text = new StringBuffer();
public void characters(char[] ch, int start, int length) throws SAXException {
m_text.append(ch, start, length);
}
public void endElement(String uri, String localName, String qName) throws SAXException {
super.endElement(uri, localName, qName);
m_text.append(' ');
}
public String getText() {
return m_text.toString();
}
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: slide-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: slide-dev-help@jakarta.apache.org