You are viewing a plain text version of this content. The canonical link for it is here.
Posted to slide-dev@jakarta.apache.org by oz...@apache.org on 2004/06/29 10:10:58 UTC
cvs commit: jakarta-slide/src/conf/webapp Extractor-Domain.xml
ozeigermann 2004/06/29 01:10:58
Added: src/share/org/apache/slide/extractor MSWordExtractor.java
MSExcelExtractor.java MSPowerPointExtractor.java
PDFExtractor.java
lib tm-extractors-0.4.jar PDFBox-0.6.5.jar
src/stores/org/apache/slide/index TextContentIndexer.java
TextContainsExpression.java
TextContainsExpressionFactory.java
src/conf/webapp Extractor-Domain.xml
Log:
Added extractor classes donated by Ryan Rhodes as described in
http://issues.apache.org/bugzilla/show_bug.cgi?id=29842
Revision Changes Path
1.1 jakarta-slide/src/share/org/apache/slide/extractor/MSWordExtractor.java
Index: MSWordExtractor.java
===================================================================
/*
* $Header: /home/cvs/jakarta-slide/src/share/org/apache/slide/extractor/MSWordExtractor.java,v 1.1 2004/06/29 08:10:57 ozeigermann Exp $
* $Revision: 1.1 $
* $Date: 2004/06/29 08:10:57 $
*
* ====================================================================
*
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.slide.extractor;
/**
* Author: Ryan Rhodes
* Date: Jun 26, 2004
* Time: 12:34:29 AM
*/
import java.io.*;
import org.textmining.text.extraction.WordExtractor;
/**
 * Content extractor for Microsoft Word documents. The actual text
 * extraction is delegated to {@link WordExtractor}.
 */
public class MSWordExtractor extends AbstractContentExtractor {

    /**
     * Creates an extractor. Both arguments are handed unchanged to the
     * {@link AbstractContentExtractor} constructor.
     *
     * @param uri         forwarded to the superclass
     * @param contentType forwarded to the superclass
     */
    public MSWordExtractor(String uri, String contentType) {
        super(uri, contentType);
    }

    /**
     * Extracts the text of a Word document.
     *
     * @param content stream delivering the Word document
     * @return a reader over the text returned by the Word extractor
     * @throws ExtractorException if the extraction fails for any reason; the
     *         exception carries only the message of the underlying failure
     */
    public Reader extract(InputStream content) throws ExtractorException {
        try {
            WordExtractor extractor = new WordExtractor();
            String text = extractor.extractText(content);
            return new StringReader(text);
        }
        catch (Exception e) {
            throw new ExtractorException(e.getMessage());
        }
    }

    /**
     * Command line helper: prints the text extracted from the file named by
     * the first argument to standard output.
     *
     * @param args args[0] is the path of the Word file to read
     * @throws Exception if the file cannot be read or extraction fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println("Usage: MSWordExtractor <file>");
            return;
        }
        FileInputStream in = new FileInputStream(args[0]);
        try {
            MSWordExtractor ex = new MSWordExtractor(null, null);
            Reader reader = ex.extract(in);
            // Check for end-of-stream BEFORE printing, so that the -1 marker
            // is never written out as a character.
            int c;
            while ((c = reader.read()) != -1) {
                System.out.print((char) c);
            }
        }
        finally {
            in.close();
        }
    }
}
1.1 jakarta-slide/src/share/org/apache/slide/extractor/MSExcelExtractor.java
Index: MSExcelExtractor.java
===================================================================
/*
* $Header: /home/cvs/jakarta-slide/src/share/org/apache/slide/extractor/MSExcelExtractor.java,v 1.1 2004/06/29 08:10:57 ozeigermann Exp $
* $Revision: 1.1 $
* $Date: 2004/06/29 08:10:57 $
*
* ====================================================================
*
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.slide.extractor;
/**
* Author: Ryan Rhodes
* Date: Jun 26, 2004
* Time: 1:53:31 AM
*/
import java.io.*;
import java.util.Iterator;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFCell;
/**
 * Content extractor for Microsoft Excel workbooks, based on the POI HSSF
 * model. The values of all numeric and string cells are concatenated into
 * one text, each followed by a single space.
 */
public class MSExcelExtractor extends AbstractContentExtractor
{
    /**
     * Creates an extractor. Both arguments are handed unchanged to the
     * {@link AbstractContentExtractor} constructor.
     *
     * @param uri         forwarded to the superclass
     * @param contentType forwarded to the superclass
     */
    public MSExcelExtractor(String uri, String contentType) {
        super(uri, contentType);
    }

    /**
     * Extracts the text of an Excel workbook.
     *
     * Every sheet, row and cell is visited in iterator order. Numeric cells
     * contribute {@link Double#toString(double)} of their value, string
     * cells contribute their trimmed, non-empty value; cells of any other
     * type are skipped.
     *
     * @param content stream delivering the workbook
     * @return a reader over the collected cell values
     * @throws ExtractorException if reading the workbook fails for any
     *         reason; the exception carries only the message of the
     *         underlying failure
     */
    public Reader extract(InputStream content) throws ExtractorException
    {
        try {
            CharArrayWriter writer = new CharArrayWriter();
            POIFSFileSystem fs = new POIFSFileSystem(content);
            HSSFWorkbook workbook = new HSSFWorkbook(fs);

            for (int i = 0; i < workbook.getNumberOfSheets(); i++) {
                HSSFSheet sheet = workbook.getSheetAt(i);

                Iterator rows = sheet.rowIterator();
                while (rows.hasNext()) {
                    HSSFRow row = (HSSFRow) rows.next();

                    Iterator cells = row.cellIterator();
                    while (cells.hasNext()) {
                        HSSFCell cell = (HSSFCell) cells.next();
                        switch (cell.getCellType()) {
                            case HSSFCell.CELL_TYPE_NUMERIC:
                                String num = Double.toString(cell.getNumericCellValue()).trim();
                                if (num.length() > 0) {
                                    writer.write(num + " ");
                                }
                                break;
                            case HSSFCell.CELL_TYPE_STRING:
                                String text = cell.getStringCellValue().trim();
                                if (text.length() > 0) {
                                    writer.write(text + " ");
                                }
                                break;
                            default:
                                // other cell types carry no text to index
                                break;
                        }
                    }
                }
            }

            return new CharArrayReader(writer.toCharArray());
        }
        catch (Exception e) {
            throw new ExtractorException(e.getMessage());
        }
    }

    /**
     * Command line helper: prints the text extracted from the file named by
     * the first argument to standard output.
     *
     * @param args args[0] is the path of the Excel file to read
     * @throws Exception if the file cannot be read or extraction fails
     */
    public static void main(String[] args) throws Exception
    {
        if (args.length < 1) {
            System.err.println("Usage: MSExcelExtractor <file>");
            return;
        }
        FileInputStream in = new FileInputStream(args[0]);
        try {
            MSExcelExtractor ex = new MSExcelExtractor(null, null);
            Reader reader = ex.extract(in);
            // Check for end-of-stream BEFORE printing, so that the -1 marker
            // is never written out as a character.
            int c;
            while ((c = reader.read()) != -1) {
                System.out.print((char) c);
            }
        }
        finally {
            in.close();
        }
    }
}
1.1 jakarta-slide/src/share/org/apache/slide/extractor/MSPowerPointExtractor.java
Index: MSPowerPointExtractor.java
===================================================================
/*
* $Header: /home/cvs/jakarta-slide/src/share/org/apache/slide/extractor/MSPowerPointExtractor.java,v 1.1 2004/06/29 08:10:57 ozeigermann Exp $
* $Revision: 1.1 $
* $Date: 2004/06/29 08:10:57 $
*
* ====================================================================
*
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.slide.extractor;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import java.io.*;
/**
* Author: Ryan Rhodes
* Date: Jun 27, 2004
* Time: 3:45:39 AM
*/
/**
 * Content extractor for Microsoft PowerPoint files. The file is read with a
 * {@link POIFSReader}; this class registers itself as listener and collects
 * bytes from the "PowerPoint Document" stream into an internal buffer.
 *
 * The collected bytes are kept in an instance field, so one instance must
 * not run several extractions at the same time.
 */
public class MSPowerPointExtractor extends AbstractContentExtractor implements POIFSReaderListener
{
    /** Bytes collected by {@link #processPOIFSReaderEvent} for the current extraction. */
    private ByteArrayOutputStream writer = new ByteArrayOutputStream();

    /**
     * Creates an extractor. Both arguments are handed unchanged to the
     * {@link AbstractContentExtractor} constructor.
     *
     * @param uri         forwarded to the superclass
     * @param contentType forwarded to the superclass
     */
    public MSPowerPointExtractor(String uri, String contentType) {
        super(uri, contentType);
    }

    /**
     * Extracts the text of a PowerPoint file.
     *
     * @param content stream delivering the PowerPoint file
     * @return a reader decoding the collected bytes with the platform
     *         default charset
     * @throws ExtractorException if reading the file fails; the exception
     *         carries only the message of the underlying failure
     */
    public Reader extract(InputStream content) throws ExtractorException {
        try {
            // Start from an empty buffer: without this, a second call on the
            // same instance would also return the text of earlier documents.
            writer.reset();

            POIFSReader reader = new POIFSReader();
            reader.registerListener(this);
            reader.read(content);
            return new InputStreamReader(new ByteArrayInputStream(writer.toByteArray()));
        }
        catch (Exception e) {
            throw new ExtractorException(e.getMessage());
        }
    }

    /**
     * Listener callback. Events for any stream other than
     * "PowerPoint Document" (compared case-insensitively) are ignored.
     *
     * The stream is read into a byte array which is then scanned offset by
     * offset: the unsigned 16 bit little-endian value at {@code i+2} is
     * taken as record type and the unsigned 32 bit little-endian value at
     * {@code i+4} as record size. For type 4008 {@code size} bytes starting
     * at {@code i+5} are appended to the buffer and the scan continues
     * behind them.
     *
     * NOTE(review): 4008 presumably identifies a text record; the copy
     * offset {@code i+5} and length {@code size} should be confirmed
     * against the PowerPoint record layout.
     *
     * @param event the event describing the stream that was read
     */
    public void processPOIFSReaderEvent(POIFSReaderEvent event)
    {
        try {
            if (!event.getName().equalsIgnoreCase("PowerPoint Document")) {
                return;
            }

            DocumentInputStream input = event.getStream();
            byte[] buffer = new byte[input.available()];
            input.read(buffer, 0, input.available());

            for (int i = 0; i < buffer.length - 20; i++) {
                long type = LittleEndian.getUShort(buffer, i + 2);
                long size = LittleEndian.getUInt(buffer, i + 4);
                if (type == 4008) {
                    writer.write(buffer, i + 4 + 1, (int) size);
                    i = i + 4 + 1 + (int) size - 1;
                }
            }
        }
        catch (Exception ignored) {
            // Deliberately best effort: this callback cannot throw a checked
            // exception, so a failure (e.g. a record size pointing past the
            // end of the buffer) ends the scan and whatever was collected so
            // far is returned by extract().
        }
    }

    /**
     * Command line helper: prints the text extracted from the file named by
     * the first argument to standard output.
     *
     * @param args args[0] is the path of the PowerPoint file to read
     * @throws Exception if the file cannot be read or extraction fails
     */
    public static void main(String[] args) throws Exception
    {
        if (args.length < 1) {
            System.err.println("Usage: MSPowerPointExtractor <file>");
            return;
        }
        FileInputStream in = new FileInputStream(args[0]);
        try {
            MSPowerPointExtractor ex = new MSPowerPointExtractor(null, null);
            Reader reader = ex.extract(in);
            // Check for end-of-stream BEFORE printing, so that the -1 marker
            // is never written out as a character.
            int c;
            while ((c = reader.read()) != -1) {
                System.out.print((char) c);
            }
        }
        finally {
            in.close();
        }
    }
}
1.1 jakarta-slide/src/share/org/apache/slide/extractor/PDFExtractor.java
Index: PDFExtractor.java
===================================================================
/*
* $Header: /home/cvs/jakarta-slide/src/share/org/apache/slide/extractor/PDFExtractor.java,v 1.1 2004/06/29 08:10:57 ozeigermann Exp $
* $Revision: 1.1 $
* $Date: 2004/06/29 08:10:57 $
*
* ====================================================================
*
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.slide.extractor;
import org.pdfbox.util.PDFTextStripper;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import java.io.*;
/**
* Author: Ryan Rhodes
* Date: Jun 26, 2004
* Time: 4:03:00 AM
*/
/**
 * Content extractor for PDF documents, based on PDFBox. The document is
 * parsed with a {@link PDFParser} and its text is written out by a
 * {@link PDFTextStripper}.
 */
public class PDFExtractor extends AbstractContentExtractor
{
    /**
     * Creates an extractor. Both arguments are handed unchanged to the
     * {@link AbstractContentExtractor} constructor.
     *
     * @param uri         forwarded to the superclass
     * @param contentType forwarded to the superclass
     */
    public PDFExtractor(String uri, String contentType)
    {
        super(uri, contentType);
    }

    /**
     * Extracts the text of a PDF document, using "\n" as line separator.
     *
     * @param content stream delivering the PDF document
     * @return a reader over the stripped text
     * @throws ExtractorException if parsing, stripping or closing the
     *         document fails; the exception carries only the message of
     *         the underlying failure
     */
    public Reader extract(InputStream content) throws ExtractorException
    {
        try {
            PDFParser parser = new PDFParser(content);
            parser.parse();
            PDDocument document = parser.getPDDocument();
            try {
                CharArrayWriter writer = new CharArrayWriter();
                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");
                stripper.writeText(document, writer);
                writer.close();
                return new CharArrayReader(writer.toCharArray());
            }
            finally {
                // Close the document even when stripping fails, so that its
                // resources are not leaked on the error path.
                document.close();
            }
        }
        catch (Exception e) {
            throw new ExtractorException(e.getMessage());
        }
    }

    /**
     * Command line helper: prints the text extracted from the file named by
     * the first argument to standard output.
     *
     * @param args args[0] is the path of the PDF file to read
     * @throws Exception if the file cannot be read or extraction fails
     */
    public static void main(String[] args) throws Exception
    {
        if (args.length < 1) {
            System.err.println("Usage: PDFExtractor <file>");
            return;
        }
        FileInputStream in = new FileInputStream(args[0]);
        try {
            PDFExtractor ex = new PDFExtractor(null, null);
            Reader reader = ex.extract(in);
            // Check for end-of-stream BEFORE printing, so that the -1 marker
            // is never written out as a character.
            int c;
            while ((c = reader.read()) != -1) {
                System.out.print((char) c);
            }
        }
        finally {
            in.close();
        }
    }
}
1.1 jakarta-slide/lib/tm-extractors-0.4.jar
<<Binary file>>
1.1 jakarta-slide/lib/PDFBox-0.6.5.jar
<<Binary file>>
1.1 jakarta-slide/src/stores/org/apache/slide/index/TextContentIndexer.java
Index: TextContentIndexer.java
===================================================================
/*
* $Header: /home/cvs/jakarta-slide/src/stores/org/apache/slide/index/TextContentIndexer.java,v 1.1 2004/06/29 08:10:58 ozeigermann Exp $
* $Revision: 1.1 $
* $Date: 2004/06/29 08:10:58 $
*
* ====================================================================
*
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.slide.index;
import org.apache.slide.search.IndexException;
import org.apache.slide.search.basic.IBasicExpressionFactory;
import org.apache.slide.util.logger.Logger;
import org.apache.slide.common.*;
import org.apache.slide.content.NodeRevisionNumber;
import org.apache.slide.content.NodeRevisionDescriptor;
import org.apache.slide.content.NodeRevisionContent;
import org.apache.slide.store.IndexStore;
import org.apache.slide.extractor.ExtractorManager;
import org.apache.slide.extractor.ExtractorException;
import org.apache.slide.extractor.ContentExtractor;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.io.IOException;
import java.io.CharArrayReader;
import java.io.ByteArrayInputStream;
import java.io.Reader;
import java.util.Hashtable;
/**
* Author: Ryan Rhodes
* Date: Jun 24, 2004
* Time: 10:34:45 PM
*/
/**
 * {@link IndexStore} implementation that keeps a Lucene full text index of
 * resource content. Each indexed resource becomes one Lucene document with
 * the resource URI in {@link #URI_FIELD} and the searchable text in
 * {@link #CONTENT_TEXT}. The text consists of the raw content plus the
 * output of every content extractor that the {@link ExtractorManager}
 * returns for the revision descriptor.
 *
 * The location of the index is configured with the "indexpath" service
 * parameter, see {@link #setParameters(Hashtable)}.
 */
public class TextContentIndexer extends AbstractService implements IndexStore {

    /** Name of the service parameter that holds the index location. */
    private static final String INDEX_PATH = "indexpath";

    /** Name of the Lucene field holding the URI of the indexed resource. */
    public static final String URI_FIELD = "uri";

    /** Name of the Lucene field holding the searchable text. */
    public static final String CONTENT_TEXT = "content";

    /** Location of the Lucene index, set by {@link #setParameters(Hashtable)}. */
    private String indexpath = "";

    /** Set by {@link #connect()}, cleared by {@link #disconnect()}. */
    private boolean started = false;

    /**
     * Makes sure a Lucene index exists at the configured location: an
     * existing index is opened and closed again, a missing one is created
     * empty.
     *
     * An index is only created when none exists. A failure to open an
     * existing index is reported instead of overwriting that index.
     *
     * @param token a NamespaceAccessToken (not used)
     *
     * @throws ServiceInitializationFailedException if the index can neither
     *         be opened nor created
     */
    public void initialize(NamespaceAccessToken token)
        throws ServiceInitializationFailedException
    {
        try {
            boolean create = !IndexReader.indexExists(indexpath);
            IndexWriter indexWriter = new IndexWriter(indexpath, new StandardAnalyzer(), create);
            indexWriter.close();
        }
        catch (IOException e) {
            Domain.log("Error while initializing the Lucene index " + e.getMessage(), LOG_CHANNEL, Logger.ERROR);
            throw new ServiceInitializationFailedException(this, e);
        }

        Domain.log("Lucene is correctly initialized", LOG_CHANNEL, Logger.INFO);
    }

    /**
     * Adds a document for the given resource to the index and optimizes the
     * index afterwards.
     *
     * I/O and extraction failures are logged and not propagated.
     *
     * @param uri                the URI of the resource
     * @param revisionDescriptor descriptor used to look up content
     *                           extractors; may be null, in which case no
     *                           extractor is run
     * @param revisionContent    the content to index; may be null, in which
     *                           case only the URI is stored
     * @exception IndexException declared by the interface; not thrown by
     *            this implementation
     */
    synchronized public void createIndex(Uri uri,
                                         NodeRevisionDescriptor revisionDescriptor,
                                         NodeRevisionContent revisionContent)
        throws IndexException
    {
        IndexWriter indexWriter = null;
        try {
            indexWriter = new IndexWriter(indexpath, new StandardAnalyzer(), false);
            indexWriter.addDocument(buildDocument(uri, revisionDescriptor, revisionContent));
            indexWriter.optimize();

            Domain.log(
                "Added '" + describe(uri, revisionDescriptor) + "' to index",
                LOG_CHANNEL,
                Logger.INFO);
        }
        catch (IOException e) {
            Domain.log(
                "Error creating an index with " + describe(uri, revisionDescriptor) + ": " + e.getMessage(),
                LOG_CHANNEL,
                Logger.ERROR);
        }
        catch (ExtractorException e) {
            Domain.log(
                "Error extracting content from " + describe(uri, revisionDescriptor) + ": " + e.getMessage(),
                LOG_CHANNEL,
                Logger.ERROR);
        }
        finally {
            closeWriter(indexWriter);
        }
    }

    /**
     * Replaces the index entry of the given resource: all documents stored
     * for the URI are deleted, then a new document is added and the index
     * is optimized.
     *
     * I/O and extraction failures are logged and not propagated.
     *
     * @param uri                the URI of the resource
     * @param revisionDescriptor descriptor used to look up content
     *                           extractors; may be null, in which case no
     *                           extractor is run
     * @param revisionContent    the content to index; may be null, in which
     *                           case only the URI is stored
     *
     * @throws IndexException declared by the interface; not thrown by this
     *         implementation
     */
    synchronized public void updateIndex(Uri uri,
                                         NodeRevisionDescriptor revisionDescriptor,
                                         NodeRevisionContent revisionContent)
        throws IndexException
    {
        IndexWriter indexWriter = null;
        try {
            // The reader must be closed again before the writer is opened.
            deleteDocuments(uri);

            indexWriter = new IndexWriter(indexpath, new StandardAnalyzer(), false);
            indexWriter.addDocument(buildDocument(uri, revisionDescriptor, revisionContent));
            indexWriter.optimize();

            Domain.log(
                "Updated '" + describe(uri, revisionDescriptor) + "' to index",
                LOG_CHANNEL,
                Logger.INFO);
        }
        catch (IOException e) {
            Domain.log(
                "Error updating the index with " + describe(uri, revisionDescriptor) + ": " + e.getMessage(),
                LOG_CHANNEL,
                Logger.ERROR);
        }
        catch (ExtractorException e) {
            Domain.log(
                "Error extracting content from " + describe(uri, revisionDescriptor) + ": " + e.getMessage(),
                LOG_CHANNEL,
                Logger.ERROR);
        }
        finally {
            closeWriter(indexWriter);
        }
    }

    /**
     * Removes all documents stored for the given URI from the index and
     * optimizes the index afterwards. The revision number is only used in
     * the error message; documents are selected by URI alone.
     *
     * I/O failures are logged and not propagated.
     *
     * @param uri    the URI of the resource
     * @param number the revision number, used for logging only
     * @exception IndexException declared by the interface; not thrown by
     *            this implementation
     */
    synchronized public void dropIndex(Uri uri, NodeRevisionNumber number)
        throws IndexException
    {
        IndexWriter indexWriter = null;
        try {
            deleteDocuments(uri);

            indexWriter = new IndexWriter(indexpath, new StandardAnalyzer(), false);
            indexWriter.optimize();

            Domain.log(
                "Deleted '" + uri.toString() + "' from the index",
                LOG_CHANNEL,
                Logger.INFO);
        }
        catch (IOException e) {
            Domain.log(
                "Impossible to delete " + uri.toString() + " - " + number + " from the Lucene index",
                LOG_CHANNEL,
                Logger.ERROR);
        }
        finally {
            closeWriter(indexWriter);
        }
    }

    /**
     * Builds the Lucene document for one resource.
     *
     * The URI is added as keyword field, i.e. stored and indexed as a
     * single term. It has to be indexed, because {@link #deleteDocuments}
     * selects documents by that term. Documents that were written with an
     * unindexed URI field cannot be found by it.
     */
    private Document buildDocument(Uri uri,
                                   NodeRevisionDescriptor revisionDescriptor,
                                   NodeRevisionContent revisionContent)
        throws ExtractorException
    {
        Document doc = new Document();
        doc.add(Field.Keyword(URI_FIELD, uri.toString()));

        if (revisionContent == null) {
            return doc;
        }

        doc.add(Field.Text(CONTENT_TEXT,
                           new CharArrayReader(revisionContent.getContent())));

        if (revisionDescriptor != null) {
            ContentExtractor[] extractor = ExtractorManager.getInstance().getContentExtractors(null, revisionDescriptor);
            for (int i = 0; i < extractor.length; i++) {
                Reader reader = extractor[i].extract(new ByteArrayInputStream(revisionContent.getContentBytes()));
                doc.add(Field.Text(CONTENT_TEXT, reader));
            }
        }
        return doc;
    }

    /** Deletes every document whose URI field equals the given URI; always closes the reader. */
    private void deleteDocuments(Uri uri) throws IOException
    {
        IndexReader indexReader = IndexReader.open(indexpath);
        try {
            indexReader.delete(new Term(URI_FIELD, uri.toString()));
        }
        finally {
            indexReader.close();
        }
    }

    /** Closes the writer if there is one; a failure is logged, never thrown. */
    private void closeWriter(IndexWriter indexWriter)
    {
        if (indexWriter == null) {
            return;
        }
        try {
            indexWriter.close();
        }
        catch (IOException e) {
            Domain.log("Error closing the Lucene index writer: " + e.getMessage(), LOG_CHANNEL, Logger.ERROR);
        }
    }

    /** Returns "uri - revision number" for log messages; tolerates a null descriptor. */
    private static String describe(Uri uri, NodeRevisionDescriptor revisionDescriptor)
    {
        Object revision = null;
        if (revisionDescriptor != null) {
            revision = revisionDescriptor.getRevisionNumber();
        }
        return uri + " - " + revision;
    }

    /**
     * Returns a new expression factory that searches the index at the
     * configured index path.
     *
     * @return an IBasicExpressionFactory
     */
    public IBasicExpressionFactory getBasicExpressionFactory()
    {
        return new TextContainsExpressionFactory(indexpath);
    }

    /**
     * Marks the service as connected. No connection is actually opened;
     * the index is opened per operation.
     *
     * @exception ServiceConnectionFailedException declared by the
     *            interface; not thrown by this implementation
     */
    public void connect() throws ServiceConnectionFailedException
    {
        Domain.log(
            "TextContentIndexer: connect",
            LOG_CHANNEL,
            Logger.INFO);
        started = true;
    }

    /**
     * Tells whether {@link #connect()} has been called without a
     * subsequent {@link #disconnect()}.
     *
     * @return boolean true if we are connected
     * @exception ServiceAccessException declared by the interface; not
     *            thrown by this implementation
     */
    public boolean isConnected() throws ServiceAccessException
    {
        return started;
    }

    /**
     * Configures the service. The only parameter read is "indexpath", the
     * location of the Lucene index.
     *
     * @param parameters Hashtable containing the parameters' names
     *                   and associated values
     * @exception ServiceParameterErrorException declared by the interface;
     *            not thrown by this implementation
     * @exception ServiceParameterMissingException if "indexpath" is missing
     *            or empty
     */
    public void setParameters(Hashtable parameters) throws ServiceParameterErrorException, ServiceParameterMissingException
    {
        indexpath = (String) parameters.get(INDEX_PATH);
        if (indexpath == null || indexpath.length() == 0) {
            throw new ServiceParameterMissingException(this, INDEX_PATH);
        }
    }

    /**
     * Marks the service as disconnected.
     *
     * @exception ServiceDisconnectionFailedException declared by the
     *            interface; not thrown by this implementation
     */
    public void disconnect() throws ServiceDisconnectionFailedException
    {
        Domain.log(
            "TextContentIndexer: disconnect",
            LOG_CHANNEL,
            Logger.INFO);
        started = false;
    }

    /**
     * Logs the call; the index itself is left untouched.
     *
     * @exception ServiceResetFailedException declared by the interface;
     *            not thrown by this implementation
     */
    public void reset() throws ServiceResetFailedException
    {
        Domain.log(
            "TextContentIndexer: reset",
            LOG_CHANNEL,
            Logger.INFO);
    }
}
1.1 jakarta-slide/src/stores/org/apache/slide/index/TextContainsExpression.java
Index: TextContainsExpression.java
===================================================================
/*
* $Header: /home/cvs/jakarta-slide/src/stores/org/apache/slide/index/TextContainsExpression.java,v 1.1 2004/06/29 08:10:58 ozeigermann Exp $
* $Revision: 1.1 $
* $Date: 2004/06/29 08:10:58 $
*
* ====================================================================
*
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.slide.index;
import org.apache.slide.search.basic.*;
import org.apache.slide.search.BadQueryException;
import org.apache.slide.search.SearchException;
import org.apache.slide.search.RequestedResource;
import org.apache.slide.structure.ObjectNode;
import org.apache.slide.structure.SubjectNode;
import org.apache.slide.common.SlideException;
import org.apache.slide.common.Domain;
import org.apache.slide.util.logger.Logger;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Hits;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.document.Document;
import java.util.Collection;
/**
* Author: Ryan Rhodes
* Date: Jun 24, 2004
* Time: 11:45:30 PM
*/
/**
 * Search expression that looks up a text in the Lucene index written by
 * {@link TextContentIndexer}.
 *
 * The factory has to be set with {@link #setFactory} before
 * {@link #execute()} is called, because the result resources are built
 * from the factory's query and property provider.
 */
public class TextContainsExpression implements IBasicExpression {

    protected static final String LOG_CHANNEL = TextContainsExpression.class.getName();

    /** The query text, parsed with the Lucene query parser on execution. */
    String searchedText;

    /** Location of the Lucene index to search. */
    String indexPath;

    /** backptr to the factory */
    IBasicExpressionFactory factory;

    /**
     * Creates an expression that searches the given text.
     *
     * @param searchedText the text to search for
     * @param rootPath     location of the Lucene index
     */
    TextContainsExpression (String searchedText, String rootPath)
    {
        this.searchedText = searchedText;
        this.indexPath = rootPath;
    }

    /**
     * Constructor for a merge expression.
     *
     * Merging is not implemented: all arguments are ignored and the
     * expression is left without search text, index path and factory.
     *
     * @param mergeOperator ignored
     * @param children      ignored
     * @param factory       ignored
     * @throws BadQueryException declared, but not thrown
     */
    TextContainsExpression (String mergeOperator,
                            Collection children,
                            IBasicExpressionFactory factory)
        throws BadQueryException
    {
        // intentionally empty, see above
    }

    /**
     * Search the index for this expression using Lucene. The searched text
     * is parsed as a Lucene query against the content field; for every hit
     * a resource is created from the URI stored with the hit.
     *
     * @return an IBasicResultSet with one resource per hit
     *
     * @throws org.apache.slide.search.SearchException wrapping any failure
     *         while searching or while building the result
     */
    public IBasicResultSet execute() throws SearchException
    {
        IBasicResultSet result = new BasicResultSetImpl (false);

        try {
            Searcher searcher = new IndexSearcher(indexPath);
            try {
                Analyzer analyzer = new StandardAnalyzer();
                Query query = QueryParser.parse(searchedText, TextContentIndexer.CONTENT_TEXT, analyzer);
                Hits hits = searcher.search (query);

                int noOfHits = hits.length();
                for (int i = 0; i < noOfHits; i++) {
                    Document doc = hits.doc(i);
                    String uri = doc.get(TextContentIndexer.URI_FIELD);
                    result.add (createResource(uri));
                }
            }
            finally {
                // All hits have been read at this point; release the index
                // files held by the searcher, also on the error path.
                searcher.close();
            }
        }
        catch (Exception e) {
            throw new SearchException (e);
        }

        Domain.log(
            "Executed Search for '" + searchedText + "' in the index",
            LOG_CHANNEL,
            Logger.INFO);

        return result;
    }

    /**
     * Builds the result resource for one hit from a new SubjectNode with
     * the given URI and the factory's query and property provider.
     *
     * @param uri the URI stored with the hit
     * @return the resource to add to the result set
     * @throws SearchException wrapping a SlideException raised while the
     *         resource is created
     */
    private RequestedResource createResource(String uri) throws SearchException
    {
        ObjectNode node = new SubjectNode(uri);
        IBasicQuery query = factory.getQuery();
        try {
            return new ComparableResourceImpl
                (node, query.getSearchToken(), query.getScope(),
                 factory.getPropertyProvider());
        }
        catch (SlideException e) {
            throw new SearchException (e);
        }
    }

    /**
     * Sets the factory this expression belongs to.
     *
     * @param factory the factory
     */
    public void setFactory (IBasicExpressionFactory factory)
    {
        this.factory = factory;
    }

    /**
     * Returns the factory this expression belongs to.
     *
     * @return the factory, null if none has been set
     */
    public IBasicExpressionFactory getFactory()
    {
        return this.factory;
    }
}
1.1 jakarta-slide/src/stores/org/apache/slide/index/TextContainsExpressionFactory.java
Index: TextContainsExpressionFactory.java
===================================================================
/*
* $Header: /home/cvs/jakarta-slide/src/stores/org/apache/slide/index/TextContainsExpressionFactory.java,v 1.1 2004/06/29 08:10:58 ozeigermann Exp $
* $Revision: 1.1 $
* $Date: 2004/06/29 08:10:58 $
*
* ====================================================================
*
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.slide.index;
import org.apache.slide.search.basic.IBasicExpressionFactory;
import org.apache.slide.search.basic.IBasicQuery;
import org.apache.slide.search.basic.IBasicExpression;
import org.apache.slide.search.PropertyProvider;
import org.apache.slide.search.BadQueryException;
import org.apache.slide.content.NodeProperty;
import org.jdom.Element;
import java.util.Collection;
/**
* Author: Ryan Rhodes
* Date: Jun 24, 2004
* Time: 11:42:35 PM
*/
/**
 * Expression factory of the {@link TextContentIndexer}. The only
 * expression it creates is {@link TextContainsExpression}, for the
 * "contains" element in the default (DAV:) namespace.
 */
public class TextContainsExpressionFactory implements IBasicExpressionFactory {

    /** The query handed to {@link #init}. */
    private IBasicQuery query;

    /** The property provider handed to {@link #init}. */
    protected PropertyProvider propertyProvider;

    /** Location of the Lucene index, passed on to created expressions. */
    private String rootPath;

    /**
     * Constructor
     *
     * @param rootPath location of the Lucene index
     */
    public TextContainsExpressionFactory (String rootPath)
    {
        this.rootPath = rootPath;
    }

    /**
     * called for merge expressions (or, and). Not defined here
     *
     * @param mergeOperator      and, or
     * @param namespace          the namespace of this expression
     * @param expressionsToMerge all expressions, that shall be merged
     *
     * @return always null
     *
     * @throws BadQueryException declared, but not thrown
     */
    public IBasicExpression createMergeExpression (String mergeOperator,
                                                   String namespace,
                                                   Collection expressionsToMerge)
        throws BadQueryException
    {
        return null;
    }

    /**
     * Called by the expression compiler for each leaf expression.
     *
     * @param element an Element describing the expression
     *
     * @return the expression with this factory set on it, or null when the
     *         element is not a "contains" element in the default
     *         namespace. NOTE(review): presumably the caller treats null
     *         as "not handled by this factory" -- confirm against the
     *         expression compiler.
     *
     * @throws BadQueryException if element is null
     */
    public IBasicExpression createExpression (Element element)
        throws BadQueryException
    {
        if (element == null) {
            throw new BadQueryException ("expected a where criteria");
        }

        TextContainsExpression result = null;
        String namespace = element.getNamespace().getURI();
        if (namespace.equals (NodeProperty.NamespaceCache.DEFAULT_URI)) {
            result = createDAVExpression (element);
        }
        // allow store specific extensions
        //  else if (namespace.equals (MyNamespace))
        //      result = createMyExpression (element);

        // No expression is created for unsupported elements; do not
        // dereference the missing result in that case.
        if (result != null) {
            result.setFactory(this);
        }
        return result;
    }

    /**
     * Called, when the expression is in the default (DAV:) namespace.
     *
     * @param e an Element
     *
     * @return an expression searching the trimmed text of the element if
     *         the element is named "contains", null otherwise
     */
    private TextContainsExpression createDAVExpression (Element e)
    {
        if (!e.getName().equals ("contains")) {
            return null;
        }
        return new TextContainsExpression (e.getTextTrim(), rootPath);
    }

    /**
     * called by BasicExpressionCompiler after construction.
     *
     * @param query            the associated BasicQuery
     * @param propertyProvider the PropertyProvider for this expression.
     *
     * @throws BadQueryException declared, but not thrown
     */
    public void init(IBasicQuery query, PropertyProvider propertyProvider)
        throws BadQueryException
    {
        this.query = query;
        this.propertyProvider = propertyProvider;
    }

    /**
     * Method getPropertyProvider
     *
     * @return the PropertyProvider handed to {@link #init}
     */
    public PropertyProvider getPropertyProvider()
    {
        return propertyProvider;
    }

    /**
     * Method getQuery
     *
     * @return the IBasicQuery handed to {@link #init}
     */
    public IBasicQuery getQuery()
    {
        return query;
    }
}
1.1 jakarta-slide/src/conf/webapp/Extractor-Domain.xml
Index: Extractor-Domain.xml
===================================================================
<?xml version="1.0"?>
<slide logger-level="6" default="slide">
<namespace name="slide">
<definition>
<store name="tx">
<parameter name="tlock-timeout">20</parameter>
<nodestore classname="org.apache.slide.store.txfile.TxXMLFileDescriptorsStore">
<parameter name="rootpath">store/metadata</parameter>
<parameter name="workpath">work/metadata</parameter>
<parameter name="defer-saving">true</parameter>
</nodestore>
<sequencestore classname="org.apache.slide.store.txfile.FileSequenceStore">
<parameter name="rootpath">store/sequence</parameter>
</sequencestore>
<securitystore>
<reference store="nodestore"/>
</securitystore>
<lockstore>
<reference store="nodestore"/>
</lockstore>
<revisiondescriptorsstore>
<reference store="nodestore"/>
</revisiondescriptorsstore>
<revisiondescriptorstore>
<reference store="nodestore"/>
</revisiondescriptorstore>
<contentstore classname="org.apache.slide.store.txfile.TxFileContentStore">
<parameter name="rootpath">store/content</parameter>
<parameter name="workpath">work/content</parameter>
<parameter name="defer-saving">true</parameter>
</contentstore>
<contentindexer classname="org.apache.slide.index.TextContentIndexer">
<parameter name="indexpath">store/index</parameter>
</contentindexer>
</store>
<scope match="/" store="tx"/>
</definition>
<configuration>
<!-- Actions mapping -->
<read-object>/actions/read</read-object>
<create-object>/actions/write</create-object>
<remove-object>/actions/write</remove-object>
<grant-permission>/actions/write-acl</grant-permission>
<revoke-permission>/actions/write-acl</revoke-permission>
<read-permissions>/actions/read-acl</read-permissions>
<read-own-permissions>/actions/read-current-user-privilege-set</read-own-permissions>
<lock-object>/actions/write</lock-object>
<kill-lock>/actions/unlock</kill-lock>
<read-locks>/actions/read</read-locks>
<read-revision-metadata>/actions/read</read-revision-metadata>
<create-revision-metadata>/actions/write-properties</create-revision-metadata>
<modify-revision-metadata>/actions/write-properties</modify-revision-metadata>
<remove-revision-metadata>/actions/write-properties</remove-revision-metadata>
<read-revision-content>/actions/read</read-revision-content>
<create-revision-content>/actions/write-content</create-revision-content>
<modify-revision-content>/actions/write-content</modify-revision-content>
<remove-revision-content>/actions/write-content</remove-revision-content>
<bind-member>/actions/bind</bind-member>
<unbind-member>/actions/unbind</unbind-member>
<!-- Paths configuration -->
<userspath>/users</userspath>
<rolespath>/roles</rolespath>
<actionspath>/actions</actionspath>
<filespath>/files</filespath>
<parameter name="dav">true</parameter>
<parameter name="standalone">true</parameter>
<parameter name="acl_inheritance_type">path</parameter>
<!-- Nested roles: 0 means no nesting (default), 1 means one sublevel, etc. -->
<parameter name="nested_roles_maxdepth">0</parameter>
</configuration>
<data>
<objectnode classname="org.apache.slide.structure.SubjectNode" uri="/">
<!-- Subject can be:
any user "all"
authenticated user "authenticated"
unauthenticated user "unauthenticated"
self "self"
owner of resource "owner"
a user "/users/john"
a role "/roles/admin"
-->
<permission action="all" subject="/roles/root" inheritable="true"/>
<permission action="/actions/read-acl" subject="all" inheritable="true" negative="true"/>
<permission action="/actions/write-acl" subject="all" inheritable="true" negative="true"/>
<permission action="/actions/unlock" subject="all" inheritable="true" negative="true"/>
<permission action="/actions/read" subject="all" inheritable="true"/>
<!-- /users -->
<objectnode classname="org.apache.slide.structure.SubjectNode" uri="/users">
<permission action="all" subject="self" inheritable="true"/>
<permission action="all" subject="unauthenticated" inheritable="true" negative="true"/>
<!-- /users/root represents the administrator -->
<objectnode classname="org.apache.slide.structure.SubjectNode" uri="/users/root">
<revision>
<property namespace="http://jakarta.apache.org/slide/" name="password">root</property>
</revision>
</objectnode>
<!-- /users/john and /users/john2 represent authenticated users -->
<objectnode classname="org.apache.slide.structure.SubjectNode" uri="/users/john">
<revision>
<property namespace="http://jakarta.apache.org/slide/" name="password">john</property>
</revision>
</objectnode>
<objectnode classname="org.apache.slide.structure.SubjectNode" uri="/users/john2">
<revision>
<property namespace="http://jakarta.apache.org/slide/" name="password">john2</property>
</revision>
</objectnode>
<!-- /users/guest represents an authenticated or unauthenticated guest user -->
<objectnode classname="org.apache.slide.structure.SubjectNode" uri="/users/guest">
<revision>
<property namespace="http://jakarta.apache.org/slide/" name="password">guest</property>
</revision>
</objectnode>
</objectnode>
<!-- /roles -->
<objectnode classname="org.apache.slide.structure.SubjectNode" uri="/roles">
<permission action="all" subject="self" inheritable="true"/>
<permission action="all" subject="unauthenticated" inheritable="true" negative="true"/>
<objectnode classname="org.apache.slide.structure.SubjectNode" uri="/roles/root">
<revision>
<property name="group-member-set"><![CDATA[<D:href xmlns:D='DAV:'>/users/root</D:href>]]></property>
</revision>
</objectnode>
<objectnode classname="org.apache.slide.structure.SubjectNode" uri="/roles/user">
<revision>
<property name="group-member-set"><![CDATA[<D:href xmlns:D='DAV:'>/users/john</D:href><D:href xmlns:D='DAV:'>/users/john2</D:href><D:href xmlns:D='DAV:'>/users/root</D:href>]]></property>
</revision>
</objectnode>
<objectnode classname="org.apache.slide.structure.SubjectNode" uri="/roles/guest">
<revision>
<property name="group-member-set"><![CDATA[<D:href xmlns:D='DAV:'>/users/guest</D:href>]]></property>
</revision>
</objectnode>
</objectnode>
<!-- action -->
<objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions">
<objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions/read">
<revision>
<property name="privilege-member-set"><![CDATA[<D:href xmlns:D='DAV:'>/actions/read-acl</D:href> <D:href xmlns:D='DAV:'>/actions/read-current-user-privilege-set</D:href>]]></property>
</revision>
</objectnode>
<objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions/read-acl">
<revision>
<property name="privilege-member-set"/>
</revision>
</objectnode>
<objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions/read-current-user-privilege-set">
<revision>
<property name="privilege-member-set"/>
</revision>
</objectnode>
<objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions/write">
<revision>
<property name="privilege-member-set"><![CDATA[<D:href xmlns:D='DAV:'>/actions/write-acl</D:href> <D:href xmlns:D='DAV:'>/actions/write-properties</D:href> <D:href xmlns:D='DAV:'>/actions/write-content</D:href>]]></property>
</revision>
</objectnode>
<objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions/write-acl">
<revision>
<property name="privilege-member-set"/>
</revision>
</objectnode>
<objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions/write-properties">
<revision>
<property name="privilege-member-set"/>
</revision>
</objectnode>
<objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions/write-content">
<revision>
<property name="privilege-member-set"><![CDATA[<D:href xmlns:D='DAV:'>/actions/bind</D:href> <D:href xmlns:D='DAV:'>/actions/unbind</D:href>]]></property>
</revision>
</objectnode>
<objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions/bind">
<revision>
<property name="privilege-member-set"/>
</revision>
</objectnode>
<objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions/unbind">
<revision>
<property name="privilege-member-set"/>
</revision>
</objectnode>
<objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions/unlock">
<revision>
<property name="privilege-member-set"/>
</revision>
</objectnode>
</objectnode>
<objectnode classname="org.apache.slide.structure.SubjectNode" uri="/files">
<permission action="all" subject="unauthenticated" inheritable="true"/>
<permission action="/actions/write" subject="/roles/user" inheritable="true"/>
<permission action="/actions/read-acl" subject="owner" inheritable="true"/>
</objectnode>
<!-- DeltaV: default history and workspace paths -->
<objectnode classname="org.apache.slide.structure.SubjectNode" uri="/history">
<permission action="all" subject="unauthenticated" inheritable="true"/>
<permission action="/actions/write" subject="/roles/user" inheritable="true"/>
<permission action="/actions/read-acl" subject="owner" inheritable="true"/>
</objectnode>
<objectnode classname="org.apache.slide.structure.SubjectNode" uri="/workspace">
<permission action="all" subject="unauthenticated" inheritable="true"/>
<permission action="/actions/write" subject="/roles/user" inheritable="true"/>
<permission action="/actions/read-acl" subject="owner" inheritable="true"/>
</objectnode>
<objectnode classname="org.apache.slide.structure.SubjectNode" uri="/workingresource">
<permission action="all" subject="unauthenticated" inheritable="true"/>
<permission action="/actions/write" subject="/roles/user" inheritable="true"/>
<permission action="/actions/read-acl" subject="owner" inheritable="true"/>
</objectnode>
</objectnode>
</data>
</namespace>
<!--
DeltaV global parameters
========================
* historypath (mandatory=no, default="/history"):
Specifies a Slide path which determines the location where this DeltaV
server stores history data.
* workspacepath (mandatory=no, default="/workspace"):
Specifies a Slide path which determines the location where this DeltaV
server allows workspaces to reside.
* workingresourcepath (mandatory=no, default="/workingresource"):
Specifies a Slide path which determines the location where this DeltaV
server stores working resources.
* auto-version (mandatory=no, default="checkout-checkin"):
Controls the DeltaV auto-version behaviour.
* auto-version-control (mandatory=no, default="false"):
Indicates if a resource just created by a PUT should be set under
version-control.
* versioncontrol-exclude (mandatory=no, default=""):
Specifies a Slide path that determines which resources are excluded from version-control.
The default value "" means that no path is excluded.
* checkout-fork (mandatory=no, default="forbidden"):
Controls the DeltaV check-out behaviour when a version is already
checked-out or has a successor.
* checkin-fork (mandatory=no, default="forbidden"):
Controls the DeltaV check-in behaviour when a version already has a
successor.
* standardLivePropertiesClass (mandatory=no,
default="org.apache.slide.webdav.util.resourcekind.AbstractResourceKind"):
Determines the "agent" knowing about what the standard live properties are.
It should be a loadable class containing the following static methods:
- boolean isLiveProperty(String propName)
- boolean isProtectedProperty(String propName)
- boolean isComputedProperty(String propName)
- Set getAllLiveProperties()
- Set getAllProtectedProperties()
- Set getAllComputedProperties()
* uriRedirectorClass (mandatory=no,
default="org.apache.slide.webdav.util.DeltavUriRedirector"):
Determines the URI redirector class. The DeltaV URI redirector is in
charge of the following redirections:
- version URI to history URI, e.g. /history/2/1.4 to /history/2
- latest revision number for history resource to 0.0
- latest revision number for version resource to last URI token,
e.g. /history/2/1.4 to 1.4
It should be a loadable class containing the following static methods:
- String redirectUri(String uri)
- NodeRevisionNumber redirectLatestRevisionNumber(String uri)
-->
<parameter name="historypath">/history</parameter>
<parameter name="workspacepath">/workspace</parameter>
<parameter name="workingresourcepath">/workingresource</parameter>
<parameter name="auto-version">checkout-checkin</parameter>
<parameter name="auto-version-control">false</parameter>
<parameter name="versioncontrol-exclude"/>
<parameter name="checkout-fork">forbidden</parameter>
<parameter name="checkin-fork">forbidden</parameter>
<!-- Extractor configuration -->
<extractors>
<extractor classname="org.apache.slide.extractor.SimpleXmlExtractor" uri="/files/articles/test.xml">
<configuration>
<instruction property="title" xpath="/article/title/text()" />
<instruction property="summary" xpath="/article/summary/text()" />
</configuration>
</extractor>
<extractor classname="org.apache.slide.extractor.OfficeExtractor" uri="/files/docs/">
<configuration>
<instruction property="author" id="SummaryInformation-0-4" />
<instruction property="application" id="SummaryInformation-0-18" />
</configuration>
</extractor>
<extractor classname="org.apache.slide.extractor.MSWordExtractor" uri="/files/"
content-type="application/ms-word">
</extractor>
<extractor classname="org.apache.slide.extractor.MSExcelExtractor" uri="/files/"
content-type="application/ms-excel">
</extractor>
<extractor classname="org.apache.slide.extractor.MSPowerPointExtractor" uri="/files/"
content-type="application/vnd.ms-powerpoint">
</extractor>
<extractor classname="org.apache.slide.extractor.PDFExtractor" uri="/files/"
content-type="application/pdf">
</extractor>
</extractors>
<!-- Event configuration -->
<events>
<event classname="org.apache.slide.webdav.event.WebdavEvent" enable="true" />
<event classname="org.apache.slide.event.ContentEvent" enable="true" />
<event classname="org.apache.slide.event.ContentEvent" method="retrieve" enable="false" />
<event classname="org.apache.slide.event.EventCollection" enable="true" />
<event classname="org.apache.slide.event.TransactionEvent" enable="true" />
<event classname="org.apache.slide.event.MacroEvent" enable="true"/>
<listener classname="org.apache.slide.util.event.EventLogger" />
<listener classname="org.apache.slide.event.VetoableEventCollector" />
<listener classname="org.apache.slide.event.TransientEventCollector" />
<listener classname="org.apache.slide.webdav.event.NotificationTrigger">
<configuration>
<notification protocol="tcp" include-events="false" />
</configuration>
</listener>
<listener classname="org.apache.slide.extractor.PropertyExtractorTrigger" />
<listener classname="org.apache.slide.search.IndexTrigger">
<configuration>
<indexer classname="org.apache.slide.search.LoggingIndexer" synchronous="false" uri="/files/articles" />
</configuration>
</listener>
<listener classname="org.apache.slide.macro.MacroPropertyUpdater">
<!-- Listener that updates some properties when resources are
copied or moved. This requires MacroEvents to be enabled (at
least for the copy and move methods) -->
<configuration>
<update-displayname>true</update-displayname>
<update-owner-on-move>false</update-owner-on-move>
<update-owner-on-copy>true</update-owner-on-copy>
</configuration>
</listener>
</events>
</slide>
---------------------------------------------------------------------
To unsubscribe, e-mail: slide-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: slide-dev-help@jakarta.apache.org