You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by ju...@apache.org on 2009/09/16 15:40:25 UTC
svn commit: r815774 - in /jackrabbit/trunk/jackrabbit-core: ./
src/main/java/org/apache/jackrabbit/core/
src/main/java/org/apache/jackrabbit/core/query/
src/main/java/org/apache/jackrabbit/core/query/lucene/
src/main/resources/org/apache/jackrabbit/cor...
Author: jukka
Date: Wed Sep 16 13:40:24 2009
New Revision: 815774
URL: http://svn.apache.org/viewvc?rev=815774&view=rev
Log:
JCR-1878: Use Apache Tika for text extraction
Replace jackrabbit-text-extractors with a direct Apache Tika dependency.
A simple backwards compatibility layer is included for existing textFilterClasses="..." configurations. Only the org.apache.jackrabbit.extractor classes are recognized and mapped to Tika alternatives.
Added two simple text extraction integration tests.
Added:
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitParser.java (with props)
jackrabbit/trunk/jackrabbit-core/src/main/resources/org/apache/jackrabbit/core/query/lucene/tika-config.xml
- copied unchanged from r815225, jackrabbit/trunk/jackrabbit-text-extractors/src/main/resources/org/apache/jackrabbit/extractor/tika-config.xml
jackrabbit/trunk/jackrabbit-core/src/test/resources/org/apache/jackrabbit/core/query/test.rtf
- copied unchanged from r815737, jackrabbit/trunk/jackrabbit-text-extractors/src/test/resources/org/apache/jackrabbit/extractor/test.rtf
jackrabbit/trunk/jackrabbit-core/src/test/resources/org/apache/jackrabbit/core/query/test.txt (with props)
Removed:
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitTextExtractor.java
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/PooledTextExtractor.java
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorJob.java
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/TextExtractorReader.java
Modified:
jackrabbit/trunk/jackrabbit-core/pom.xml
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/RepositoryImpl.java
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/SearchManager.java
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/QueryHandlerContext.java
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorField.java
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java
jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/FulltextQueryTest.java
jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/IndexingQueueTest.java
Modified: jackrabbit/trunk/jackrabbit-core/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/pom.xml?rev=815774&r1=815773&r2=815774&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/pom.xml (original)
+++ jackrabbit/trunk/jackrabbit-core/pom.xml Wed Sep 16 13:40:24 2009
@@ -193,9 +193,9 @@
<scope>test</scope>
</dependency>
<dependency>
- <groupId>org.apache.jackrabbit</groupId>
- <artifactId>jackrabbit-text-extractors</artifactId>
- <version>2.0-SNAPSHOT</version>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
+ <version>0.4</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/RepositoryImpl.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/RepositoryImpl.java?rev=815774&r1=815773&r2=815774&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/RepositoryImpl.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/RepositoryImpl.java Wed Sep 16 13:40:24 2009
@@ -666,9 +666,11 @@
throws RepositoryException {
if (systemSearchMgr == null) {
if (repConfig.getSearchConfig() != null) {
- systemSearchMgr = new SearchManager(repConfig.getSearchConfig(),
- nsReg, ntReg, getWorkspaceInfo(wspName).itemStateMgr,
- vMgr.getPersistenceManager(), SYSTEM_ROOT_NODE_ID, null, null);
+ systemSearchMgr = new SearchManager(
+ repConfig.getSearchConfig(), nsReg, ntReg,
+ getWorkspaceInfo(wspName).itemStateMgr,
+ vMgr.getPersistenceManager(), SYSTEM_ROOT_NODE_ID,
+ null, null, executor);
SystemSession defSysSession = getSystemSession(wspName);
ObservationManager obsMgr = defSysSession.getWorkspace().getObservationManager();
@@ -1838,13 +1840,9 @@
// search manager is lazily instantiated in order to avoid
// 'chicken & egg' bootstrap problems
searchMgr = new SearchManager(config.getSearchConfig(),
- nsReg,
- ntReg,
- itemStateMgr,
- persistMgr,
- rootNodeId,
+ nsReg, ntReg, itemStateMgr, persistMgr, rootNodeId,
getSystemSearchManager(getName()),
- SYSTEM_ROOT_NODE_ID);
+ SYSTEM_ROOT_NODE_ID, executor);
}
return searchMgr;
}
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/SearchManager.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/SearchManager.java?rev=815774&r1=815773&r2=815774&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/SearchManager.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/SearchManager.java Wed Sep 16 13:40:24 2009
@@ -24,6 +24,7 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
+import java.util.concurrent.Executor;
import javax.jcr.NamespaceException;
import javax.jcr.Node;
@@ -141,6 +142,11 @@
private Path excludePath;
/**
+ * Background task executor.
+ */
+ private final Executor executor;
+
+ /**
* Creates a new <code>SearchManager</code>.
*
* @param config the search configuration.
@@ -163,7 +169,8 @@
PersistenceManager pm,
NodeId rootNodeId,
SearchManager parentMgr,
- NodeId excludedNodeId) throws RepositoryException {
+ NodeId excludedNodeId,
+ Executor executor) throws RepositoryException {
this.fs = config.getFileSystem();
this.config = config;
this.ntReg = ntReg;
@@ -173,6 +180,7 @@
this.rootNodeId = rootNodeId;
this.parentHandler = (parentMgr != null) ? parentMgr.handler : null;
this.excludedNodeId = excludedNodeId;
+ this.executor = executor;
// register namespaces
safeRegisterNamespace(NS_XS_PREFIX, NS_XS_URI);
@@ -505,9 +513,9 @@
// initialize query handler
try {
handler = (QueryHandler) config.newInstance();
- QueryHandlerContext context
- = new QueryHandlerContext(fs, itemMgr, pm, rootNodeId,
- ntReg, nsReg, parentHandler, excludedNodeId);
+ QueryHandlerContext context = new QueryHandlerContext(
+ fs, itemMgr, pm, rootNodeId, ntReg, nsReg,
+ parentHandler, excludedNodeId, executor);
handler.init(context);
} catch (Exception e) {
throw new RepositoryException(e.getMessage(), e);
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/QueryHandlerContext.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/QueryHandlerContext.java?rev=815774&r1=815773&r2=815774&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/QueryHandlerContext.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/QueryHandlerContext.java Wed Sep 16 13:40:24 2009
@@ -16,6 +16,8 @@
*/
package org.apache.jackrabbit.core.query;
+import java.util.concurrent.Executor;
+
import org.apache.jackrabbit.core.fs.FileSystem;
import org.apache.jackrabbit.core.nodetype.NodeTypeRegistry;
import org.apache.jackrabbit.core.state.ItemStateManager;
@@ -84,6 +86,11 @@
private final NodeId excludedNodeId;
/**
+ * Background task executor.
+ */
+ private final Executor executor;
+
+ /**
* Creates a new context instance.
*
* @param fs a {@link FileSystem} this <code>QueryHandler</code>
@@ -100,6 +107,7 @@
* @param excludedNodeId id of the node that should be excluded from
* indexing. Any descendant of that node is also
* excluded from indexing.
+ * @param executor background task executor
*/
public QueryHandlerContext(FileSystem fs,
SharedItemStateManager stateMgr,
@@ -108,7 +116,8 @@
NodeTypeRegistry ntRegistry,
NamespaceRegistryImpl nsRegistry,
QueryHandler parentHandler,
- NodeId excludedNodeId) {
+ NodeId excludedNodeId,
+ Executor executor) {
this.fs = fs;
this.stateMgr = stateMgr;
this.hmgr = new CachingHierarchyManager(rootId, stateMgr);
@@ -120,6 +129,7 @@
propRegistry = new PropertyTypeRegistry(ntRegistry);
this.parentHandler = parentHandler;
this.excludedNodeId = excludedNodeId;
+ this.executor = executor;
ntRegistry.addListener(propRegistry);
}
@@ -219,4 +229,14 @@
public void destroy() {
ntRegistry.removeListener(propRegistry);
}
+
+ /**
+ * Returns the background task executor.
+ *
+ * @return background task executor
+ */
+ public Executor getExecutor() {
+ return executor;
+ }
+
}
Added: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitParser.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitParser.java?rev=815774&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitParser.java (added)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitParser.java Wed Sep 16 13:40:24 2009
@@ -0,0 +1,222 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.lucene;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.StringTokenizer;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.html.HtmlParser;
+import org.apache.tika.parser.image.ImageParser;
+import org.apache.tika.parser.microsoft.OfficeParser;
+import org.apache.tika.parser.opendocument.OpenOfficeParser;
+import org.apache.tika.parser.pdf.PDFParser;
+import org.apache.tika.parser.rtf.RTFParser;
+import org.apache.tika.parser.txt.TXTParser;
+import org.apache.tika.parser.xml.XMLParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Jackrabbit wrapper for Tika parsers. Uses a Tika {@link AutoDetectParser}
+ * for all parsing requests, but sets it up with Jackrabbit-specific
+ * configuration and implements backwards compatibility support for old
+ * <code>textFilterClasses</code> configurations.
+ *
+ * @since Apache Jackrabbit 2.0
+ */
+class JackrabbitParser implements Parser {
+
+ /**
+ * Logger instance.
+ */
+ private static final Logger logger =
+ LoggerFactory.getLogger(JackrabbitParser.class);
+
+ /**
+ * Flag for blocking all text extraction. Used by the Jackrabbit test suite.
+ */
+ private static boolean blocked = false;
+
+ /**
+ * The configured Tika parser.
+ */
+ private final AutoDetectParser parser;
+
+ /**
+ * Creates a parser using the default Jackrabbit-specific configuration
+ * settings.
+ */
+ public JackrabbitParser() {
+ InputStream stream =
+ JackrabbitParser.class.getResourceAsStream("tika-config.xml");
+ try {
+ try {
+ parser = new AutoDetectParser(new TikaConfig(stream));
+ } finally {
+ stream.close();
+ }
+ } catch (Exception e) {
+ // Should never happen
+ throw new RuntimeException(
+ "Unable to load embedded Tika configuration", e);
+ }
+ }
+
+ /**
+ * Backwards compatibility method to support old Jackrabbit 1.x
+ * <code>textFilterClasses</code> configurations. Implements a best
+ * effort mapping from the old-style text extractor classes to
+ * corresponding Tika parsers.
+ *
+ * @param classes configured list of text extractor classes
+ */
+ public void setTextFilterClasses(String classes) {
+ Map<String, Parser> parsers = new HashMap<String, Parser>();
+
+ StringTokenizer tokenizer = new StringTokenizer(classes, ", \t\n\r\f");
+ while (tokenizer.hasMoreTokens()) {
+ String name = tokenizer.nextToken();
+ if (name.equals(
+ "org.apache.jackrabbit.extractor.HTMLTextExtractor")) {
+ parsers.put("text/html", new HtmlParser());
+ } else if (name.equals(
+ "org.apache.jackrabbit.extractor.MsExcelTextExtractor")) {
+ Parser parser = new OfficeParser();
+ parsers.put("application/vnd.ms-excel", parser);
+ parsers.put("application/msexcel", parser);
+ parsers.put("application/excel", parser);
+ } else if (name.equals(
+ "org.apache.jackrabbit.extractor.MsOutlookTextExtractor")) {
+ parsers.put("application/vnd.ms-outlook", new OfficeParser());
+ } else if (name.equals(
+ "org.apache.jackrabbit.extractor.MsPowerPointExtractor")) {
+ Parser parser = new OfficeParser();
+ parsers.put("application/vnd.ms-powerpoint", parser);
+ parsers.put("application/mspowerpoint", parser);
+ parsers.put("application/powerpoint", parser);
+ } else if (name.equals(
+ "org.apache.jackrabbit.extractor.MsWordTextExtractor")) {
+ Parser parser = new OfficeParser();
+ parsers.put("application/vnd.ms-word", parser);
+ parsers.put("application/msword", parser);
+ } else if (name.equals(
+ "org.apache.jackrabbit.extractor.MsTextExtractor")) {
+ Parser parser = new OfficeParser();
+ parsers.put("application/vnd.ms-word", parser);
+ parsers.put("application/msword", parser);
+ parsers.put("application/vnd.ms-powerpoint", parser);
+ parsers.put("application/mspowerpoint", parser);
+ parsers.put("application/vnd.ms-excel", parser);
+ parsers.put("application/vnd.openxmlformats-officedocument.wordprocessingml.document", parser);
+ parsers.put("application/vnd.openxmlformats-officedocument.presentationml.presentation", parser);
+ parsers.put("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", parser);
+ } else if (name.equals(
+ "org.apache.jackrabbit.extractor.OpenOfficeTextExtractor")) {
+ Parser parser = new OpenOfficeParser();
+ parsers.put("application/vnd.oasis.opendocument.database", parser);
+ parsers.put("application/vnd.oasis.opendocument.formula", parser);
+ parsers.put("application/vnd.oasis.opendocument.graphics", parser);
+ parsers.put("application/vnd.oasis.opendocument.presentation", parser);
+ parsers.put("application/vnd.oasis.opendocument.spreadsheet", parser);
+ parsers.put("application/vnd.oasis.opendocument.text", parser);
+ parsers.put("application/vnd.sun.xml.calc", parser);
+ parsers.put("application/vnd.sun.xml.draw", parser);
+ parsers.put("application/vnd.sun.xml.impress", parser);
+ parsers.put("application/vnd.sun.xml.writer", parser);
+ } else if (name.equals(
+ "org.apache.jackrabbit.extractor.PdfTextExtractor")) {
+ parsers.put("application/pdf", new PDFParser());
+ } else if (name.equals(
+ "org.apache.jackrabbit.extractor.PlainTextExtractor")) {
+ parsers.put("text/plain", new TXTParser());
+ } else if (name.equals(
+ "org.apache.jackrabbit.extractor.PngTextExtractor")) {
+ Parser parser = new ImageParser();
+ parsers.put("image/png", parser);
+ parsers.put("image/apng", parser);
+ parsers.put("image/mng", parser);
+ } else if (name.equals(
+ "org.apache.jackrabbit.extractor.RTFTextExtractor")) {
+ Parser parser = new RTFParser();
+ parsers.put("application/rtf", parser);
+ parsers.put("text/rtf", parser);
+ } else if (name.equals(
+ "org.apache.jackrabbit.extractor.XMLTextExtractor")) {
+ Parser parser = new XMLParser();
+ parsers.put("application/xml", parser);
+ parsers.put("text/xml", parser);
+ } else {
+ logger.warn("Ignoring unknown text extractor class: {}", name);
+ }
+ }
+
+ parser.setParsers(parsers);
+ }
+
+ /**
+ * Delegates the call to the configured {@link AutoDetectParser}.
+ */
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata)
+ throws IOException, SAXException, TikaException {
+ waitIfBlocked();
+ parser.parse(stream, handler, metadata);
+ }
+
+ /**
+ * Waits until text extraction is no longer blocked. The block is only
+ * ever activated in the Jackrabbit test suite when testing delayed
+ * text extraction.
+ *
+ * @throws TikaException if the block was interrupted
+ */
+ private synchronized static void waitIfBlocked() throws TikaException {
+ try {
+ while (blocked) {
+ JackrabbitParser.class.wait();
+ }
+ } catch (InterruptedException e) {
+ throw new TikaException("Text extraction block interrupted", e);
+ }
+ }
+
+ /**
+ * Blocks all text extraction tasks.
+ */
+ static synchronized void block() {
+ blocked = true;
+ }
+
+ /**
+ * Unblocks all text extraction tasks.
+ */
+ static synchronized void unblock() {
+ blocked = false;
+ JackrabbitParser.class.notifyAll();
+ }
+
+}
Propchange: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/JackrabbitParser.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorField.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorField.java?rev=815774&r1=815773&r2=815774&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorField.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorField.java Wed Sep 16 13:40:24 2009
@@ -16,15 +16,22 @@
*/
package org.apache.jackrabbit.core.query.lucene;
+import java.io.InputStream;
+import java.io.Reader;
+import java.util.concurrent.Executor;
+
+import org.apache.jackrabbit.core.value.InternalValue;
+import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.AbstractField;
import org.apache.lucene.document.Field;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.commons.io.IOUtils;
-import org.slf4j.LoggerFactory;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.Field.TermVector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
import org.slf4j.Logger;
-
-import java.io.Reader;
-import java.io.IOException;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
/**
* <code>LazyTextExtractorField</code> implements a Lucene field with a String
@@ -37,24 +44,16 @@
public class LazyTextExtractorField extends AbstractField {
/**
- * The serial version UID.
- */
- private static final long serialVersionUID = -2707986404659820071L;
-
- /**
* The logger instance for this class.
*/
- private static final Logger log = LoggerFactory.getLogger(LazyTextExtractorField.class);
+ private static final Logger log =
+ LoggerFactory.getLogger(LazyTextExtractorField.class);
/**
- * The reader from where to read the text extract.
+ * The extracted text content of the given binary value.
+ * Set to non-null when the text extraction task finishes.
*/
- private final Reader reader;
-
- /**
- * The extract as obtained lazily from {@link #reader}.
- */
- private String extract;
+ private volatile String extract = null;
/**
* Creates a new <code>LazyTextExtractorField</code> with the given
@@ -62,84 +61,114 @@
*
* @param name the name of the field.
* @param reader the reader where to obtain the string from.
- * @param store when set <code>true</code> the string value is stored in the
- * index.
- * @param withOffsets when set <code>true</code> a term vector with offsets
- * is written into the index.
- */
- public LazyTextExtractorField(String name,
- Reader reader,
- boolean store,
- boolean withOffsets) {
- super(name,
- store ? Field.Store.YES : Field.Store.NO,
+ * @param highlighting set to <code>true</code> to
+ * enable result highlighting support
+ */
+ public LazyTextExtractorField(
+ Parser parser, InternalValue value, Metadata metadata,
+ Executor executor, boolean highlighting) {
+ super(FieldNames.FULLTEXT,
+ highlighting ? Store.YES : Store.NO,
Field.Index.ANALYZED,
- withOffsets ? Field.TermVector.WITH_OFFSETS : Field.TermVector.NO);
- this.reader = reader;
+ highlighting ? TermVector.WITH_OFFSETS : TermVector.NO);
+ executor.execute(new ParsingTask(parser, value, metadata));
}
/**
- * @return the string value of this field.
+ * Returns the extracted text. This method blocks until the text
+ * extraction task has been completed.
+ *
+ * @return the string value of this field
*/
- public String stringValue() {
- if (extract == null) {
- StringBuffer textExtract = new StringBuffer();
- char[] buffer = new char[1024];
- int len;
- try {
- while ((len = reader.read(buffer)) > -1) {
- textExtract.append(buffer, 0, len);
- }
- } catch (IOException e) {
- log.warn("Exception reading value for field: "
- + e.getMessage());
- log.debug("Dump:", e);
- } finally {
- IOUtils.closeQuietly(reader);
+ public synchronized String stringValue() {
+ try {
+ while (!isExtractorFinished()) {
+ wait();
}
- extract = textExtract.toString();
+ return extract;
+ } catch (InterruptedException e) {
+ log.error("Text extraction thread was interrupted", e);
+ return "";
}
- return extract;
}
/**
- * @return always <code>null</code>.
+ * @return always <code>null</code>
*/
public Reader readerValue() {
return null;
}
/**
- * @return always <code>null</code>.
+ * @return always <code>null</code>
*/
public byte[] binaryValue() {
return null;
}
/**
- * @return always <code>null</code>.
+ * @return always <code>null</code>
*/
public TokenStream tokenStreamValue() {
return null;
}
/**
- * @return <code>true</code> if the underlying reader is ready to provide
- * extracted text.
+ * Checks whether the text extraction task has finished.
+ *
+ * @return <code>true</code> if the extracted text is available
*/
public boolean isExtractorFinished() {
- if (reader instanceof TextExtractorReader) {
- return ((TextExtractorReader) reader).isExtractorFinished();
- }
- return true;
+ return extract != null;
+ }
+
+ private synchronized void setExtractedText(String value) {
+ extract = value;
+ notify();
}
/**
- * Disposes this field and closes the underlying reader.
- *
- * @throws IOException if an error occurs while closing the reader.
+ * Releases all resources associated with this field.
+ */
+ public void dispose() {
+ // TODO: Cause the ContentHandler below to throw an exception
+ }
+
+ /**
+ * The background task for extracting text from a binary value.
*/
- public void dispose() throws IOException {
- reader.close();
+ private class ParsingTask implements Runnable {
+
+ private final Parser parser;
+
+ private final InternalValue value;
+
+ private final Metadata metadata;
+
+ public ParsingTask(
+ Parser parser, InternalValue value, Metadata metadata) {
+ this.parser = parser;
+ this.value = value;
+ this.metadata = metadata;
+ }
+
+ public void run() {
+ ContentHandler handler = new BodyContentHandler();
+ try {
+ InputStream stream = value.getStream();
+ try {
+ parser.parse(stream, handler, metadata);
+ } finally {
+ stream.close();
+ }
+ } catch (Throwable t) {
+ log.warn("Failed to extract text from a binary property", t);
+ } finally {
+ value.discard();
+ }
+ setExtractedText(handler.toString());
+ }
+
}
+
}
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java?rev=815774&r1=815773&r2=815774&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java Wed Sep 16 13:40:24 2009
@@ -16,39 +16,39 @@
*/
package org.apache.jackrabbit.core.query.lucene;
-import org.apache.jackrabbit.core.id.PropertyId;
+import java.math.BigDecimal;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Calendar;
+import java.util.Date;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.Executor;
+
+import javax.jcr.NamespaceException;
+import javax.jcr.PropertyType;
+import javax.jcr.RepositoryException;
+
import org.apache.jackrabbit.core.id.NodeId;
+import org.apache.jackrabbit.core.id.PropertyId;
+import org.apache.jackrabbit.core.state.ChildNodeEntry;
import org.apache.jackrabbit.core.state.ItemStateException;
import org.apache.jackrabbit.core.state.ItemStateManager;
import org.apache.jackrabbit.core.state.NoSuchItemStateException;
import org.apache.jackrabbit.core.state.NodeState;
import org.apache.jackrabbit.core.state.PropertyState;
-import org.apache.jackrabbit.core.state.ChildNodeEntry;
import org.apache.jackrabbit.core.value.InternalValue;
-import org.apache.jackrabbit.extractor.TextExtractor;
-import org.apache.jackrabbit.spi.commons.conversion.NamePathResolver;
-import org.apache.jackrabbit.spi.Path;
import org.apache.jackrabbit.spi.Name;
+import org.apache.jackrabbit.spi.Path;
+import org.apache.jackrabbit.spi.commons.conversion.NamePathResolver;
import org.apache.jackrabbit.spi.commons.name.NameConstants;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
-
-import javax.jcr.NamespaceException;
-import javax.jcr.PropertyType;
-import javax.jcr.RepositoryException;
-
-import java.io.InputStream;
-import java.io.Reader;
-import java.util.Calendar;
-import java.util.Set;
-import java.util.List;
-import java.util.ArrayList;
-import java.util.Date;
-import java.net.URI;
-import java.math.BigDecimal;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.Parser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* Creates a lucene <code>Document</code> object from a {@link javax.jcr.Node}.
@@ -87,9 +87,15 @@
protected final NamePathResolver resolver;
/**
- * Content extractor.
+ * Background task executor used for full text extraction.
*/
- protected final TextExtractor extractor;
+ private final Executor executor;
+
+ /**
+ * Parser used for extracting text content from binary properties
+ * for full text indexing.
+ */
+ private final Parser parser;
/**
* The indexing configuration or <code>null</code> if none is available.
@@ -119,17 +125,18 @@
* @param node the node state to index.
* @param stateProvider the persistent item state manager to retrieve properties.
* @param mappings internal namespace mappings.
- * @param extractor content extractor
+ * @param executor background task executor for text extraction
+ * @param parser parser for binary properties
*/
- public NodeIndexer(NodeState node,
- ItemStateManager stateProvider,
- NamespaceMappings mappings,
- TextExtractor extractor) {
+ public NodeIndexer(
+ NodeState node, ItemStateManager stateProvider,
+ NamespaceMappings mappings, Executor executor, Parser parser) {
this.node = node;
this.stateProvider = stateProvider;
this.mappings = mappings;
this.resolver = NamePathResolverImpl.create(mappings);
- this.extractor = extractor;
+ this.executor = executor;
+ this.parser = parser;
}
/**
@@ -413,20 +420,19 @@
return;
}
- InternalValue typeValue = getValue(NameConstants.JCR_MIMETYPE);
- if (typeValue != null) {
- String type = typeValue.getString();
+ InternalValue type = getValue(NameConstants.JCR_MIMETYPE);
+ if (type != null) {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_TYPE, type.getString());
// jcr:encoding is not mandatory
- String encoding = null;
- InternalValue encodingValue = getValue(NameConstants.JCR_ENCODING);
- if (encodingValue != null) {
- encoding = encodingValue.getString();
+ InternalValue encoding = getValue(NameConstants.JCR_ENCODING);
+ if (encoding != null) {
+ metadata.set(
+ Metadata.CONTENT_ENCODING, encoding.getString());
}
- InputStream stream = internalValue.getStream();
- Reader reader = extractor.extractText(stream, type, encoding);
- doc.add(createFulltextField(reader));
+ doc.add(createFulltextField(internalValue, metadata));
}
} catch (Throwable t) {
// TODO: How to recover from a transient indexing failure?
@@ -805,15 +811,14 @@
/**
* Creates a fulltext field for the reader <code>value</code>.
*
- * @param value the reader value.
+ * @param value the binary value
+ * @param metadata document metadata
* @return a lucene field.
*/
- protected Fieldable createFulltextField(Reader value) {
- if (supportHighlighting) {
- return new LazyTextExtractorField(FieldNames.FULLTEXT, value, true, true);
- } else {
- return new LazyTextExtractorField(FieldNames.FULLTEXT, value, false, false);
- }
+ protected Fieldable createFulltextField(
+ InternalValue value, Metadata metadata) {
+ return new LazyTextExtractorField(
+ parser, value, metadata, executor, supportHighlighting);
}
/**
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java?rev=815774&r1=815773&r2=815774&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java Wed Sep 16 13:40:24 2009
@@ -16,73 +16,73 @@
*/
package org.apache.jackrabbit.core.query.lucene;
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import javax.jcr.RepositoryException;
+import javax.jcr.query.InvalidQueryException;
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.apache.jackrabbit.core.HierarchyManager;
import org.apache.jackrabbit.core.ItemManager;
import org.apache.jackrabbit.core.SessionImpl;
-import org.apache.jackrabbit.core.id.NodeId;
-import org.apache.jackrabbit.core.HierarchyManager;
import org.apache.jackrabbit.core.fs.FileSystem;
-import org.apache.jackrabbit.core.fs.FileSystemResource;
import org.apache.jackrabbit.core.fs.FileSystemException;
+import org.apache.jackrabbit.core.fs.FileSystemResource;
import org.apache.jackrabbit.core.fs.local.LocalFileSystem;
+import org.apache.jackrabbit.core.id.NodeId;
import org.apache.jackrabbit.core.query.AbstractQueryHandler;
import org.apache.jackrabbit.core.query.ExecutableQuery;
import org.apache.jackrabbit.core.query.QueryHandler;
import org.apache.jackrabbit.core.query.QueryHandlerContext;
import org.apache.jackrabbit.core.query.lucene.directory.DirectoryManager;
import org.apache.jackrabbit.core.query.lucene.directory.FSDirectoryManager;
-import org.apache.jackrabbit.core.state.NodeState;
+import org.apache.jackrabbit.core.state.ItemStateException;
import org.apache.jackrabbit.core.state.ItemStateManager;
+import org.apache.jackrabbit.core.state.NodeState;
import org.apache.jackrabbit.core.state.PropertyState;
-import org.apache.jackrabbit.core.state.ItemStateException;
-import org.apache.jackrabbit.extractor.DefaultTextExtractor;
-import org.apache.jackrabbit.extractor.TextExtractor;
import org.apache.jackrabbit.spi.Name;
import org.apache.jackrabbit.spi.Path;
import org.apache.jackrabbit.spi.PathFactory;
import org.apache.jackrabbit.spi.commons.name.NameConstants;
import org.apache.jackrabbit.spi.commons.name.PathFactoryImpl;
import org.apache.jackrabbit.spi.commons.query.DefaultQueryNodeFactory;
-import org.apache.jackrabbit.spi.commons.query.qom.QueryObjectModelTree;
import org.apache.jackrabbit.spi.commons.query.qom.OrderingImpl;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import org.apache.jackrabbit.spi.commons.query.qom.QueryObjectModelTree;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.search.HitCollector;
+import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
-import org.apache.lucene.search.Sort;
-import org.apache.lucene.search.SortField;
import org.apache.lucene.search.Similarity;
+import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortComparatorSource;
-import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.search.HitCollector;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.Fieldable;
-import org.xml.sax.SAXException;
+import org.apache.tika.parser.Parser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.w3c.dom.Element;
-
-import javax.jcr.RepositoryException;
-import javax.jcr.query.InvalidQueryException;
-import javax.xml.parsers.DocumentBuilderFactory;
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.ParserConfigurationException;
-import java.io.IOException;
-import java.io.File;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.List;
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Collection;
+import org.xml.sax.SAXException;
/**
* Implements a {@link org.apache.jackrabbit.core.query.QueryHandler} using
@@ -207,20 +207,12 @@
/**
* The analyzer we use for indexing.
*/
- private JackrabbitAnalyzer analyzer;
+ private final JackrabbitAnalyzer analyzer = new JackrabbitAnalyzer();
/**
- * List of text extractor and text filter class names. The configured
- * classes will be instantiated and used to extract text content from
- * binary properties.
+ * The parser for extracting text content from binary properties.
*/
- private String textFilterClasses =
- DefaultTextExtractor.class.getName();
-
- /**
- * Text extractor for extracting text content of binary properties.
- */
- private TextExtractor extractor;
+ private final JackrabbitParser parser = new JackrabbitParser();
/**
* The namespace mappings used internally.
@@ -476,13 +468,6 @@
private boolean closed = false;
/**
- * Default constructor.
- */
- public SearchIndex() {
- this.analyzer = new JackrabbitAnalyzer();
- }
-
- /**
* Initializes this <code>QueryHandler</code>. This implementation requires
* that a path parameter is set in the configuration. If this condition
* is not met, a <code>IOException</code> is thrown.
@@ -500,7 +485,6 @@
excludedIDs.add(context.getExcludedNodeId());
}
- extractor = createTextExtractor();
synProvider = createSynonymProvider();
directoryManager = createDirectoryManager();
redoLogFactory = createRedoLogFactory();
@@ -792,10 +776,6 @@
log.warn("Exception while closing FileSystem", e);
}
}
- // shutdown extractor
- if (extractor instanceof PooledTextExtractor) {
- ((PooledTextExtractor) extractor).shutdown();
- }
if (spellChecker != null) {
spellChecker.close();
}
@@ -910,12 +890,13 @@
}
/**
- * Returns the text extractor in use for indexing.
+ * Returns the parser used for extracting text content
+ * from binary properties for full text indexing.
*
- * @return the text extractor in use for indexing.
+ * @return the configured parser
*/
- public TextExtractor getTextExtractor() {
- return extractor;
+ public Parser getParser() {
+ return parser;
}
/**
@@ -1114,8 +1095,9 @@
NamespaceMappings nsMappings,
IndexFormatVersion indexFormatVersion)
throws RepositoryException {
- NodeIndexer indexer = new NodeIndexer(node,
- getContext().getItemStateManager(), nsMappings, extractor);
+ NodeIndexer indexer = new NodeIndexer(
+ node, getContext().getItemStateManager(), nsMappings,
+ getContext().getExecutor(), parser);
indexer.setSupportHighlighting(supportHighlighting);
indexer.setIndexingConfiguration(indexingConfig);
indexer.setIndexFormatVersion(indexFormatVersion);
@@ -1141,21 +1123,6 @@
}
/**
- * Factory method to create the <code>TextExtractor</code> instance.
- *
- * @return the <code>TextExtractor</code> instance this index should use.
- */
- protected TextExtractor createTextExtractor() {
- TextExtractor txtExtr = new JackrabbitTextExtractor(textFilterClasses);
- if (extractorPoolSize > 0) {
- // wrap with pool
- txtExtr = new PooledTextExtractor(txtExtr, extractorPoolSize,
- extractorBackLog, extractorTimeout);
- }
- return txtExtr;
- }
-
- /**
* @param namespaceMappings The namespace mappings
* @return the fulltext indexing configuration or <code>null</code> if there
* is no configuration.
@@ -1881,9 +1848,10 @@
* constructor.
*
* @param filterClasses comma separated list of class names
+ * @deprecated
*/
public void setTextFilterClasses(String filterClasses) {
- this.textFilterClasses = filterClasses;
+ parser.setTextFilterClasses(filterClasses);
}
/**
@@ -1891,9 +1859,10 @@
* currently in use. The names are comma separated.
*
* @return class names of the text filters in use.
+ * @deprecated
*/
public String getTextFilterClasses() {
- return textFilterClasses;
+ return "deprectated";
}
/**
Modified: jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/FulltextQueryTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/FulltextQueryTest.java?rev=815774&r1=815773&r2=815774&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/FulltextQueryTest.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/FulltextQueryTest.java Wed Sep 16 13:40:24 2009
@@ -16,11 +16,18 @@
*/
package org.apache.jackrabbit.core.query;
+import java.io.IOException;
+import java.io.InputStream;
+
import javax.jcr.Node;
import javax.jcr.RepositoryException;
+import javax.jcr.nodetype.NodeType;
+import javax.jcr.query.InvalidQueryException;
import javax.jcr.query.Query;
import javax.jcr.query.QueryResult;
+import org.apache.commons.io.IOUtils;
+
/**
* Performs tests with the <code>CONTAINS</code> function.
*/
@@ -276,6 +283,39 @@
testRootNode.addNode(nodeName1).setProperty("text", content);
testRootNode.save();
+ assertContainsQuery(statement, match);
+ }
+
+ public void testFileContains() throws Exception {
+ assertFileContains(
+ "test.txt", "text/plain", "AE502DBEA2C411DEBD340AD156D89593");
+ assertFileContains(
+ "test.rtf", "text/rtf", "quick brown fox");
+ }
+
+ private void assertFileContains(
+ String name, String type, String... statements) throws Exception {
+ while (testRootNode.hasNode(nodeName1)) {
+ testRootNode.getNode(nodeName1).remove();
+ }
+ Node resource = testRootNode.addNode(nodeName1, NodeType.NT_RESOURCE);
+ resource.setProperty("jcr:mimeType", type);
+ InputStream stream = FulltextQueryTest.class.getResourceAsStream(name);
+ try {
+ resource.setProperty("jcr:data", stream);
+ } finally {
+ stream.close();
+ }
+ testRootNode.save();
+ getSearchIndex().flush();
+
+ for (String statement : statements) {
+ assertContainsQuery(statement, true);
+ }
+ }
+
+ private void assertContainsQuery(String statement, boolean match)
+ throws InvalidQueryException, RepositoryException {
StringBuffer stmt = new StringBuffer();
stmt.append("/jcr:root").append(testRoot).append("/*");
stmt.append("[jcr:contains(., '").append(statement);
@@ -292,4 +332,5 @@
q = superuser.getWorkspace().getQueryManager().createQuery(stmt.toString(), Query.SQL);
checkResult(q.execute(), match ? 1 : 0);
}
+
}
Modified: jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/IndexingQueueTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/IndexingQueueTest.java?rev=815774&r1=815773&r2=815774&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/IndexingQueueTest.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/IndexingQueueTest.java Wed Sep 16 13:40:24 2009
@@ -16,24 +16,22 @@
*/
package org.apache.jackrabbit.core.query.lucene;
-import org.apache.jackrabbit.extractor.TextExtractor;
-import org.apache.jackrabbit.core.query.AbstractIndexingTest;
-import org.apache.jackrabbit.core.RepositoryImpl;
-import org.apache.jackrabbit.core.TestHelper;
-import org.apache.jackrabbit.core.fs.local.FileUtil;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Calendar;
import javax.jcr.Node;
import javax.jcr.NodeIterator;
import javax.jcr.RepositoryException;
import javax.jcr.query.Query;
-import java.io.Reader;
-import java.io.InputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.ByteArrayInputStream;
-import java.io.File;
-import java.io.FilenameFilter;
-import java.util.Calendar;
+
+import org.apache.jackrabbit.core.RepositoryImpl;
+import org.apache.jackrabbit.core.TestHelper;
+import org.apache.jackrabbit.core.fs.local.FileUtil;
+import org.apache.jackrabbit.core.query.AbstractIndexingTest;
/**
* <code>IndexingQueueTest</code> checks if the indexing queue properly indexes
@@ -44,15 +42,15 @@
private static final File TEMP_DIR = new File(System.getProperty("java.io.tmpdir"));
- private static final String CONTENT_TYPE = "application/indexing-queue-test";
+ private static final String CONTENT_TYPE = "text/plain";
private static final String ENCODING = "UTF-8";
public void testQueue() throws Exception {
- Extractor.sleepTime = 200;
SearchIndex index = getSearchIndex();
IndexingQueue queue = index.getIndex().getIndexingQueue();
+ JackrabbitParser.block();
assertEquals(0, queue.getNumPendingDocuments());
String text = "the quick brown fox jumps over the lazy dog.";
@@ -70,6 +68,7 @@
NodeIterator nodes = q.execute().getNodes();
assertFalse(nodes.hasNext());
+ JackrabbitParser.unblock();
index.flush();
assertEquals(0, queue.getNumPendingDocuments());
@@ -79,7 +78,7 @@
}
public void testInitialIndex() throws Exception {
- Extractor.sleepTime = 200;
+ JackrabbitParser.block();
File indexDir = new File(getSearchIndex().getPath());
// fill workspace
@@ -105,7 +104,7 @@
int initialNumExtractorFiles = getNumExtractorFiles();
- Extractor.sleepTime = 20;
+ JackrabbitParser.unblock();
Thread t = new Thread(new Runnable() {
public void run() {
try {
@@ -140,7 +139,7 @@
* Test case for JCR-2082
*/
public void testReaderUpToDate() throws Exception {
- Extractor.sleepTime = 10;
+ JackrabbitParser.block();
SearchIndex index = getSearchIndex();
File indexDir = new File(index.getPath());
@@ -159,6 +158,7 @@
fail("Unable to delete index directory");
}
+ JackrabbitParser.unblock();
// start workspace again by getting a session
session = getHelper().getSuperuserSession(WORKSPACE_NAME);
@@ -202,22 +202,4 @@
}).length;
}
- public static final class Extractor implements TextExtractor {
-
- protected static volatile int sleepTime = 200;
-
- public String[] getContentTypes() {
- return new String[]{CONTENT_TYPE};
- }
-
- public Reader extractText(InputStream stream, String type, String encoding)
- throws IOException {
- try {
- Thread.sleep(sleepTime);
- } catch (InterruptedException e) {
- throw new IOException();
- }
- return new InputStreamReader(stream, encoding);
- }
- }
}
Added: jackrabbit/trunk/jackrabbit-core/src/test/resources/org/apache/jackrabbit/core/query/test.txt
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/test/resources/org/apache/jackrabbit/core/query/test.txt?rev=815774&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/test/resources/org/apache/jackrabbit/core/query/test.txt (added)
+++ jackrabbit/trunk/jackrabbit-core/src/test/resources/org/apache/jackrabbit/core/query/test.txt Wed Sep 16 13:40:24 2009
@@ -0,0 +1,3 @@
+This is a test of Jackrabbit full text indexing. The following
+unique token should be included in the search index once this
+file has been indexed: AE502DBEA2C411DEBD340AD156D89593
Propchange: jackrabbit/trunk/jackrabbit-core/src/test/resources/org/apache/jackrabbit/core/query/test.txt
------------------------------------------------------------------------------
svn:eol-style = native