You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by ju...@apache.org on 2010/02/23 15:32:15 UTC
svn commit: r915356 - in
/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene:
LazyTextExtractorField.java NodeIndexer.java SearchIndex.java
Author: jukka
Date: Tue Feb 23 14:32:15 2010
New Revision: 915356
URL: http://svn.apache.org/viewvc?rev=915356&view=rev
Log:
JCR-2506: Stop text extraction when the maxFieldLength limit is reached
Add a maxExtractLength configuration option, and default it to ten times maxFieldLength. This prevents excessive memory use when indexing huge documents.
Modified:
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorField.java
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorField.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorField.java?rev=915356&r1=915355&r2=915356&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorField.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorField.java Tue Feb 23 14:32:15 2010
@@ -27,11 +27,12 @@
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
-import org.apache.tika.sax.BodyContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
/**
* <code>LazyTextExtractorField</code> implements a Lucene field with a String
@@ -50,6 +51,13 @@
LoggerFactory.getLogger(LazyTextExtractorField.class);
/**
+ * The exception used to forcibly terminate the extraction process
+ * when the maximum field length is reached.
+ */
+ private static final SAXException STOP =
+ new SAXException("max field length reached");
+
+ /**
* The extracted text content of the given binary value.
* Set to non-null when the text extraction task finishes.
*/
@@ -66,12 +74,13 @@
*/
public LazyTextExtractorField(
Parser parser, InternalValue value, Metadata metadata,
- Executor executor, boolean highlighting) {
+ Executor executor, boolean highlighting, int maxFieldLength) {
super(FieldNames.FULLTEXT,
highlighting ? Store.YES : Store.NO,
Field.Index.ANALYZED,
highlighting ? TermVector.WITH_OFFSETS : TermVector.NO);
- executor.execute(new ParsingTask(parser, value, metadata));
+ executor.execute(
+ new ParsingTask(parser, value, metadata, maxFieldLength));
}
/**
@@ -137,7 +146,7 @@
/**
* The background task for extracting text from a binary value.
*/
- private class ParsingTask implements Runnable {
+ private class ParsingTask extends DefaultHandler implements Runnable {
private final Parser parser;
@@ -145,28 +154,52 @@
private final Metadata metadata;
+ private final int maxFieldLength;
+
+ private final StringBuilder builder = new StringBuilder();
+
public ParsingTask(
- Parser parser, InternalValue value, Metadata metadata) {
+ Parser parser, InternalValue value, Metadata metadata,
+ int maxFieldLength) {
this.parser = parser;
this.value = value;
this.metadata = metadata;
+ this.maxFieldLength = maxFieldLength;
}
public void run() {
- ContentHandler handler = new BodyContentHandler();
try {
InputStream stream = value.getStream();
try {
- parser.parse(stream, handler, metadata);
+ parser.parse(stream, this, metadata, new ParseContext());
} finally {
stream.close();
}
} catch (Throwable t) {
- log.warn("Failed to extract text from a binary property", t);
+ if (t != STOP) {
+ log.warn("Failed to extract text from a binary property", t);
+ }
} finally {
value.discard();
}
- setExtractedText(handler.toString());
+ setExtractedText(builder.toString());
+ }
+
+ @Override
+ public void characters(char[] ch, int start, int length)
+ throws SAXException {
+ builder.append(
+ ch, start,
+ Math.min(length, maxFieldLength - builder.length()));
+ if (builder.length() >= maxFieldLength) {
+ throw STOP;
+ }
+ }
+
+ @Override
+ public void ignorableWhitespace(char[] ch, int start, int length)
+ throws SAXException {
+ characters(ch, start, length);
}
}
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java?rev=915356&r1=915355&r2=915356&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java Tue Feb 23 14:32:15 2010
@@ -120,6 +120,11 @@
protected List<Fieldable> doNotUseInExcerpt = new ArrayList<Fieldable>();
/**
+ * The maximum number of characters to extract from binaries.
+ */
+ private int maxExtractLength = Integer.MAX_VALUE;
+
+ /**
* Creates a new node indexer.
*
* @param node the node state to index.
@@ -176,6 +181,24 @@
}
/**
+ * Returns the maximum number of characters to extract from binaries.
+ *
+ * @return maximum extraction length
+ */
+ public int getMaxExtractLength() {
+ return maxExtractLength;
+ }
+
+ /**
+ * Sets the maximum number of characters to extract from binaries.
+ *
+ * @param length maximum extraction length
+ */
+ public void setMaxExtractLength(int length) {
+ this.maxExtractLength = length;
+ }
+
+ /**
* Creates a lucene Document.
*
* @return the lucene Document with the index layout.
@@ -818,7 +841,8 @@
protected Fieldable createFulltextField(
InternalValue value, Metadata metadata) {
return new LazyTextExtractorField(
- parser, value, metadata, executor, supportHighlighting);
+ parser, value, metadata, executor,
+ supportHighlighting, getMaxExtractLength());
}
/**
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java?rev=915356&r1=915355&r2=915356&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java Tue Feb 23 14:32:15 2010
@@ -260,6 +260,13 @@
private int maxFieldLength = DEFAULT_MAX_FIELD_LENGTH;
/**
+ * maxExtractLength config parameter. Non-negative values are used as-is;
+ * negative values are negated and multiplied by the maxFieldLength
+ * parameter (the default of -10 means ten times maxFieldLength).
+ */
+ private int maxExtractLength = -10;
+
+ /**
* extractorPoolSize config parameter
*/
private int extractorPoolSize = 2 * Runtime.getRuntime().availableProcessors();
@@ -1092,6 +1099,7 @@
indexer.setSupportHighlighting(supportHighlighting);
indexer.setIndexingConfiguration(indexingConfig);
indexer.setIndexFormatVersion(indexFormatVersion);
+ indexer.setMaxExtractLength(getMaxExtractLength());
Document doc = indexer.createDoc();
mergeAggregatedNodeIndexes(node, doc, indexFormatVersion);
return doc;
@@ -1832,6 +1840,18 @@
return maxFieldLength;
}
+ public void setMaxExtractLength(int length) {
+ maxExtractLength = length;
+ }
+
+ public int getMaxExtractLength() {
+ if (maxExtractLength < 0) {
+ return -maxExtractLength * maxFieldLength;
+ } else {
+ return maxExtractLength;
+ }
+ }
+
/**
* Sets the list of text extractors (and text filters) to use for
* extracting text content from binary properties. The list must be