You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by mr...@apache.org on 2008/09/11 09:51:41 UTC
svn commit: r694164 - in
/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene:
AbstractExcerpt.java AbstractIndex.java LazyTextExtractorField.java
NodeIndexer.java SearchIndex.java Util.java
Author: mreutegg
Date: Thu Sep 11 00:51:40 2008
New Revision: 694164
URL: http://svn.apache.org/viewvc?rev=694164&view=rev
Log:
JCR-1730: Background text extraction not possible when supportHighlighting is set true
Added:
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorField.java (with props)
Modified:
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractExcerpt.java
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractIndex.java
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/Util.java
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractExcerpt.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractExcerpt.java?rev=694164&r1=694163&r2=694164&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractExcerpt.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractExcerpt.java Thu Sep 11 00:51:40 2008
@@ -26,7 +26,7 @@
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;
import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Fieldable;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.jackrabbit.core.NodeId;
@@ -98,7 +98,7 @@
} finally {
tDocs.close();
}
- Field[] fields = doc.getFields(FieldNames.FULLTEXT);
+ Fieldable[] fields = doc.getFieldables(FieldNames.FULLTEXT);
if (fields == null) {
log.debug("Fulltext field not stored, using {}",
SimpleExcerptProvider.class.getName());
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractIndex.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractIndex.java?rev=694164&r1=694163&r2=694164&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractIndex.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/AbstractIndex.java Thu Sep 11 00:51:40 2008
@@ -23,6 +23,7 @@
import org.apache.lucene.store.Directory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Fieldable;
import org.apache.lucene.search.Similarity;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -417,8 +418,8 @@
Document copy = new Document();
Iterator fields = doc.getFields().iterator();
while (fields.hasNext()) {
- Field f = (Field) fields.next();
- Field field = null;
+ Fieldable f = (Fieldable) fields.next();
+ Fieldable field = null;
Field.TermVector tv = getTermVectorParameter(f);
Field.Store stored = getStoreParameter(f);
Field.Index indexed = getIndexParameter(f);
@@ -510,7 +511,7 @@
* @param f a lucene field.
* @return the index parameter on <code>f</code>.
*/
- private Field.Index getIndexParameter(Field f) {
+ private Field.Index getIndexParameter(Fieldable f) {
if (!f.isIndexed()) {
return Field.Index.NO;
} else if (f.isTokenized()) {
@@ -526,7 +527,7 @@
* @param f a lucene field.
* @return the store parameter on <code>f</code>.
*/
- private Field.Store getStoreParameter(Field f) {
+ private Field.Store getStoreParameter(Fieldable f) {
if (f.isCompressed()) {
return Field.Store.COMPRESS;
} else if (f.isStored()) {
@@ -542,7 +543,7 @@
* @param f a lucene field.
* @return the term vector parameter on <code>f</code>.
*/
- private Field.TermVector getTermVectorParameter(Field f) {
+ private Field.TermVector getTermVectorParameter(Fieldable f) {
if (f.isStorePositionWithTermVector() && f.isStoreOffsetWithTermVector()) {
return Field.TermVector.WITH_POSITIONS_OFFSETS;
} else if (f.isStorePositionWithTermVector()) {
Added: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorField.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorField.java?rev=694164&view=auto
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorField.java (added)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorField.java Thu Sep 11 00:51:40 2008
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.lucene;
+
+import org.apache.lucene.document.AbstractField;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.commons.io.IOUtils;
+import org.slf4j.LoggerFactory;
+import org.slf4j.Logger;
+
+import java.io.Reader;
+import java.io.IOException;
+
+/**
+ * <code>LazyTextExtractorField</code> implements a Lucene field with a String
+ * value that is lazily initialized from a given {@link Reader}. In addition
+ * this class provides a method to find out whether the purpose of the reader
+ * is to extract text and whether the extraction process is already finished.
+ *
+ * @see #isExtractorFinished()
+ */
+public class LazyTextExtractorField extends AbstractField {
+
+ /** The serial version UID. */
+ private static final long serialVersionUID = -2707986404659820071L;
+
+ /** The logger instance for this class. */
+ private static final Logger log = LoggerFactory.getLogger(LazyTextExtractorField.class);
+
+ /**
+ * The reader from where to read the text extract.
+ */
+ private final Reader reader;
+
+ /**
+ * The extract read lazily from {@link #reader}; <code>null</code> until then.
+ */
+ private String extract;
+
+ /**
+ * Creates a new, always tokenized <code>LazyTextExtractorField</code> with
+ * the given <code>name</code>.
+ *
+ * @param name the name of the field.
+ * @param reader the reader where to obtain the string from.
+ * @param store when set <code>true</code> the string value is stored in the
+ * index.
+ * @param withOffsets when set <code>true</code> a term vector with offsets
+ * is written into the index.
+ */
+ public LazyTextExtractorField(String name,
+ Reader reader,
+ boolean store,
+ boolean withOffsets) {
+ super(name,
+ store ? Field.Store.YES : Field.Store.NO,
+ Field.Index.TOKENIZED,
+ withOffsets ? Field.TermVector.WITH_OFFSETS : Field.TermVector.NO);
+ this.reader = reader;
+ }
+
+ /**
+ * Returns the text extract. The first call reads the {@link #reader}
+ * until it is exhausted, closes it and caches the result. If reading fails
+ * the text read so far is cached and the error is only logged.
+ *
+ * @return the string value of this field.
+ */
+ public String stringValue() {
+ if (extract == null) {
+ StringBuffer textExtract = new StringBuffer();
+ char[] buffer = new char[1024];
+ int len;
+ try {
+ while ((len = reader.read(buffer)) > -1) {
+ textExtract.append(buffer, 0, len);
+ }
+ } catch (IOException e) {
+ log.warn("Exception reading value for field: "
+ + e.getMessage());
+ log.debug("Dump:", e);
+ } finally {
+ IOUtils.closeQuietly(reader);
+ }
+ extract = textExtract.toString();
+ }
+ return extract;
+ }
+
+ /**
+ * @return always <code>null</code>.
+ */
+ public Reader readerValue() {
+ return null;
+ }
+
+ /**
+ * @return always <code>null</code>.
+ */
+ public byte[] binaryValue() {
+ return null;
+ }
+
+ /**
+ * @return always <code>null</code>.
+ */
+ public TokenStream tokenStreamValue() {
+ return null;
+ }
+
+ /**
+ * @return the extractor state reported by a {@link TextExtractorReader};
+ * always <code>true</code> for any other kind of reader.
+ */
+ public boolean isExtractorFinished() {
+ if (reader instanceof TextExtractorReader) {
+ return ((TextExtractorReader) reader).isExtractorFinished();
+ }
+ return true;
+ }
+
+ /**
+ * Disposes this field and closes the underlying reader.
+ *
+ * @throws IOException if an error occurs while closing the reader.
+ */
+ public void dispose() throws IOException {
+ reader.close();
+ }
+}
Propchange: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/LazyTextExtractorField.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java?rev=694164&r1=694163&r2=694164&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java Thu Sep 11 00:51:40 2008
@@ -16,7 +16,6 @@
*/
package org.apache.jackrabbit.core.query.lucene;
-import org.apache.commons.io.IOUtils;
import org.apache.jackrabbit.core.PropertyId;
import org.apache.jackrabbit.core.NodeId;
import org.apache.jackrabbit.core.state.ItemStateException;
@@ -37,6 +36,7 @@
import org.slf4j.LoggerFactory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Fieldable;
import javax.jcr.NamespaceException;
import javax.jcr.PropertyType;
@@ -44,7 +44,6 @@
import java.io.InputStream;
import java.io.Reader;
-import java.io.IOException;
import java.util.Calendar;
import java.util.Iterator;
import java.util.Set;
@@ -249,7 +248,7 @@
// now add fields that are not used in excerpt (must go at the end)
for (Iterator it = doNotUseInExcerpt.iterator(); it.hasNext(); ) {
- doc.add((Field) it.next());
+ doc.add((Fieldable) it.next());
}
return doc;
}
@@ -761,26 +760,11 @@
* @param value the reader value.
* @return a lucene field.
*/
- protected Field createFulltextField(Reader value) {
+ protected Fieldable createFulltextField(Reader value) {
if (supportHighlighting) {
- // need to create a string value
- StringBuffer textExtract = new StringBuffer();
- char[] buffer = new char[1024];
- int len;
- try {
- while ((len = value.read(buffer)) > -1) {
- textExtract.append(buffer, 0, len);
- }
- } catch (IOException e) {
- log.warn("Exception reading value for fulltext field: "
- + e.getMessage());
- log.debug("Dump:", e);
- } finally {
- IOUtils.closeQuietly(value);
- }
- return createFulltextField(textExtract.toString(), true, true);
+ return new LazyTextExtractorField(FieldNames.FULLTEXT, value, true, true);
} else {
- return new Field(FieldNames.FULLTEXT, value);
+ return new LazyTextExtractorField(FieldNames.FULLTEXT, value, false, false);
}
}
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java?rev=694164&r1=694163&r2=694164&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/SearchIndex.java Thu Sep 11 00:51:40 2008
@@ -55,6 +55,7 @@
import org.apache.lucene.search.Similarity;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Fieldable;
import org.apache.commons.collections.iterators.AbstractIteratorDecorator;
import org.xml.sax.SAXException;
import org.w3c.dom.Element;
@@ -1111,7 +1112,7 @@
getNamespaceMappings(),
index.getIndexFormatVersion());
// transfer fields to doc if there are any
- Field[] fulltextFields = aDoc.getFields(FieldNames.FULLTEXT);
+ Fieldable[] fulltextFields = aDoc.getFieldables(FieldNames.FULLTEXT);
if (fulltextFields != null) {
for (int k = 0; k < fulltextFields.length; k++) {
doc.add(fulltextFields[k]);
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/Util.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/Util.java?rev=694164&r1=694163&r2=694164&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/Util.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/Util.java Thu Sep 11 00:51:40 2008
@@ -17,14 +17,14 @@
package org.apache.jackrabbit.core.query.lucene;
import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Fieldable;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.index.Term;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;
-import java.util.Enumeration;
+import java.util.Iterator;
import java.io.IOException;
/**
@@ -44,15 +44,17 @@
* @param old the document to dispose.
*/
public static void disposeDocument(Document old) {
- Enumeration e = old.fields();
- while (e.hasMoreElements()) {
- Field f = (Field) e.nextElement();
- if (f.readerValue() != null) {
- try {
+ for (Iterator it = old.getFields().iterator(); it.hasNext(); ) {
+ Fieldable f = (Fieldable) it.next();
+ try {
+ if (f.readerValue() != null) {
f.readerValue().close();
- } catch (IOException ex) {
- log.warn("Exception while disposing index document: " + ex);
+ } else if (f instanceof LazyTextExtractorField) {
+ LazyTextExtractorField field = (LazyTextExtractorField) f;
+ field.dispose();
}
+ } catch (IOException ex) {
+ log.warn("Exception while disposing index document: " + ex);
}
}
}
@@ -66,12 +68,11 @@
* otherwise.
*/
public static boolean isDocumentReady(Document doc) {
- Enumeration fields = doc.fields();
- while (fields.hasMoreElements()) {
- Field f = (Field) fields.nextElement();
- if (f.readerValue() instanceof TextExtractorReader) {
- TextExtractorReader r = (TextExtractorReader) f.readerValue();
- if (!r.isExtractorFinished()) {
+ for (Iterator it = doc.getFields().iterator(); it.hasNext(); ) {
+ Fieldable f = (Fieldable) it.next();
+ if (f instanceof LazyTextExtractorField) {
+ LazyTextExtractorField field = (LazyTextExtractorField) f;
+ if (!field.isExtractorFinished()) {
return false;
}
}