You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by ch...@apache.org on 2015/07/13 13:28:11 UTC
svn commit: r1690637 - in /jackrabbit/oak/trunk/oak-lucene/src:
main/java/org/apache/jackrabbit/oak/plugins/index/lucene/
test/java/org/apache/jackrabbit/oak/plugins/index/lucene/
Author: chetanm
Date: Mon Jul 13 11:28:11 2015
New Revision: 1690637
URL: http://svn.apache.org/r1690637
Log:
OAK-2892 - Speed up lucene indexing post migration by pre extracting the text content from binaries
Make use of PreExtractedTextProvider in oak-lucene
-- Exposed an MBean to provide some stats around text extraction
-- Wire up PreExtractedTextProvider via OSGi
Added:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java (with props)
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java (with props)
Modified:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorProvider.java
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
Added: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java?rev=1690637&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java (added)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java Mon Jul 13 11:28:11 2015
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.index.lucene;
+
+import java.io.IOException;
+
+import javax.annotation.CheckForNull;
+
+import org.apache.jackrabbit.oak.api.Blob;
+import org.apache.jackrabbit.oak.commons.IOUtils;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.PreExtractedTextProvider;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static org.apache.jackrabbit.oak.commons.PathUtils.concat;
+
+/**
+ * Looks up text that has already been extracted for a binary via an optional
+ * {@link PreExtractedTextProvider} and accumulates text extraction statistics which
+ * are published through {@link TextExtractionStatsMBean}.
+ *
+ * <p>The provider can be replaced or cleared at any time through
+ * {@link #setExtractedTextProvider(PreExtractedTextProvider)}, hence the field is volatile.
+ * NOTE(review): the statistics counters are plain fields updated without synchronization;
+ * confirm that approximate values are acceptable if updates and MBean reads happen on
+ * different threads.
+ */
+class ExtractedTextCache {
+    private static final String EMPTY_STRING = "";
+    private final Logger log = LoggerFactory.getLogger(getClass());
+    private volatile PreExtractedTextProvider extractedTextProvider;
+    private int textExtractionCount;
+    private long totalBytesRead;
+    private long totalTextSize;
+    private long totalTime;
+    private int preFetchedCount;
+
+    /**
+     * Get the pre extracted text for given blob
+     *
+     * @param nodePath path of the node holding the binary property
+     * @param propertyName name of the binary property
+     * @param blob binary whose text is being looked up
+     * @param reindexMode whether indexing runs in reindex mode; the provider is only
+     *                    consulted when this is true
+     * @return null if no pre extracted text entry found. Otherwise returns the pre extracted
+     *      text
+     */
+    @CheckForNull
+    public String get(String nodePath, String propertyName, Blob blob, boolean reindexMode){
+        //Read the volatile field only once. The provider can be unset concurrently
+        //and a second read after the null check could observe null
+        PreExtractedTextProvider provider = extractedTextProvider;
+
+        //Consult the PreExtractedTextProvider only in reindex mode and not in
+        //incremental indexing mode, as the provider would only contain older entries.
+        //This also avoids loading of various state (See DataStoreTextWriter)
+        if (!reindexMode || provider == null){
+            return null;
+        }
+
+        String result = null;
+        String propertyPath = concat(nodePath, propertyName);
+        try {
+            ExtractedText text = provider.getText(propertyPath, blob);
+            if (text != null) {
+                preFetchedCount++;
+                switch (text.getExtractionResult()) {
+                    case SUCCESS:
+                        result = text.getExtractedText().toString();
+                        break;
+                    case ERROR:
+                        result = LuceneIndexEditor.TEXT_EXTRACTION_ERROR;
+                        break;
+                    case EMPTY:
+                        result = EMPTY_STRING;
+                        break;
+                }
+            }
+        } catch (IOException e) {
+            //Best effort lookup: returning null lets the caller extract the text itself
+            log.warn("Error occurred while fetching pre extracted text for {}", propertyPath, e);
+        }
+        return result;
+    }
+
+    /**
+     * Hook for recording freshly extracted text for a blob. Currently a no-op:
+     * nothing is stored.
+     */
+    public void put(Blob blob, ExtractedText extractedText){
+
+    }
+
+    /**
+     * Adds the given values to the running totals reported by the stats MBean.
+     *
+     * @param count number of text extractions performed
+     * @param timeInMillis time spent in those extractions, in milliseconds
+     * @param bytesRead number of bytes read from the binaries
+     * @param textLength length of the extracted text
+     */
+    public void addStats(int count, long timeInMillis, long bytesRead, long textLength){
+        this.textExtractionCount += count;
+        this.totalTime += timeInMillis;
+        this.totalBytesRead += bytesRead;
+        this.totalTextSize += textLength;
+    }
+
+    /**
+     * Creates a new MBean view over the live counters of this cache; each getter
+     * reads the current field value at the time it is called.
+     */
+    public TextExtractionStatsMBean getStatsMBean(){
+        return new TextExtractionStatsMBean() {
+            @Override
+            public boolean isPreExtractedTextProviderConfigured() {
+                return extractedTextProvider != null;
+            }
+
+            @Override
+            public int getTextExtractionCount() {
+                return textExtractionCount;
+            }
+
+            @Override
+            public long getTotalTime() {
+                return totalTime;
+            }
+
+            @Override
+            public int getPreFetchedCount() {
+                return preFetchedCount;
+            }
+
+            @Override
+            public String getExtractedTextSize() {
+                return IOUtils.humanReadableByteCount(totalTextSize);
+            }
+
+            @Override
+            public String getBytesRead() {
+                return IOUtils.humanReadableByteCount(totalBytesRead);
+            }
+        };
+    }
+
+    /**
+     * Sets the provider consulted by {@link #get}; passing null removes the current provider.
+     */
+    public void setExtractedTextProvider(PreExtractedTextProvider extractedTextProvider) {
+        this.extractedTextProvider = extractedTextProvider;
+    }
+
+    /**
+     * @return the currently configured provider, or null if none is set
+     */
+    public PreExtractedTextProvider getExtractedTextProvider() {
+        return extractedTextProvider;
+    }
+}
Propchange: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java?rev=1690637&r1=1690636&r2=1690637&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java Mon Jul 13 11:28:11 2015
@@ -52,6 +52,8 @@ import org.apache.jackrabbit.oak.commons
import org.apache.jackrabbit.oak.plugins.index.IndexEditor;
import org.apache.jackrabbit.oak.plugins.index.IndexUpdateCallback;
import org.apache.jackrabbit.oak.plugins.index.PathFilter;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText.ExtractionResult;
import org.apache.jackrabbit.oak.plugins.index.lucene.Aggregate.Matcher;
import org.apache.jackrabbit.oak.plugins.memory.EmptyNodeState;
import org.apache.jackrabbit.oak.plugins.tree.TreeFactory;
@@ -86,6 +88,7 @@ public class LuceneIndexEditor implement
private static final Logger log =
LoggerFactory.getLogger(LuceneIndexEditor.class);
+ static final String TEXT_EXTRACTION_ERROR = "TextExtractionError";
private final LuceneIndexEditorContext context;
@@ -122,12 +125,14 @@ public class LuceneIndexEditor implement
private final PathFilter.Result pathFilterResult;
LuceneIndexEditor(NodeState root, NodeBuilder definition,
- IndexUpdateCallback updateCallback,@Nullable IndexCopier indexCopier) throws CommitFailedException {
+ IndexUpdateCallback updateCallback,
+ @Nullable IndexCopier indexCopier,
+ ExtractedTextCache extractedTextCache) throws CommitFailedException {
this.parent = null;
this.name = null;
this.path = "/";
this.context = new LuceneIndexEditorContext(root, definition,
- updateCallback, indexCopier);
+ updateCallback, indexCopier, extractedTextCache);
this.root = root;
this.isDeleted = false;
this.matcherState = MatcherState.NONE;
@@ -554,12 +559,16 @@ public class LuceneIndexEditor implement
}
for (Blob v : property.getValue(Type.BINARIES)) {
+ String value = parseStringValue(v, metadata, path, property.getName());
+ if (value == null){
+ continue;
+ }
+
if (nodePath != null){
- fields.add(newFulltextField(nodePath, parseStringValue(v, metadata, path)));
+ fields.add(newFulltextField(nodePath, value));
} else {
- fields.add(newFulltextField(parseStringValue(v, metadata, path)));
+ fields.add(newFulltextField(value));
}
-
}
return fields;
}
@@ -832,16 +841,24 @@ public class LuceneIndexEditor implement
return context.isSupportedMediaType(type);
}
- private String parseStringValue(Blob v, Metadata metadata, String path) {
+    /**
+     * Returns the text for the given binary, preferring an entry from the extracted
+     * text cache and falling back to parsing the binary via
+     * {@code parseStringValue0} when the cache has none.
+     */
+    private String parseStringValue(Blob v, Metadata metadata, String path, String propertyName) {
+        ExtractedTextCache cache = context.getExtractedTextCache();
+        String preExtracted = cache.get(path, propertyName, v, context.isReindex());
+        if (preExtracted != null){
+            return preExtracted;
+        }
+        //Nothing pre extracted for this binary, extract the text now
+        return parseStringValue0(v, metadata, path);
+    }
+
+ private String parseStringValue0(Blob v, Metadata metadata, String path) {
WriteOutContentHandler handler = new WriteOutContentHandler(context.getDefinition().getMaxExtractLength());
long start = System.currentTimeMillis();
- long size = 0;
+ long bytesRead = 0;
try {
CountingInputStream stream = new CountingInputStream(new LazyInputStream(new BlobByteSource(v)));
try {
context.getParser().parse(stream, handler, metadata, new ParseContext());
} finally {
- size = stream.getCount();
+ bytesRead = stream.getCount();
stream.close();
}
} catch (LinkageError e) {
@@ -859,11 +876,15 @@ public class LuceneIndexEditor implement
+ " worry about. The stack trace is included to"
+ " help improve the text extraction feature.",
getIndexName(), path, t);
- return "TextExtractionError";
+ context.getExtractedTextCache().put(v, ExtractedText.ERROR);
+ return TEXT_EXTRACTION_ERROR;
}
}
String result = handler.toString();
- context.recordTextExtractionStats(System.currentTimeMillis() - start, size);
+ if (bytesRead > 0) {
+ context.recordTextExtractionStats(System.currentTimeMillis() - start, bytesRead, result.length());
+ }
+ context.getExtractedTextCache().put(v, new ExtractedText(ExtractionResult.SUCCESS, result));
return result;
}
Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java?rev=1690637&r1=1690636&r2=1690637&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java Mon Jul 13 11:28:11 2015
@@ -131,18 +131,20 @@ public class LuceneIndexEditorContext {
private final TextExtractionStats textExtractionStats = new TextExtractionStats();
+ private final ExtractedTextCache extractedTextCache;
/**
* The media types supported by the parser used.
*/
private Set<MediaType> supportedMediaTypes;
LuceneIndexEditorContext(NodeState root, NodeBuilder definition, IndexUpdateCallback updateCallback,
- @Nullable IndexCopier indexCopier) {
+ @Nullable IndexCopier indexCopier, ExtractedTextCache extractedTextCache) {
this.definitionBuilder = definition;
this.indexCopier = indexCopier;
this.definition = new IndexDefinition(root, definition);
this.indexedNodes = 0;
this.updateCallback = updateCallback;
+ this.extractedTextCache = extractedTextCache;
if (this.definition.isOfOldFormat()){
IndexDefinition.updateDefinition(definition);
}
@@ -201,6 +203,7 @@ public class LuceneIndexEditorContext {
PERF_LOGGER.end(start, -1, "Closed IndexWriter for directory {}", definition);
textExtractionStats.log(reindex);
+ textExtractionStats.collectStats(extractedTextCache);
}
}
@@ -270,8 +273,22 @@ public class LuceneIndexEditorContext {
return definition;
}
- public void recordTextExtractionStats(long timeInMillis, long size) {
- textExtractionStats.addStats(timeInMillis, size);
+ @Deprecated
+ public void recordTextExtractionStats(long timeInMillis, long bytesRead) {
+ //Keeping deprecated method to avoid major version change
+ recordTextExtractionStats(timeInMillis, bytesRead, 0);
+ }
+
+ public void recordTextExtractionStats(long timeInMillis, long bytesRead, int textLength) {
+ textExtractionStats.addStats(timeInMillis, bytesRead, textLength);
+ }
+
+ ExtractedTextCache getExtractedTextCache() {
+ return extractedTextCache;
+ }
+
+ public boolean isReindex() {
+ return reindex;
}
private static Parser initializeTikaParser(IndexDefinition definition) {
@@ -330,13 +347,15 @@ public class LuceneIndexEditorContext {
*/
private static final long LOGGING_THRESHOLD = TimeUnit.MINUTES.toMillis(2);
private int count;
- private long totalSize;
+ private long totalBytesRead;
private long totalTime;
+ private long totalTextLength;
- public void addStats(long timeInMillis, long size) {
+ public void addStats(long timeInMillis, long bytesRead, int textLength) {
count++;
- totalSize += size;
+ totalBytesRead += bytesRead;
totalTime += timeInMillis;
+ totalTextLength += textLength;
}
public void log(boolean reindex) {
@@ -347,6 +366,10 @@ public class LuceneIndexEditorContext {
}
}
+ public void collectStats(ExtractedTextCache cache){
+ cache.addStats(count, totalTime, totalBytesRead, totalTextLength);
+ }
+
private boolean isTakingLotsOfTime() {
return totalTime > LOGGING_THRESHOLD;
}
@@ -357,8 +380,8 @@ public class LuceneIndexEditorContext {
@Override
public String toString() {
- return String.format(" %d (%s, %s)", count,
- timeInWords(totalTime), humanReadableByteCount(totalSize));
+ return String.format(" %d (Time Taken %s, Bytes Read %s, Extracted text size %d)", count,
+ timeInWords(totalTime), humanReadableByteCount(totalBytesRead), totalTextLength);
}
private static String timeInWords(long millis) {
Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorProvider.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorProvider.java?rev=1690637&r1=1690636&r2=1690637&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorProvider.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorProvider.java Mon Jul 13 11:28:11 2015
@@ -38,13 +38,20 @@ import org.apache.jackrabbit.oak.spi.sta
*/
public class LuceneIndexEditorProvider implements IndexEditorProvider {
private final IndexCopier indexCopier;
+ private final ExtractedTextCache extractedTextCache;
public LuceneIndexEditorProvider() {
this(null);
}
public LuceneIndexEditorProvider(@Nullable IndexCopier indexCopier) {
+ this(indexCopier, new ExtractedTextCache());
+ }
+
+ public LuceneIndexEditorProvider(@Nullable IndexCopier indexCopier,
+ ExtractedTextCache extractedTextCache) {
this.indexCopier = indexCopier;
+ this.extractedTextCache = extractedTextCache;
}
@Override
@@ -53,7 +60,7 @@ public class LuceneIndexEditorProvider i
@Nonnull IndexUpdateCallback callback)
throws CommitFailedException {
if (TYPE_LUCENE.equals(type)) {
- return new LuceneIndexEditor(root, definition, callback, indexCopier);
+ return new LuceneIndexEditor(root, definition, callback, indexCopier, extractedTextCache);
}
return null;
}
@@ -61,4 +68,8 @@ public class LuceneIndexEditorProvider i
IndexCopier getIndexCopier() {
return indexCopier;
}
+
+ ExtractedTextCache getExtractedTextCache() {
+ return extractedTextCache;
+ }
}
Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java?rev=1690637&r1=1690636&r2=1690637&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java Mon Jul 13 11:28:11 2015
@@ -48,6 +48,7 @@ import org.apache.jackrabbit.oak.commons
import org.apache.jackrabbit.oak.osgi.OsgiWhiteboard;
import org.apache.jackrabbit.oak.plugins.index.IndexEditorProvider;
import org.apache.jackrabbit.oak.plugins.index.aggregate.NodeAggregator;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.PreExtractedTextProvider;
import org.apache.jackrabbit.oak.spi.commit.BackgroundObserver;
import org.apache.jackrabbit.oak.plugins.index.lucene.score.ScorerProviderFactory;
import org.apache.jackrabbit.oak.spi.commit.BackgroundObserverMBean;
@@ -144,6 +145,12 @@ public class LuceneIndexProviderService
@Reference
ScorerProviderFactory scorerFactory;
+ @Reference(policy = ReferencePolicy.DYNAMIC,
+ cardinality = ReferenceCardinality.OPTIONAL_MULTIPLE,
+ policyOption = ReferencePolicyOption.GREEDY
+ )
+ private volatile PreExtractedTextProvider extractedTextProvider;
+
private IndexCopier indexCopier;
private File indexDir;
@@ -152,6 +159,8 @@ public class LuceneIndexProviderService
private int threadPoolSize;
+ private ExtractedTextCache extractedTextCache = new ExtractedTextCache();
+
@Activate
private void activate(BundleContext bundleContext, Map<String, ?> config)
throws NotCompliantMBeanException, IOException {
@@ -231,12 +240,17 @@ public class LuceneIndexProviderService
LuceneIndexEditorProvider editorProvider;
if (enableCopyOnWrite){
initializeIndexCopier(bundleContext, config);
- editorProvider = new LuceneIndexEditorProvider(indexCopier);
+ editorProvider = new LuceneIndexEditorProvider(indexCopier, extractedTextCache);
log.info("Enabling CopyOnWrite support. Index files would be copied under {}", indexDir.getAbsolutePath());
} else {
- editorProvider = new LuceneIndexEditorProvider();
+ editorProvider = new LuceneIndexEditorProvider(null, extractedTextCache);
}
regs.add(bundleContext.registerService(IndexEditorProvider.class.getName(), editorProvider, null));
+ oakRegs.add(registerMBean(whiteboard,
+ TextExtractionStatsMBean.class,
+ editorProvider.getExtractedTextCache().getStatsMBean(),
+ TextExtractionStatsMBean.TYPE,
+ "TextExtraction statistics"));
}
private IndexTracker createTracker(BundleContext bundleContext, Map<String, ?> config) throws IOException {
@@ -359,6 +373,17 @@ public class LuceneIndexProviderService
TokenFilterFactory.reloadTokenFilters(classLoader);
}
+    /**
+     * Hands the given provider to the extracted text cache and logs whether a provider
+     * is being registered (non-null) or unregistered (null). Does nothing when no cache
+     * is present.
+     */
+    private void registerExtractedTextProvider(PreExtractedTextProvider provider){
+        if (extractedTextCache == null){
+            return;
+        }
+
+        if (provider == null){
+            log.info("Unregistering PreExtractedTextProvider with extracted text cache");
+        } else {
+            log.info("Registering PreExtractedTextProvider {} with extracted text cache", provider);
+        }
+        extractedTextCache.setExtractedTextProvider(provider);
+    }
+
protected void bindNodeAggregator(NodeAggregator aggregator) {
this.nodeAggregator = aggregator;
@@ -370,4 +395,14 @@ public class LuceneIndexProviderService
initialize();
}
+ /**
+  * Bind callback for the {@code extractedTextProvider} reference: remembers the given
+  * provider and registers it with the extracted text cache, replacing any provider
+  * set earlier.
+  */
+ protected void bindExtractedTextProvider(PreExtractedTextProvider preExtractedTextProvider){
+ this.extractedTextProvider = preExtractedTextProvider;
+ registerExtractedTextProvider(preExtractedTextProvider);
+ }
+
+    /**
+     * Unbind callback for the {@code extractedTextProvider} reference. Clears the active
+     * provider, both here and in the extracted text cache, but only when the provider
+     * going away is the one currently in use.
+     *
+     * @param preExtractedTextProvider the provider being unbound
+     */
+    protected void unbindExtractedTextProvider(PreExtractedTextProvider preExtractedTextProvider){
+        //The reference is declared with multiple cardinality, so several providers can be
+        //bound over time while only the latest one is kept. Unbinding one of the others
+        //must not disable the provider that is still active
+        if (this.extractedTextProvider != preExtractedTextProvider){
+            return;
+        }
+        this.extractedTextProvider = null;
+        registerExtractedTextProvider(null);
+    }
+
}
Added: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java?rev=1690637&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java (added)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java Mon Jul 13 11:28:11 2015
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.index.lucene;
+
+import aQute.bnd.annotation.ProviderType;
+
+/**
+ * Management interface exposing statistics about text extraction performed while
+ * indexing. The descriptions below follow the implementation returned by
+ * {@code ExtractedTextCache.getStatsMBean()}.
+ */
+@ProviderType
+public interface TextExtractionStatsMBean {
+ /** Type name passed when this MBean is registered. */
+ String TYPE = "TextExtractionStats";
+
+ /** @return true if a PreExtractedTextProvider is currently set */
+ boolean isPreExtractedTextProviderConfigured();
+
+ /** @return accumulated number of text extractions recorded so far */
+ int getTextExtractionCount();
+
+ /** @return accumulated time spent in text extraction, recorded in milliseconds */
+ long getTotalTime();
+
+ /** @return number of lookups for which the PreExtractedTextProvider returned an entry */
+ int getPreFetchedCount();
+
+ /** @return accumulated size of the extracted text, formatted as a human readable string */
+ String getExtractedTextSize();
+
+ /** @return accumulated number of bytes read from binaries, formatted as a human readable string */
+ String getBytesRead();
+}
Propchange: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java?rev=1690637&r1=1690636&r2=1690637&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java Mon Jul 13 11:28:11 2015
@@ -19,10 +19,14 @@
package org.apache.jackrabbit.oak.plugins.index.lucene;
+import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
+import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.plugins.index.IndexEditorProvider;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.PreExtractedTextProvider;
import org.apache.jackrabbit.oak.spi.commit.BackgroundObserver;
import org.apache.jackrabbit.oak.spi.commit.Observer;
import org.apache.jackrabbit.oak.spi.query.QueryIndexProvider;
@@ -34,7 +38,6 @@ import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
@@ -126,9 +129,31 @@ public class LuceneIndexProviderServiceT
MockOsgi.deactivate(service);
}
+ /**
+  * Verifies that a provider bound to the service becomes visible through the extracted
+  * text cache of the registered LuceneIndexEditorProvider.
+  */
+ @Test
+ public void preExtractedTextProvider() throws Exception{
+ MockOsgi.activate(service, context.bundleContext(), getDefaultConfig());
+ LuceneIndexEditorProvider editorProvider =
+ (LuceneIndexEditorProvider) context.getService(IndexEditorProvider.class);
+ //No provider has been bound yet
+ assertNull(editorProvider.getExtractedTextCache().getExtractedTextProvider());
+
+ //Mock OSGi does not support components
+ //context.registerService(PreExtractedTextProvider.class, new DummyProvider());
+ //so invoke the bind callback directly
+ service.bindExtractedTextProvider(new DummyProvider());
+
+ assertNotNull(editorProvider.getExtractedTextCache().getExtractedTextProvider());
+ }
+
private Map<String,Object> getDefaultConfig(){
Map<String,Object> config = new HashMap<String, Object>();
config.put("localIndexDir", folder.getRoot().getAbsolutePath());
return config;
}
+
+ /**
+  * Minimal PreExtractedTextProvider used only to check the wiring; it never
+  * returns any text.
+  */
+ private static class DummyProvider implements PreExtractedTextProvider {
+
+ @Override
+ public ExtractedText getText(String propertyPath, Blob blob) throws IOException {
+ //No pre extracted text available for any blob
+ return null;
+ }
+ }
}
Modified: jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java?rev=1690637&r1=1690636&r2=1690637&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java Mon Jul 13 11:28:11 2015
@@ -25,6 +25,7 @@ import java.text.ParseException;
import java.util.Calendar;
import java.util.Collections;
import java.util.List;
+import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ExecutorService;
@@ -51,6 +52,9 @@ import org.apache.jackrabbit.oak.api.Res
import org.apache.jackrabbit.oak.api.Tree;
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.plugins.index.IndexConstants;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText.ExtractionResult;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.PreExtractedTextProvider;
import org.apache.jackrabbit.oak.plugins.index.nodetype.NodeTypeIndexProvider;
import org.apache.jackrabbit.oak.plugins.index.property.PropertyIndexEditorProvider;
import org.apache.jackrabbit.oak.plugins.memory.ArrayBasedBlob;
@@ -93,6 +97,7 @@ import static org.hamcrest.CoreMatchers.
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
import static org.junit.matchers.JUnitMatchers.containsString;
public class LucenePropertyIndexTest extends AbstractQueryTest {
@@ -106,6 +111,8 @@ public class LucenePropertyIndexTest ext
@Rule
public TemporaryFolder temporaryFolder = new TemporaryFolder();
+ private LuceneIndexEditorProvider editorProvider;
+
@Override
protected void createTestIndexNode() throws Exception {
setTraversalEnabled(false);
@@ -113,6 +120,7 @@ public class LucenePropertyIndexTest ext
@Override
protected ContentRepository createRepository() {
+ editorProvider = new LuceneIndexEditorProvider(createIndexCopier());
LuceneIndexProvider provider = new LuceneIndexProvider();
return new Oak()
.with(new InitialContent())
@@ -1286,6 +1294,41 @@ public class LucenePropertyIndexTest ext
}
@Test
+ public void preExtractedTextProvider() throws Exception{
+ Tree idx = createFulltextIndex(root.getTree("/"), "test");
+ TestUtil.useV2(idx);
+ root.commit();
+
+ AccessStateProvidingBlob testBlob =
+ new AccessStateProvidingBlob("fox is jumping", "id1");
+
+ MapBasedProvider textProvider = new MapBasedProvider();
+ textProvider.write("id1","lion");
+ editorProvider.getExtractedTextCache().setExtractedTextProvider(textProvider);
+
+ Tree test = root.getTree("/").addChild("test");
+ createFileNode(test, "text", testBlob, "text/plain");
+ root.commit();
+
+ //As its not a reindex case actual blob content would be accessed
+ assertTrue(testBlob.isStreamAccessed());
+ assertQuery("select * from [nt:base] where CONTAINS(*, 'fox ')", asList("/test/text/jcr:content"));
+ assertEquals(0, textProvider.accessCount);
+
+ testBlob.resetState();
+
+ //Lets trigger a reindex
+ root.getTree(idx.getPath()).setProperty(IndexConstants.REINDEX_PROPERTY_NAME, true);
+ root.commit();
+
+ //Now the content should be provided by the PreExtractedTextProvider
+ //and instead of fox its lion!
+ assertFalse(testBlob.isStreamAccessed());
+ assertQuery("select * from [nt:base] where CONTAINS(*, 'lion ')", asList("/test/text/jcr:content"));
+ assertEquals(1, textProvider.accessCount);
+ }
+
+ @Test
public void maxFieldLengthCheck() throws Exception{
Tree idx = createFulltextIndex(root.getTree("/"), "test");
TestUtil.useV2(idx);
@@ -1654,6 +1697,7 @@ public class LucenePropertyIndexTest ext
private static class AccessStateProvidingBlob extends ArrayBasedBlob {
private CountingInputStream stream;
+ private String id;
public AccessStateProvidingBlob(byte[] value) {
super(value);
@@ -1663,6 +1707,11 @@ public class LucenePropertyIndexTest ext
this(content.getBytes(Charsets.UTF_8));
}
+ public AccessStateProvidingBlob(String content, String id) {
+ this(content.getBytes(Charsets.UTF_8));
+ this.id = id;
+ }
+
@Nonnull
@Override
public InputStream getNewStream() {
@@ -1684,5 +1733,32 @@ public class LucenePropertyIndexTest ext
}
return stream.getCount();
}
+
+ @Override
+ public String getContentIdentity() {
+ return id;
+ }
+ }
+
+ private static class MapBasedProvider implements PreExtractedTextProvider {
+ final Map<String, ExtractedText> idMap = Maps.newHashMap();
+ int accessCount = 0;
+
+ @Override
+ public ExtractedText getText(String propertyPath, Blob blob) throws IOException {
+ ExtractedText result = idMap.get(blob.getContentIdentity());
+ if (result != null){
+ accessCount++;
+ }
+ return result;
+ }
+
+ public void write(String id, String text){
+ idMap.put(id, new ExtractedText(ExtractionResult.SUCCESS, text));
+ }
+
+ public void reset(){
+ accessCount = 0;
+ }
}
}