You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by ch...@apache.org on 2015/07/13 13:28:11 UTC

svn commit: r1690637 - in /jackrabbit/oak/trunk/oak-lucene/src: main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ test/java/org/apache/jackrabbit/oak/plugins/index/lucene/

Author: chetanm
Date: Mon Jul 13 11:28:11 2015
New Revision: 1690637

URL: http://svn.apache.org/r1690637
Log:
OAK-2892 - Speed up lucene indexing post migration by pre extracting the text content from binaries

Make use of PreExtractedTextProvider in oak-lucene
-- Exposed an MBean to provide some stats around text extraction
-- Wire up PreExtractedTextProvider via OSGi

Added:
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java   (with props)
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java   (with props)
Modified:
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorProvider.java
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java
    jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java
    jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java

Added: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java?rev=1690637&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java (added)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java Mon Jul 13 11:28:11 2015
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.index.lucene;
+
+import java.io.IOException;
+
+import javax.annotation.CheckForNull;
+
+import org.apache.jackrabbit.oak.api.Blob;
+import org.apache.jackrabbit.oak.commons.IOUtils;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.PreExtractedTextProvider;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static org.apache.jackrabbit.oak.commons.PathUtils.concat;
+
+class ExtractedTextCache {
+    private static final String EMPTY_STRING = "";
+    private final Logger log = LoggerFactory.getLogger(getClass());
+    private volatile PreExtractedTextProvider extractedTextProvider;
+    private int textExtractionCount;
+    private long totalBytesRead;
+    private long totalTextSize;
+    private long totalTime;
+    private int preFetchedCount;
+
+    /**
+     * Get the pre-extracted text for the given blob.
+     * @return null if no pre-extracted text entry is found; otherwise the pre-extracted
+     *  text
+     */
+    @CheckForNull
+    public String get(String nodePath, String propertyName, Blob blob, boolean reindexMode){
+        String result = null;
+        //Consult the PreExtractedTextProvider only in reindex mode and not in
+        //incremental indexing mode, as that would only contain older entries.
+        //That also avoids loading various state (See DataStoreTextWriter)
+        if (reindexMode && extractedTextProvider != null){
+            String propertyPath = concat(nodePath, propertyName);
+            try {
+                ExtractedText text = extractedTextProvider.getText(propertyPath, blob);
+                if (text != null) {
+                    preFetchedCount++;
+                    switch (text.getExtractionResult()) {
+                        case SUCCESS:
+                            result = text.getExtractedText().toString();
+                            break;
+                        case ERROR:
+                            result = LuceneIndexEditor.TEXT_EXTRACTION_ERROR;
+                            break;
+                        case EMPTY:
+                            result = EMPTY_STRING;
+                            break;
+                    }
+                }
+            } catch (IOException e) {
+                log.warn("Error occurred while fetching pre extracted text for {}", propertyPath, e);
+            }
+        }
+        return result;
+    }
+
+    public void put(Blob blob, ExtractedText extractedText){
+
+    }
+
+    public void addStats(int count, long timeInMillis, long bytesRead, long textLength){
+        this.textExtractionCount += count;
+        this.totalTime += timeInMillis;
+        this.totalBytesRead += bytesRead;
+        this.totalTextSize += textLength;
+    }
+
+    public TextExtractionStatsMBean getStatsMBean(){
+        return new TextExtractionStatsMBean() {
+            @Override
+            public boolean isPreExtractedTextProviderConfigured() {
+                return extractedTextProvider != null;
+            }
+
+            @Override
+            public int getTextExtractionCount() {
+                return textExtractionCount;
+            }
+
+            @Override
+            public long getTotalTime() {
+                return totalTime;
+            }
+
+            @Override
+            public int getPreFetchedCount() {
+                return preFetchedCount;
+            }
+
+            @Override
+            public String getExtractedTextSize() {
+                return IOUtils.humanReadableByteCount(totalTextSize);
+            }
+
+            @Override
+            public String getBytesRead() {
+                return IOUtils.humanReadableByteCount(totalBytesRead);
+            }
+        };
+    }
+
+    public void setExtractedTextProvider(PreExtractedTextProvider extractedTextProvider) {
+        this.extractedTextProvider = extractedTextProvider;
+    }
+
+    public PreExtractedTextProvider getExtractedTextProvider() {
+        return extractedTextProvider;
+    }
+}

Propchange: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java?rev=1690637&r1=1690636&r2=1690637&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java Mon Jul 13 11:28:11 2015
@@ -52,6 +52,8 @@ import org.apache.jackrabbit.oak.commons
 import org.apache.jackrabbit.oak.plugins.index.IndexEditor;
 import org.apache.jackrabbit.oak.plugins.index.IndexUpdateCallback;
 import org.apache.jackrabbit.oak.plugins.index.PathFilter;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText.ExtractionResult;
 import org.apache.jackrabbit.oak.plugins.index.lucene.Aggregate.Matcher;
 import org.apache.jackrabbit.oak.plugins.memory.EmptyNodeState;
 import org.apache.jackrabbit.oak.plugins.tree.TreeFactory;
@@ -86,6 +88,7 @@ public class LuceneIndexEditor implement
 
     private static final Logger log =
             LoggerFactory.getLogger(LuceneIndexEditor.class);
+    static final String TEXT_EXTRACTION_ERROR = "TextExtractionError";
 
     private final LuceneIndexEditorContext context;
 
@@ -122,12 +125,14 @@ public class LuceneIndexEditor implement
     private final PathFilter.Result pathFilterResult;
 
     LuceneIndexEditor(NodeState root, NodeBuilder definition,
-        IndexUpdateCallback updateCallback,@Nullable IndexCopier indexCopier) throws CommitFailedException {
+                        IndexUpdateCallback updateCallback,
+                        @Nullable IndexCopier indexCopier,
+                        ExtractedTextCache extractedTextCache) throws CommitFailedException {
         this.parent = null;
         this.name = null;
         this.path = "/";
         this.context = new LuceneIndexEditorContext(root, definition,
-                updateCallback, indexCopier);
+                updateCallback, indexCopier, extractedTextCache);
         this.root = root;
         this.isDeleted = false;
         this.matcherState = MatcherState.NONE;
@@ -554,12 +559,16 @@ public class LuceneIndexEditor implement
         }
 
         for (Blob v : property.getValue(Type.BINARIES)) {
+            String value = parseStringValue(v, metadata, path, property.getName());
+            if (value == null){
+                continue;
+            }
+
             if (nodePath != null){
-                fields.add(newFulltextField(nodePath, parseStringValue(v, metadata, path)));
+                fields.add(newFulltextField(nodePath, value));
             } else {
-                fields.add(newFulltextField(parseStringValue(v, metadata, path)));
+                fields.add(newFulltextField(value));
             }
-
         }
         return fields;
     }
@@ -832,16 +841,24 @@ public class LuceneIndexEditor implement
         return context.isSupportedMediaType(type);
     }
 
-    private String parseStringValue(Blob v, Metadata metadata, String path) {
+    private String parseStringValue(Blob v, Metadata metadata, String path, String propertyName) {
+        String text = context.getExtractedTextCache().get(path, propertyName, v, context.isReindex());
+        if (text == null){
+            text = parseStringValue0(v, metadata, path);
+        }
+        return text;
+    }
+
+    private String parseStringValue0(Blob v, Metadata metadata, String path) {
         WriteOutContentHandler handler = new WriteOutContentHandler(context.getDefinition().getMaxExtractLength());
         long start = System.currentTimeMillis();
-        long size = 0;
+        long bytesRead = 0;
         try {
             CountingInputStream stream = new CountingInputStream(new LazyInputStream(new BlobByteSource(v)));
             try {
                 context.getParser().parse(stream, handler, metadata, new ParseContext());
             } finally {
-                size = stream.getCount();
+                bytesRead = stream.getCount();
                 stream.close();
             }
         } catch (LinkageError e) {
@@ -859,11 +876,15 @@ public class LuceneIndexEditor implement
                         + " worry about. The stack trace is included to"
                         + " help improve the text extraction feature.",
                         getIndexName(), path, t);
-                return "TextExtractionError";
+                context.getExtractedTextCache().put(v, ExtractedText.ERROR);
+                return TEXT_EXTRACTION_ERROR;
             }
         }
         String result = handler.toString();
-        context.recordTextExtractionStats(System.currentTimeMillis() - start, size);
+        if (bytesRead > 0) {
+            context.recordTextExtractionStats(System.currentTimeMillis() - start, bytesRead, result.length());
+        }
+        context.getExtractedTextCache().put(v,  new ExtractedText(ExtractionResult.SUCCESS, result));
         return result;
     }
 

Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java?rev=1690637&r1=1690636&r2=1690637&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java Mon Jul 13 11:28:11 2015
@@ -131,18 +131,20 @@ public class LuceneIndexEditorContext {
 
     private final TextExtractionStats textExtractionStats = new TextExtractionStats();
 
+    private final ExtractedTextCache extractedTextCache;
     /**
      * The media types supported by the parser used.
      */
     private Set<MediaType> supportedMediaTypes;
 
     LuceneIndexEditorContext(NodeState root, NodeBuilder definition, IndexUpdateCallback updateCallback,
-                             @Nullable IndexCopier indexCopier) {
+                             @Nullable IndexCopier indexCopier, ExtractedTextCache extractedTextCache) {
         this.definitionBuilder = definition;
         this.indexCopier = indexCopier;
         this.definition = new IndexDefinition(root, definition);
         this.indexedNodes = 0;
         this.updateCallback = updateCallback;
+        this.extractedTextCache = extractedTextCache;
         if (this.definition.isOfOldFormat()){
             IndexDefinition.updateDefinition(definition);
         }
@@ -201,6 +203,7 @@ public class LuceneIndexEditorContext {
             PERF_LOGGER.end(start, -1, "Closed IndexWriter for directory {}", definition);
 
             textExtractionStats.log(reindex);
+            textExtractionStats.collectStats(extractedTextCache);
         }
     }
 
@@ -270,8 +273,22 @@ public class LuceneIndexEditorContext {
         return definition;
     }
 
-    public void recordTextExtractionStats(long timeInMillis, long size) {
-        textExtractionStats.addStats(timeInMillis, size);
+    @Deprecated
+    public void recordTextExtractionStats(long timeInMillis, long bytesRead) {
+        //Keeping deprecated method to avoid major version change
+        recordTextExtractionStats(timeInMillis, bytesRead, 0);
+    }
+
+    public void recordTextExtractionStats(long timeInMillis, long bytesRead, int textLength) {
+        textExtractionStats.addStats(timeInMillis, bytesRead, textLength);
+    }
+
+    ExtractedTextCache getExtractedTextCache() {
+        return extractedTextCache;
+    }
+
+    public boolean isReindex() {
+        return reindex;
     }
 
     private static Parser initializeTikaParser(IndexDefinition definition) {
@@ -330,13 +347,15 @@ public class LuceneIndexEditorContext {
          */
         private static final long LOGGING_THRESHOLD = TimeUnit.MINUTES.toMillis(2);
         private int count;
-        private long totalSize;
+        private long totalBytesRead;
         private long totalTime;
+        private long totalTextLength;
 
-        public void addStats(long timeInMillis, long size) {
+        public void addStats(long timeInMillis, long bytesRead, int textLength) {
             count++;
-            totalSize += size;
+            totalBytesRead += bytesRead;
             totalTime += timeInMillis;
+            totalTextLength += textLength;
         }
 
         public void log(boolean reindex) {
@@ -347,6 +366,10 @@ public class LuceneIndexEditorContext {
             }
         }
 
+        public void collectStats(ExtractedTextCache cache){
+            cache.addStats(count, totalTime, totalBytesRead, totalTextLength);
+        }
+
         private boolean isTakingLotsOfTime() {
             return totalTime > LOGGING_THRESHOLD;
         }
@@ -357,8 +380,8 @@ public class LuceneIndexEditorContext {
 
         @Override
         public String toString() {
-            return String.format(" %d (%s, %s)", count,
-                    timeInWords(totalTime), humanReadableByteCount(totalSize));
+            return String.format(" %d (Time Taken %s, Bytes Read %s, Extracted text size %d)", count,
+                    timeInWords(totalTime), humanReadableByteCount(totalBytesRead), totalTextLength);
         }
 
         private static String timeInWords(long millis) {

Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorProvider.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorProvider.java?rev=1690637&r1=1690636&r2=1690637&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorProvider.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorProvider.java Mon Jul 13 11:28:11 2015
@@ -38,13 +38,20 @@ import org.apache.jackrabbit.oak.spi.sta
  */
 public class LuceneIndexEditorProvider implements IndexEditorProvider {
     private final IndexCopier indexCopier;
+    private final ExtractedTextCache extractedTextCache;
 
     public LuceneIndexEditorProvider() {
         this(null);
     }
 
     public LuceneIndexEditorProvider(@Nullable IndexCopier indexCopier) {
+        this(indexCopier, new ExtractedTextCache());
+    }
+
+    public LuceneIndexEditorProvider(@Nullable IndexCopier indexCopier,
+                                     ExtractedTextCache extractedTextCache) {
         this.indexCopier = indexCopier;
+        this.extractedTextCache = extractedTextCache;
     }
 
     @Override
@@ -53,7 +60,7 @@ public class LuceneIndexEditorProvider i
             @Nonnull IndexUpdateCallback callback)
             throws CommitFailedException {
         if (TYPE_LUCENE.equals(type)) {
-            return new LuceneIndexEditor(root, definition, callback, indexCopier);
+            return new LuceneIndexEditor(root, definition, callback, indexCopier, extractedTextCache);
         }
         return null;
     }
@@ -61,4 +68,8 @@ public class LuceneIndexEditorProvider i
     IndexCopier getIndexCopier() {
         return indexCopier;
     }
+
+    ExtractedTextCache getExtractedTextCache() {
+        return extractedTextCache;
+    }
 }

Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java?rev=1690637&r1=1690636&r2=1690637&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java Mon Jul 13 11:28:11 2015
@@ -48,6 +48,7 @@ import org.apache.jackrabbit.oak.commons
 import org.apache.jackrabbit.oak.osgi.OsgiWhiteboard;
 import org.apache.jackrabbit.oak.plugins.index.IndexEditorProvider;
 import org.apache.jackrabbit.oak.plugins.index.aggregate.NodeAggregator;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.PreExtractedTextProvider;
 import org.apache.jackrabbit.oak.spi.commit.BackgroundObserver;
 import org.apache.jackrabbit.oak.plugins.index.lucene.score.ScorerProviderFactory;
 import org.apache.jackrabbit.oak.spi.commit.BackgroundObserverMBean;
@@ -144,6 +145,12 @@ public class LuceneIndexProviderService
     @Reference
     ScorerProviderFactory scorerFactory;
 
+    @Reference(policy = ReferencePolicy.DYNAMIC,
+            cardinality = ReferenceCardinality.OPTIONAL_MULTIPLE,
+            policyOption = ReferencePolicyOption.GREEDY
+    )
+    private volatile PreExtractedTextProvider extractedTextProvider;
+
     private IndexCopier indexCopier;
 
     private File indexDir;
@@ -152,6 +159,8 @@ public class LuceneIndexProviderService
 
     private int threadPoolSize;
 
+    private ExtractedTextCache extractedTextCache = new ExtractedTextCache();
+
     @Activate
     private void activate(BundleContext bundleContext, Map<String, ?> config)
             throws NotCompliantMBeanException, IOException {
@@ -231,12 +240,17 @@ public class LuceneIndexProviderService
         LuceneIndexEditorProvider editorProvider;
         if (enableCopyOnWrite){
             initializeIndexCopier(bundleContext, config);
-            editorProvider = new LuceneIndexEditorProvider(indexCopier);
+            editorProvider = new LuceneIndexEditorProvider(indexCopier, extractedTextCache);
             log.info("Enabling CopyOnWrite support. Index files would be copied under {}", indexDir.getAbsolutePath());
         } else {
-            editorProvider = new LuceneIndexEditorProvider();
+            editorProvider = new LuceneIndexEditorProvider(null, extractedTextCache);
         }
         regs.add(bundleContext.registerService(IndexEditorProvider.class.getName(), editorProvider, null));
+        oakRegs.add(registerMBean(whiteboard,
+                TextExtractionStatsMBean.class,
+                editorProvider.getExtractedTextCache().getStatsMBean(),
+                TextExtractionStatsMBean.TYPE,
+                "TextExtraction statistics"));
     }
 
     private IndexTracker createTracker(BundleContext bundleContext, Map<String, ?> config) throws IOException {
@@ -359,6 +373,17 @@ public class LuceneIndexProviderService
         TokenFilterFactory.reloadTokenFilters(classLoader);
     }
 
+    private void registerExtractedTextProvider(PreExtractedTextProvider provider){
+        if (extractedTextCache != null){
+            if (provider != null){
+                log.info("Registering PreExtractedTextProvider {} with extracted text cache", provider);
+            } else {
+                log.info("Unregistering PreExtractedTextProvider with extracted text cache");
+            }
+            extractedTextCache.setExtractedTextProvider(provider);
+        }
+    }
+
 
     protected void bindNodeAggregator(NodeAggregator aggregator) {
         this.nodeAggregator = aggregator;
@@ -370,4 +395,14 @@ public class LuceneIndexProviderService
         initialize();
     }
 
+    protected void bindExtractedTextProvider(PreExtractedTextProvider preExtractedTextProvider){
+        this.extractedTextProvider = preExtractedTextProvider;
+        registerExtractedTextProvider(preExtractedTextProvider);
+    }
+
+    protected void unbindExtractedTextProvider(PreExtractedTextProvider preExtractedTextProvider){
+        this.extractedTextProvider = null;
+        registerExtractedTextProvider(null);
+    }
+
 }

Added: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java?rev=1690637&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java (added)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java Mon Jul 13 11:28:11 2015
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.index.lucene;
+
+import aQute.bnd.annotation.ProviderType;
+
+@ProviderType
+public interface TextExtractionStatsMBean {
+    String TYPE = "TextExtractionStats";
+
+    boolean isPreExtractedTextProviderConfigured();
+
+    int getTextExtractionCount();
+
+    long getTotalTime();
+
+    int getPreFetchedCount();
+
+    String getExtractedTextSize();
+
+    String getBytesRead();
+}

Propchange: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java?rev=1690637&r1=1690636&r2=1690637&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java Mon Jul 13 11:28:11 2015
@@ -19,10 +19,14 @@
 
 package org.apache.jackrabbit.oak.plugins.index.lucene;
 
+import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;
 
+import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.plugins.index.IndexEditorProvider;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.PreExtractedTextProvider;
 import org.apache.jackrabbit.oak.spi.commit.BackgroundObserver;
 import org.apache.jackrabbit.oak.spi.commit.Observer;
 import org.apache.jackrabbit.oak.spi.query.QueryIndexProvider;
@@ -34,7 +38,6 @@ import org.junit.Test;
 import org.junit.rules.TemporaryFolder;
 
 import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
@@ -126,9 +129,31 @@ public class LuceneIndexProviderServiceT
         MockOsgi.deactivate(service);
     }
 
+    @Test
+    public void preExtractedTextProvider() throws Exception{
+        MockOsgi.activate(service, context.bundleContext(), getDefaultConfig());
+        LuceneIndexEditorProvider editorProvider =
+                (LuceneIndexEditorProvider) context.getService(IndexEditorProvider.class);
+        assertNull(editorProvider.getExtractedTextCache().getExtractedTextProvider());
+
+        //Mock OSGi does not support components
+        //context.registerService(PreExtractedTextProvider.class, new DummyProvider());
+        service.bindExtractedTextProvider(new DummyProvider());
+
+        assertNotNull(editorProvider.getExtractedTextCache().getExtractedTextProvider());
+    }
+
     private Map<String,Object> getDefaultConfig(){
         Map<String,Object> config = new HashMap<String, Object>();
         config.put("localIndexDir", folder.getRoot().getAbsolutePath());
         return config;
     }
+
+    private static class DummyProvider implements PreExtractedTextProvider {
+
+        @Override
+        public ExtractedText getText(String propertyPath, Blob blob) throws IOException {
+            return null;
+        }
+    }
 }

Modified: jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java?rev=1690637&r1=1690636&r2=1690637&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java Mon Jul 13 11:28:11 2015
@@ -25,6 +25,7 @@ import java.text.ParseException;
 import java.util.Calendar;
 import java.util.Collections;
 import java.util.List;
+import java.util.Map;
 import java.util.Random;
 import java.util.Set;
 import java.util.concurrent.ExecutorService;
@@ -51,6 +52,9 @@ import org.apache.jackrabbit.oak.api.Res
 import org.apache.jackrabbit.oak.api.Tree;
 import org.apache.jackrabbit.oak.api.Type;
 import org.apache.jackrabbit.oak.plugins.index.IndexConstants;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText.ExtractionResult;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.PreExtractedTextProvider;
 import org.apache.jackrabbit.oak.plugins.index.nodetype.NodeTypeIndexProvider;
 import org.apache.jackrabbit.oak.plugins.index.property.PropertyIndexEditorProvider;
 import org.apache.jackrabbit.oak.plugins.memory.ArrayBasedBlob;
@@ -93,6 +97,7 @@ import static org.hamcrest.CoreMatchers.
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
 import static org.junit.matchers.JUnitMatchers.containsString;
 
 public class LucenePropertyIndexTest extends AbstractQueryTest {
@@ -106,6 +111,8 @@ public class LucenePropertyIndexTest ext
     @Rule
     public TemporaryFolder temporaryFolder = new TemporaryFolder();
 
+    private LuceneIndexEditorProvider editorProvider;
+
     @Override
     protected void createTestIndexNode() throws Exception {
         setTraversalEnabled(false);
@@ -113,6 +120,7 @@ public class LucenePropertyIndexTest ext
 
     @Override
     protected ContentRepository createRepository() {
+        editorProvider = new LuceneIndexEditorProvider(createIndexCopier());
         LuceneIndexProvider provider = new LuceneIndexProvider();
         return new Oak()
                 .with(new InitialContent())
@@ -1286,6 +1294,41 @@ public class LucenePropertyIndexTest ext
     }
 
     @Test
+    public void preExtractedTextProvider() throws Exception{
+        Tree idx = createFulltextIndex(root.getTree("/"), "test");
+        TestUtil.useV2(idx);
+        root.commit();
+
+        AccessStateProvidingBlob testBlob =
+                new AccessStateProvidingBlob("fox is jumping", "id1");
+
+        MapBasedProvider textProvider = new MapBasedProvider();
+        textProvider.write("id1","lion");
+        editorProvider.getExtractedTextCache().setExtractedTextProvider(textProvider);
+
+        Tree test = root.getTree("/").addChild("test");
+        createFileNode(test, "text", testBlob, "text/plain");
+        root.commit();
+
+        //As it's not a reindex case, the actual blob content would be accessed
+        assertTrue(testBlob.isStreamAccessed());
+        assertQuery("select * from [nt:base] where CONTAINS(*, 'fox ')", asList("/test/text/jcr:content"));
+        assertEquals(0, textProvider.accessCount);
+
+        testBlob.resetState();
+
+        //Let's trigger a reindex
+        root.getTree(idx.getPath()).setProperty(IndexConstants.REINDEX_PROPERTY_NAME, true);
+        root.commit();
+
+        //Now the content should be provided by the PreExtractedTextProvider
+        //and instead of fox it's lion!
+        assertFalse(testBlob.isStreamAccessed());
+        assertQuery("select * from [nt:base] where CONTAINS(*, 'lion ')", asList("/test/text/jcr:content"));
+        assertEquals(1, textProvider.accessCount);
+    }
+
+    @Test
     public void maxFieldLengthCheck() throws Exception{
         Tree idx = createFulltextIndex(root.getTree("/"), "test");
         TestUtil.useV2(idx);
@@ -1654,6 +1697,7 @@ public class LucenePropertyIndexTest ext
 
     private static class AccessStateProvidingBlob extends ArrayBasedBlob {
         private CountingInputStream stream;
+        private String id;
 
         public AccessStateProvidingBlob(byte[] value) {
             super(value);
@@ -1663,6 +1707,11 @@ public class LucenePropertyIndexTest ext
             this(content.getBytes(Charsets.UTF_8));
         }
 
+        public AccessStateProvidingBlob(String content, String id) {
+            this(content.getBytes(Charsets.UTF_8));
+            this.id = id;
+        }
+
         @Nonnull
         @Override
         public InputStream getNewStream() {
@@ -1684,5 +1733,32 @@ public class LucenePropertyIndexTest ext
             }
             return stream.getCount();
         }
+
+        @Override
+        public String getContentIdentity() {
+            return id;
+        }
+    }
+
+    private static class MapBasedProvider implements PreExtractedTextProvider {
+        final Map<String, ExtractedText> idMap = Maps.newHashMap();
+        int accessCount = 0;
+
+        @Override
+        public ExtractedText getText(String propertyPath, Blob blob) throws IOException {
+            ExtractedText result = idMap.get(blob.getContentIdentity());
+            if (result != null){
+                accessCount++;
+            }
+            return result;
+        }
+
+        public void write(String id, String text){
+            idMap.put(id, new ExtractedText(ExtractionResult.SUCCESS, text));
+        }
+
+        public void reset(){
+            accessCount = 0;
+        }
     }
 }