You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by ch...@apache.org on 2015/07/14 12:12:46 UTC

svn commit: r1690897 - in /jackrabbit/oak/branches/1.2: ./ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/

Author: chetanm
Date: Tue Jul 14 10:12:45 2015
New Revision: 1690897

URL: http://svn.apache.org/r1690897
Log:
OAK-2892 - Speed up lucene indexing post migration by pre extracting the text content from binaries

Merging 1690637,1690650,1690885

Added:
    jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java
      - copied unchanged from r1690637, jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ExtractedTextCache.java
    jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java
      - copied unchanged from r1690637, jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/TextExtractionStatsMBean.java
Modified:
    jackrabbit/oak/branches/1.2/   (props changed)
    jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java
    jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java
    jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorProvider.java
    jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java
    jackrabbit/oak/branches/1.2/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java
    jackrabbit/oak/branches/1.2/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java

Propchange: jackrabbit/oak/branches/1.2/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Tue Jul 14 10:12:45 2015
@@ -1,3 +1,3 @@
 /jackrabbit/oak/branches/1.0:1665962
-/jackrabbit/oak/trunk:1672350,1672468,1672537,1672603,1672642,1672644,1672834-1672835,1673351,1673410,1673414-1673415,1673436,1673644,1673662-1673664,1673669,1673695,1673738,1673787,1673791,1674046,1674065,1674075,1674107,1674228,1674780,1674880,1675054-1675055,1675319,1675332,1675354,1675357,1675382,1675555,1675566,1675593,1676198,1676237,1676407,1676458,1676539,1676670,1676693,1676703,1676725,1677579,1677581,1677609,1677611,1677774,1677788,1677797,1677804,1677806,1677939,1677991,1678023,1678095-1678096,1678171,1678173,1678211,1678323,1678758,1678938,1678954,1679144,1679165,1679191,1679232,1679235,1679503,1679958,1679961,1680170,1680182,1680222,1680232,1680236,1680461,1680633,1680643,1680747,1680805-1680806,1680903,1681282,1681767,1681918,1682042,1682218,1682235,1682437,1682494,1682555,1682855,1682904,1683059,1683089,1683213,1683249,1683259,1683278,1683323,1683687,1683700,1684174-1684175,1684186,1684376,1684442,1684561,1684570,1684601,1684618,1684820,1684868,1685023,1685370,1685552
 ,1685589-1685590,1685840,1685964,1685977,1685989,1685999,1686023,1686032,1686097,1686162,1686229,1686234,1686253,1686414,1686780,1686854,1686857,1686971,1687053-1687055,1687175,1687196,1687198,1687220,1687239-1687240,1687301,1687441,1687553,1688089-1688090,1688172,1688179,1688349,1688421,1688436,1688453,1688616,1688622,1688636,1688817,1689003-1689004,1689008,1689577,1689581,1689623,1689810,1689828,1689833,1689903,1690017,1690043,1690047,1690057,1690247,1690249,1690634-1690636,1690669,1690674
+/jackrabbit/oak/trunk:1672350,1672468,1672537,1672603,1672642,1672644,1672834-1672835,1673351,1673410,1673414-1673415,1673436,1673644,1673662-1673664,1673669,1673695,1673738,1673787,1673791,1674046,1674065,1674075,1674107,1674228,1674780,1674880,1675054-1675055,1675319,1675332,1675354,1675357,1675382,1675555,1675566,1675593,1676198,1676237,1676407,1676458,1676539,1676670,1676693,1676703,1676725,1677579,1677581,1677609,1677611,1677774,1677788,1677797,1677804,1677806,1677939,1677991,1678023,1678095-1678096,1678171,1678173,1678211,1678323,1678758,1678938,1678954,1679144,1679165,1679191,1679232,1679235,1679503,1679958,1679961,1680170,1680182,1680222,1680232,1680236,1680461,1680633,1680643,1680747,1680805-1680806,1680903,1681282,1681767,1681918,1682042,1682218,1682235,1682437,1682494,1682555,1682855,1682904,1683059,1683089,1683213,1683249,1683259,1683278,1683323,1683687,1683700,1684174-1684175,1684186,1684376,1684442,1684561,1684570,1684601,1684618,1684820,1684868,1685023,1685370,1685552
 ,1685589-1685590,1685840,1685964,1685977,1685989,1685999,1686023,1686032,1686097,1686162,1686229,1686234,1686253,1686414,1686780,1686854,1686857,1686971,1687053-1687055,1687175,1687196,1687198,1687220,1687239-1687240,1687301,1687441,1687553,1688089-1688090,1688172,1688179,1688349,1688421,1688436,1688453,1688616,1688622,1688636,1688817,1689003-1689004,1689008,1689577,1689581,1689623,1689810,1689828,1689833,1689903,1690017,1690043,1690047,1690057,1690247,1690249,1690634-1690637,1690650,1690669,1690674,1690885
 /jackrabbit/trunk:1345480

Modified: jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java?rev=1690897&r1=1690896&r2=1690897&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java (original)
+++ jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditor.java Tue Jul 14 10:12:45 2015
@@ -52,6 +52,8 @@ import org.apache.jackrabbit.oak.commons
 import org.apache.jackrabbit.oak.plugins.index.IndexEditor;
 import org.apache.jackrabbit.oak.plugins.index.IndexUpdateCallback;
 import org.apache.jackrabbit.oak.plugins.index.PathFilter;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText.ExtractionResult;
 import org.apache.jackrabbit.oak.plugins.index.lucene.Aggregate.Matcher;
 import org.apache.jackrabbit.oak.plugins.memory.EmptyNodeState;
 import org.apache.jackrabbit.oak.plugins.tree.TreeFactory;
@@ -86,6 +88,7 @@ public class LuceneIndexEditor implement
 
     private static final Logger log =
             LoggerFactory.getLogger(LuceneIndexEditor.class);
+    static final String TEXT_EXTRACTION_ERROR = "TextExtractionError";
 
     private final LuceneIndexEditorContext context;
 
@@ -122,12 +125,14 @@ public class LuceneIndexEditor implement
     private final PathFilter.Result pathFilterResult;
 
     LuceneIndexEditor(NodeState root, NodeBuilder definition,
-        IndexUpdateCallback updateCallback,@Nullable IndexCopier indexCopier) throws CommitFailedException {
+                        IndexUpdateCallback updateCallback,
+                        @Nullable IndexCopier indexCopier,
+                        ExtractedTextCache extractedTextCache) throws CommitFailedException {
         this.parent = null;
         this.name = null;
         this.path = "/";
         this.context = new LuceneIndexEditorContext(root, definition,
-                updateCallback, indexCopier);
+                updateCallback, indexCopier, extractedTextCache);
         this.root = root;
         this.isDeleted = false;
         this.matcherState = MatcherState.NONE;
@@ -554,12 +559,16 @@ public class LuceneIndexEditor implement
         }
 
         for (Blob v : property.getValue(Type.BINARIES)) {
+            String value = parseStringValue(v, metadata, path, property.getName());
+            if (value == null){
+                continue;
+            }
+
             if (nodePath != null){
-                fields.add(newFulltextField(nodePath, parseStringValue(v, metadata, path)));
+                fields.add(newFulltextField(nodePath, value));
             } else {
-                fields.add(newFulltextField(parseStringValue(v, metadata, path)));
+                fields.add(newFulltextField(value));
             }
-
         }
         return fields;
     }
@@ -832,16 +841,24 @@ public class LuceneIndexEditor implement
         return context.isSupportedMediaType(type);
     }
 
-    private String parseStringValue(Blob v, Metadata metadata, String path) {
+    private String parseStringValue(Blob v, Metadata metadata, String path, String propertyName) {
+        String text = context.getExtractedTextCache().get(path, propertyName, v, context.isReindex());
+        if (text == null){
+            text = parseStringValue0(v, metadata, path);
+        }
+        return text;
+    }
+
+    private String parseStringValue0(Blob v, Metadata metadata, String path) {
         WriteOutContentHandler handler = new WriteOutContentHandler(context.getDefinition().getMaxExtractLength());
         long start = System.currentTimeMillis();
-        long size = 0;
+        long bytesRead = 0;
         try {
             CountingInputStream stream = new CountingInputStream(new LazyInputStream(new BlobByteSource(v)));
             try {
                 context.getParser().parse(stream, handler, metadata, new ParseContext());
             } finally {
-                size = stream.getCount();
+                bytesRead = stream.getCount();
                 stream.close();
             }
         } catch (LinkageError e) {
@@ -859,11 +876,15 @@ public class LuceneIndexEditor implement
                         + " worry about. The stack trace is included to"
                         + " help improve the text extraction feature.",
                         getIndexName(), path, t);
-                return "TextExtractionError";
+                context.getExtractedTextCache().put(v, ExtractedText.ERROR);
+                return TEXT_EXTRACTION_ERROR;
             }
         }
         String result = handler.toString();
-        context.recordTextExtractionStats(System.currentTimeMillis() - start, size);
+        if (bytesRead > 0) {
+            context.recordTextExtractionStats(System.currentTimeMillis() - start, bytesRead, result.length());
+        }
+        context.getExtractedTextCache().put(v,  new ExtractedText(ExtractionResult.SUCCESS, result));
         return result;
     }
 

Modified: jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java?rev=1690897&r1=1690896&r2=1690897&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java (original)
+++ jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorContext.java Tue Jul 14 10:12:45 2015
@@ -130,18 +130,20 @@ public class LuceneIndexEditorContext {
 
     private final TextExtractionStats textExtractionStats = new TextExtractionStats();
 
+    private final ExtractedTextCache extractedTextCache;
     /**
      * The media types supported by the parser used.
      */
     private Set<MediaType> supportedMediaTypes;
 
     LuceneIndexEditorContext(NodeState root, NodeBuilder definition, IndexUpdateCallback updateCallback,
-                             @Nullable IndexCopier indexCopier) {
+                             @Nullable IndexCopier indexCopier, ExtractedTextCache extractedTextCache) {
         this.definitionBuilder = definition;
         this.indexCopier = indexCopier;
         this.definition = new IndexDefinition(root, definition);
         this.indexedNodes = 0;
         this.updateCallback = updateCallback;
+        this.extractedTextCache = extractedTextCache;
         if (this.definition.isOfOldFormat()){
             IndexDefinition.updateDefinition(definition);
         }
@@ -200,6 +202,7 @@ public class LuceneIndexEditorContext {
             PERF_LOGGER.end(start, -1, "Closed IndexWriter for directory {}", definition);
 
             textExtractionStats.log(reindex);
+            textExtractionStats.collectStats(extractedTextCache);
         }
     }
 
@@ -269,8 +272,22 @@ public class LuceneIndexEditorContext {
         return definition;
     }
 
-    public void recordTextExtractionStats(long timeInMillis, long size) {
-        textExtractionStats.addStats(timeInMillis, size);
+    @Deprecated
+    public void recordTextExtractionStats(long timeInMillis, long bytesRead) {
+        //Keeping deprecated method to avoid major version change
+        recordTextExtractionStats(timeInMillis, bytesRead, 0);
+    }
+
+    public void recordTextExtractionStats(long timeInMillis, long bytesRead, int textLength) {
+        textExtractionStats.addStats(timeInMillis, bytesRead, textLength);
+    }
+
+    ExtractedTextCache getExtractedTextCache() {
+        return extractedTextCache;
+    }
+
+    public boolean isReindex() {
+        return reindex;
     }
 
     private static Parser initializeTikaParser(IndexDefinition definition) {
@@ -324,15 +341,17 @@ public class LuceneIndexEditorContext {
         /**
          * Log stats only if time spent is more than 2 min
          */
-        private static final long LOGGING_THRESHOLD = TimeUnit.MINUTES.toMillis(2);
+        private static final long LOGGING_THRESHOLD = TimeUnit.MINUTES.toMillis(1);
         private int count;
-        private long totalSize;
+        private long totalBytesRead;
         private long totalTime;
+        private long totalTextLength;
 
-        public void addStats(long timeInMillis, long size) {
+        public void addStats(long timeInMillis, long bytesRead, int textLength) {
             count++;
-            totalSize += size;
+            totalBytesRead += bytesRead;
             totalTime += timeInMillis;
+            totalTextLength += textLength;
         }
 
         public void log(boolean reindex) {
@@ -343,6 +362,10 @@ public class LuceneIndexEditorContext {
             }
         }
 
+        public void collectStats(ExtractedTextCache cache){
+            cache.addStats(count, totalTime, totalBytesRead, totalTextLength);
+        }
+
         private boolean isTakingLotsOfTime() {
             return totalTime > LOGGING_THRESHOLD;
         }
@@ -353,8 +376,11 @@ public class LuceneIndexEditorContext {
 
         @Override
         public String toString() {
-            return String.format(" %d (%s, %s)", count,
-                    timeInWords(totalTime), humanReadableByteCount(totalSize));
+            return String.format(" %d (Time Taken %s, Bytes Read %s, Extracted text size %s)",
+                    count,
+                    timeInWords(totalTime),
+                    humanReadableByteCount(totalBytesRead),
+                    humanReadableByteCount(totalTextLength));
         }
 
         private static String timeInWords(long millis) {

Modified: jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorProvider.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorProvider.java?rev=1690897&r1=1690896&r2=1690897&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorProvider.java (original)
+++ jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexEditorProvider.java Tue Jul 14 10:12:45 2015
@@ -38,13 +38,20 @@ import org.apache.jackrabbit.oak.spi.sta
  */
 public class LuceneIndexEditorProvider implements IndexEditorProvider {
     private final IndexCopier indexCopier;
+    private final ExtractedTextCache extractedTextCache;
 
     public LuceneIndexEditorProvider() {
         this(null);
     }
 
     public LuceneIndexEditorProvider(@Nullable IndexCopier indexCopier) {
+        this(indexCopier, new ExtractedTextCache());
+    }
+
+    public LuceneIndexEditorProvider(@Nullable IndexCopier indexCopier,
+                                     ExtractedTextCache extractedTextCache) {
         this.indexCopier = indexCopier;
+        this.extractedTextCache = extractedTextCache;
     }
 
     @Override
@@ -53,7 +60,7 @@ public class LuceneIndexEditorProvider i
             @Nonnull IndexUpdateCallback callback)
             throws CommitFailedException {
         if (TYPE_LUCENE.equals(type)) {
-            return new LuceneIndexEditor(root, definition, callback, indexCopier);
+            return new LuceneIndexEditor(root, definition, callback, indexCopier, extractedTextCache);
         }
         return null;
     }
@@ -61,4 +68,8 @@ public class LuceneIndexEditorProvider i
     IndexCopier getIndexCopier() {
         return indexCopier;
     }
+
+    ExtractedTextCache getExtractedTextCache() {
+        return extractedTextCache;
+    }
 }

Modified: jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java?rev=1690897&r1=1690896&r2=1690897&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java (original)
+++ jackrabbit/oak/branches/1.2/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderService.java Tue Jul 14 10:12:45 2015
@@ -48,6 +48,7 @@ import org.apache.jackrabbit.oak.commons
 import org.apache.jackrabbit.oak.osgi.OsgiWhiteboard;
 import org.apache.jackrabbit.oak.plugins.index.IndexEditorProvider;
 import org.apache.jackrabbit.oak.plugins.index.aggregate.NodeAggregator;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.PreExtractedTextProvider;
 import org.apache.jackrabbit.oak.spi.commit.BackgroundObserver;
 import org.apache.jackrabbit.oak.plugins.index.lucene.score.ScorerProviderFactory;
 import org.apache.jackrabbit.oak.spi.commit.BackgroundObserverMBean;
@@ -144,6 +145,12 @@ public class LuceneIndexProviderService
     @Reference
     ScorerProviderFactory scorerFactory;
 
+    @Reference(policy = ReferencePolicy.DYNAMIC,
+            cardinality = ReferenceCardinality.OPTIONAL_MULTIPLE,
+            policyOption = ReferencePolicyOption.GREEDY
+    )
+    private volatile PreExtractedTextProvider extractedTextProvider;
+
     private IndexCopier indexCopier;
 
     private File indexDir;
@@ -152,6 +159,8 @@ public class LuceneIndexProviderService
 
     private int threadPoolSize;
 
+    private ExtractedTextCache extractedTextCache = new ExtractedTextCache();
+
     @Activate
     private void activate(BundleContext bundleContext, Map<String, ?> config)
             throws NotCompliantMBeanException, IOException {
@@ -231,12 +240,17 @@ public class LuceneIndexProviderService
         LuceneIndexEditorProvider editorProvider;
         if (enableCopyOnWrite){
             initializeIndexCopier(bundleContext, config);
-            editorProvider = new LuceneIndexEditorProvider(indexCopier);
+            editorProvider = new LuceneIndexEditorProvider(indexCopier, extractedTextCache);
             log.info("Enabling CopyOnWrite support. Index files would be copied under {}", indexDir.getAbsolutePath());
         } else {
-            editorProvider = new LuceneIndexEditorProvider();
+            editorProvider = new LuceneIndexEditorProvider(null, extractedTextCache);
         }
         regs.add(bundleContext.registerService(IndexEditorProvider.class.getName(), editorProvider, null));
+        oakRegs.add(registerMBean(whiteboard,
+                TextExtractionStatsMBean.class,
+                editorProvider.getExtractedTextCache().getStatsMBean(),
+                TextExtractionStatsMBean.TYPE,
+                "TextExtraction statistics"));
     }
 
     private IndexTracker createTracker(BundleContext bundleContext, Map<String, ?> config) throws IOException {
@@ -359,6 +373,17 @@ public class LuceneIndexProviderService
         TokenFilterFactory.reloadTokenFilters(classLoader);
     }
 
+    private void registerExtractedTextProvider(PreExtractedTextProvider provider){
+        if (extractedTextCache != null){
+            if (provider != null){
+                log.info("Registering PreExtractedTextProvider {} with extracted text cache", provider);
+            } else {
+                log.info("Unregistering PreExtractedTextProvider with extracted text cache");
+            }
+            extractedTextCache.setExtractedTextProvider(provider);
+        }
+    }
+
 
     protected void bindNodeAggregator(NodeAggregator aggregator) {
         this.nodeAggregator = aggregator;
@@ -370,4 +395,14 @@ public class LuceneIndexProviderService
         initialize();
     }
 
+    protected void bindExtractedTextProvider(PreExtractedTextProvider preExtractedTextProvider){
+        this.extractedTextProvider = preExtractedTextProvider;
+        registerExtractedTextProvider(preExtractedTextProvider);
+    }
+
+    protected void unbindExtractedTextProvider(PreExtractedTextProvider preExtractedTextProvider){
+        this.extractedTextProvider = null;
+        registerExtractedTextProvider(null);
+    }
+
 }

Modified: jackrabbit/oak/branches/1.2/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.2/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java?rev=1690897&r1=1690896&r2=1690897&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.2/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java (original)
+++ jackrabbit/oak/branches/1.2/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexProviderServiceTest.java Tue Jul 14 10:12:45 2015
@@ -19,10 +19,14 @@
 
 package org.apache.jackrabbit.oak.plugins.index.lucene;
 
+import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;
 
+import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.plugins.index.IndexEditorProvider;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.PreExtractedTextProvider;
 import org.apache.jackrabbit.oak.spi.commit.BackgroundObserver;
 import org.apache.jackrabbit.oak.spi.commit.Observer;
 import org.apache.jackrabbit.oak.spi.query.QueryIndexProvider;
@@ -126,9 +130,31 @@ public class LuceneIndexProviderServiceT
         MockOsgi.deactivate(service);
     }
 
+    @Test
+    public void preExtractedTextProvider() throws Exception{
+        MockOsgi.activate(service, context.bundleContext(), getDefaultConfig());
+        LuceneIndexEditorProvider editorProvider =
+                (LuceneIndexEditorProvider) context.getService(IndexEditorProvider.class);
+        assertNull(editorProvider.getExtractedTextCache().getExtractedTextProvider());
+
+        //Mock OSGi does not support components
+        //context.registerService(PreExtractedTextProvider.class, new DummyProvider());
+        service.bindExtractedTextProvider(new DummyProvider());
+
+        assertNotNull(editorProvider.getExtractedTextCache().getExtractedTextProvider());
+    }
+
     private Map<String,Object> getDefaultConfig(){
         Map<String,Object> config = new HashMap<String, Object>();
         config.put("localIndexDir", folder.getRoot().getAbsolutePath());
         return config;
     }
+
+    private static class DummyProvider implements PreExtractedTextProvider {
+
+        @Override
+        public ExtractedText getText(String propertyPath, Blob blob) throws IOException {
+            return null;
+        }
+    }
 }

Modified: jackrabbit/oak/branches/1.2/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/branches/1.2/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java?rev=1690897&r1=1690896&r2=1690897&view=diff
==============================================================================
--- jackrabbit/oak/branches/1.2/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java (original)
+++ jackrabbit/oak/branches/1.2/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/LucenePropertyIndexTest.java Tue Jul 14 10:12:45 2015
@@ -25,6 +25,7 @@ import java.text.ParseException;
 import java.util.Calendar;
 import java.util.Collections;
 import java.util.List;
+import java.util.Map;
 import java.util.Random;
 import java.util.Set;
 import java.util.concurrent.ExecutorService;
@@ -51,6 +52,9 @@ import org.apache.jackrabbit.oak.api.Res
 import org.apache.jackrabbit.oak.api.Tree;
 import org.apache.jackrabbit.oak.api.Type;
 import org.apache.jackrabbit.oak.plugins.index.IndexConstants;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText.ExtractionResult;
+import org.apache.jackrabbit.oak.plugins.index.fulltext.PreExtractedTextProvider;
 import org.apache.jackrabbit.oak.plugins.index.nodetype.NodeTypeIndexProvider;
 import org.apache.jackrabbit.oak.plugins.index.property.PropertyIndexEditorProvider;
 import org.apache.jackrabbit.oak.plugins.memory.ArrayBasedBlob;
@@ -93,6 +97,7 @@ import static org.hamcrest.CoreMatchers.
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertThat;
+import static org.junit.Assert.assertTrue;
 import static org.junit.matchers.JUnitMatchers.containsString;
 
 public class LucenePropertyIndexTest extends AbstractQueryTest {
@@ -106,6 +111,8 @@ public class LucenePropertyIndexTest ext
     @Rule
     public TemporaryFolder temporaryFolder = new TemporaryFolder();
 
+    private LuceneIndexEditorProvider editorProvider;
+
     @Override
     protected void createTestIndexNode() throws Exception {
         setTraversalEnabled(false);
@@ -113,13 +120,14 @@ public class LucenePropertyIndexTest ext
 
     @Override
     protected ContentRepository createRepository() {
+        editorProvider = new LuceneIndexEditorProvider(createIndexCopier());
         LuceneIndexProvider provider = new LuceneIndexProvider();
         return new Oak()
                 .with(new InitialContent())
                 .with(new OpenSecurityProvider())
                 .with((QueryIndexProvider) provider)
                 .with((Observer) provider)
-                .with(new LuceneIndexEditorProvider(createIndexCopier()))
+                .with(editorProvider)
                 .with(new PropertyIndexEditorProvider())
                 .with(new NodeTypeIndexProvider())
                 .createContentRepository();
@@ -1286,6 +1294,41 @@ public class LucenePropertyIndexTest ext
     }
 
     @Test
+    public void preExtractedTextProvider() throws Exception{
+        Tree idx = createFulltextIndex(root.getTree("/"), "test");
+        TestUtil.useV2(idx);
+        root.commit();
+
+        AccessStateProvidingBlob testBlob =
+                new AccessStateProvidingBlob("fox is jumping", "id1");
+
+        MapBasedProvider textProvider = new MapBasedProvider();
+        textProvider.write("id1","lion");
+        editorProvider.getExtractedTextCache().setExtractedTextProvider(textProvider);
+
+        Tree test = root.getTree("/").addChild("test");
+        createFileNode(test, "text", testBlob, "text/plain");
+        root.commit();
+
+        //As it's not a reindex case, the actual blob content would be accessed
+        assertTrue(testBlob.isStreamAccessed());
+        assertQuery("select * from [nt:base] where CONTAINS(*, 'fox ')", asList("/test/text/jcr:content"));
+        assertEquals(0, textProvider.accessCount);
+
+        testBlob.resetState();
+
+        //Let's trigger a reindex
+        root.getTree(idx.getPath()).setProperty(IndexConstants.REINDEX_PROPERTY_NAME, true);
+        root.commit();
+
+        //Now the content should be provided by the PreExtractedTextProvider
+        //and instead of fox it's lion!
+        assertFalse(testBlob.isStreamAccessed());
+        assertQuery("select * from [nt:base] where CONTAINS(*, 'lion ')", asList("/test/text/jcr:content"));
+        assertEquals(1, textProvider.accessCount);
+    }
+
+    @Test
     public void maxFieldLengthCheck() throws Exception{
         Tree idx = createFulltextIndex(root.getTree("/"), "test");
         TestUtil.useV2(idx);
@@ -1568,6 +1611,7 @@ public class LucenePropertyIndexTest ext
 
     private static class AccessStateProvidingBlob extends ArrayBasedBlob {
         private CountingInputStream stream;
+        private String id;
 
         public AccessStateProvidingBlob(byte[] value) {
             super(value);
@@ -1577,6 +1621,11 @@ public class LucenePropertyIndexTest ext
             this(content.getBytes(Charsets.UTF_8));
         }
 
+        public AccessStateProvidingBlob(String content, String id) {
+            this(content.getBytes(Charsets.UTF_8));
+            this.id = id;
+        }
+
         @Nonnull
         @Override
         public InputStream getNewStream() {
@@ -1598,5 +1647,32 @@ public class LucenePropertyIndexTest ext
             }
             return stream.getCount();
         }
+
+        @Override
+        public String getContentIdentity() {
+            return id;
+        }
+    }
+
+    private static class MapBasedProvider implements PreExtractedTextProvider {
+        final Map<String, ExtractedText> idMap = Maps.newHashMap();
+        int accessCount = 0;
+
+        @Override
+        public ExtractedText getText(String propertyPath, Blob blob) throws IOException {
+            ExtractedText result = idMap.get(blob.getContentIdentity());
+            if (result != null){
+                accessCount++;
+            }
+            return result;
+        }
+
+        public void write(String id, String text){
+            idMap.put(id, new ExtractedText(ExtractionResult.SUCCESS, text));
+        }
+
+        public void reset(){
+            accessCount = 0;
+        }
     }
 }