You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by ch...@apache.org on 2017/07/04 08:38:25 UTC

svn commit: r1800742 - in /jackrabbit/oak/trunk/oak-lucene/src: main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/ test/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/

Author: chetanm
Date: Tue Jul  4 08:38:25 2017
New Revision: 1800742

URL: http://svn.apache.org/viewvc?rev=1800742&view=rev
Log:
OAK-6414 - Use Tika config to determine non indexed mimeTypes

Reverting changes in 1800726, 1800727

Removed:
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfig.java
    jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/
Modified:
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java

Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java?rev=1800742&r1=1800741&r2=1800742&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java Tue Jul  4 08:38:25 2017
@@ -19,16 +19,12 @@
 
 package org.apache.jackrabbit.oak.plugins.index.lucene.binary;
 
-import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
 import java.util.ArrayList;
-import java.util.Collections;
 import java.util.List;
 import java.util.Set;
 
-import javax.annotation.Nullable;
-
 import com.google.common.io.CountingInputStream;
 import org.apache.commons.io.IOUtils;
 import org.apache.jackrabbit.JcrConstants;
@@ -43,7 +39,6 @@ import org.apache.jackrabbit.oak.plugins
 import org.apache.jackrabbit.oak.spi.state.NodeState;
 import org.apache.lucene.document.Field;
 import org.apache.tika.config.TikaConfig;
-import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
@@ -52,7 +47,6 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.WriteOutContentHandler;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.xml.sax.SAXException;
 
 import static org.apache.jackrabbit.JcrConstants.JCR_DATA;
 import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.newFulltextField;
@@ -67,12 +61,10 @@ public class BinaryTextExtractor {
     private final IndexDefinition definition;
     private final boolean reindex;
     private Parser parser;
-    private TikaConfigHolder tikaConfig;
     /**
      * The media types supported by the parser used.
      */
     private Set<MediaType> supportedMediaTypes;
-    private Set<MediaType> nonIndexedMediaType;
 
     public BinaryTextExtractor(ExtractedTextCache extractedTextCache, IndexDefinition definition, boolean reindex) {
         this.extractedTextCache = extractedTextCache;
@@ -200,13 +192,6 @@ public class BinaryTextExtractor {
 
     //~-------------------------------------------< Tika >
 
-    public TikaConfig getTikaConfig(){
-        if (tikaConfig == null) {
-            tikaConfig = initializeTikaConfig(definition);
-        }
-        return tikaConfig.config;
-    }
-
     private Parser getParser() {
         if (parser == null){
             parser = initializeTikaParser(definition);
@@ -217,111 +202,58 @@ public class BinaryTextExtractor {
     private boolean isSupportedMediaType(String type) {
         if (supportedMediaTypes == null) {
             supportedMediaTypes = getParser().getSupportedTypes(new ParseContext());
-            nonIndexedMediaType = getNonIndexedMediaTypes();
-        }
-        MediaType mediaType = MediaType.parse(type);
-        return supportedMediaTypes.contains(mediaType) && !nonIndexedMediaType.contains(mediaType);
-    }
-
-    private Set<MediaType> getNonIndexedMediaTypes() {
-        InputStream configStream = null;
-        String configSource = null;
-        try {
-            if (definition.hasCustomTikaConfig()) {
-                configSource = String.format("Custom config at %s", definition.getIndexPath());
-                configStream = definition.getTikaConfig();
-            } else {
-                URL configUrl = LuceneIndexEditorContext.class.getResource("tika-config.xml");
-                configSource = "Default : tika-config.xml";
-                if (configUrl != null) {
-                    configStream = configUrl.openStream();
-                }
-            }
-
-            if (configStream != null) {
-                return TikaParserConfig.getNonIndexedMediaTypes(configStream);
-            }
-        } catch (TikaException | IOException | SAXException e) {
-            log.warn("Tika configuration not available : " + configSource, e);
-        } finally {
-            IOUtils.closeQuietly(configStream);
         }
-        return Collections.emptySet();
+        return supportedMediaTypes.contains(MediaType.parse(type));
     }
 
-
-    private static TikaConfigHolder initializeTikaConfig(@Nullable  IndexDefinition definition) {
+    private static Parser initializeTikaParser(IndexDefinition definition) {
         ClassLoader current = Thread.currentThread().getContextClassLoader();
-        InputStream configStream = null;
-        String configSource = null;
-
         try {
-            Thread.currentThread().setContextClassLoader(LuceneIndexEditorContext.class.getClassLoader());
-            if (definition != null && definition.hasCustomTikaConfig()) {
+            if (definition.hasCustomTikaConfig()) {
                 log.debug("[{}] Using custom tika config", definition.getIndexName());
-                configSource = "Custom config at " + definition.getIndexPath();
-                configStream = definition.getTikaConfig();
-            } else {
-                URL configUrl = LuceneIndexEditorContext.class.getResource("tika-config.xml");
-                if (configUrl != null) {
-                    configSource = configUrl.toString();
-                    configStream = configUrl.openStream();
+                Thread.currentThread().setContextClassLoader(LuceneIndexEditorContext.class.getClassLoader());
+                InputStream is = definition.getTikaConfig();
+                try {
+                    return new AutoDetectParser(getTikaConfig(is, definition));
+                } finally {
+                    IOUtils.closeQuietly(is);
                 }
             }
-
-            if (configStream != null) {
-                return new TikaConfigHolder(new TikaConfig(configStream), configSource);
-            }
-        } catch (TikaException | IOException | SAXException e) {
-            log.warn("Tika configuration not available : " + configSource, e);
-        } finally {
-            IOUtils.closeQuietly(configStream);
+        }finally {
             Thread.currentThread().setContextClassLoader(current);
         }
-        return new TikaConfigHolder(TikaConfig.getDefaultConfig(), "Default Config");
+        return defaultParser;
     }
 
-    private Parser initializeTikaParser(IndexDefinition definition) {
+    private static AutoDetectParser createDefaultParser() {
         ClassLoader current = Thread.currentThread().getContextClassLoader();
-        try {
-            if (definition.hasCustomTikaConfig()) {
+        URL configUrl = LuceneIndexEditorContext.class.getResource("tika-config.xml");
+        InputStream is = null;
+        if (configUrl != null) {
+            try {
                 Thread.currentThread().setContextClassLoader(LuceneIndexEditorContext.class.getClassLoader());
-                return new AutoDetectParser(getTikaConfig());
+                is = configUrl.openStream();
+                TikaConfig config = new TikaConfig(is);
+                log.info("Loaded default Tika Config from classpath {}", configUrl);
+                return new AutoDetectParser(config);
+            } catch (Exception e) {
+                log.warn("Tika configuration not available : " + configUrl, e);
+            } finally {
+                IOUtils.closeQuietly(is);
+                Thread.currentThread().setContextClassLoader(current);
             }
-        } finally {
-            Thread.currentThread().setContextClassLoader(current);
+        } else {
+            log.warn("Default Tika configuration not found");
         }
-        return defaultParser;
+        return new AutoDetectParser();
     }
 
-    private static AutoDetectParser createDefaultParser() {
-        ClassLoader current = Thread.currentThread().getContextClassLoader();
-        TikaConfigHolder configHolder = null;
+    private static TikaConfig getTikaConfig(InputStream configStream, Object source){
         try {
-            configHolder = initializeTikaConfig(null);
-            Thread.currentThread().setContextClassLoader(LuceneIndexEditorContext.class.getClassLoader());
-            log.info("Loaded default Tika Config from classpath {}", configHolder);
-            return new AutoDetectParser(configHolder.config);
+            return new TikaConfig(configStream);
         } catch (Exception e) {
-            log.warn("Tika configuration not available : " + configHolder, e);
-        } finally {
-            Thread.currentThread().setContextClassLoader(current);
-        }
-        return new AutoDetectParser();
-    }
-
-    private static final class TikaConfigHolder{
-        final TikaConfig config;
-        final String sourceInfo;
-
-        public TikaConfigHolder(TikaConfig config, String sourceInfo) {
-            this.config = config;
-            this.sourceInfo = sourceInfo;
-        }
-
-        @Override
-        public String toString() {
-            return sourceInfo;
+            log.warn("Tika configuration not available : "+source, e);
         }
+        return TikaConfig.getDefaultConfig();
     }
 }