You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by ch...@apache.org on 2017/07/04 10:31:11 UTC

svn commit: r1800749 - in /jackrabbit/oak/trunk: oak-examples/standalone/ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/ oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/ oak-parent/

Author: chetanm
Date: Tue Jul  4 10:31:11 2017
New Revision: 1800749

URL: http://svn.apache.org/viewvc?rev=1800749&view=rev
Log:
OAK-5048 - Upgrade to Tika 1.15 version
OAK-6414 - Use Tika config to determine non indexed mimeTypes

-- Update Tika to 1.15
-- Use TikaParserConfig to determine which mimetypes have been configured
   with EmptyParser

-- oak-webapp - Need to exclude httpcomponents from tika-parser,
   as it has a transitive dependency on an old version of httpcomponents
   which conflicts with the one used by htmlunit.

Added:
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfig.java   (with props)
    jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/
    jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfigTest.java   (with props)
Modified:
    jackrabbit/oak/trunk/oak-examples/standalone/pom.xml
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java
    jackrabbit/oak/trunk/oak-parent/pom.xml

Modified: jackrabbit/oak/trunk/oak-examples/standalone/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-examples/standalone/pom.xml?rev=1800749&r1=1800748&r2=1800749&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-examples/standalone/pom.xml (original)
+++ jackrabbit/oak/trunk/oak-examples/standalone/pom.xml Tue Jul  4 10:31:11 2017
@@ -133,6 +133,10 @@
           <groupId>commons-logging</groupId>
           <artifactId>commons-logging</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>org.apache.httpcomponents</groupId>
+          <artifactId>httpcore</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
     <dependency>

Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java?rev=1800749&r1=1800748&r2=1800749&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java Tue Jul  4 10:31:11 2017
@@ -19,9 +19,11 @@
 
 package org.apache.jackrabbit.oak.plugins.index.lucene.binary;
 
+import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 import java.util.Set;
 
@@ -39,6 +41,7 @@ import org.apache.jackrabbit.oak.plugins
 import org.apache.jackrabbit.oak.spi.state.NodeState;
 import org.apache.lucene.document.Field;
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
@@ -47,6 +50,7 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.WriteOutContentHandler;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
 
 import static org.apache.jackrabbit.JcrConstants.JCR_DATA;
 import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.newFulltextField;
@@ -65,6 +69,7 @@ public class BinaryTextExtractor {
      * The media types supported by the parser used.
      */
     private Set<MediaType> supportedMediaTypes;
+    private Set<MediaType> nonIndexedMediaType;
 
     public BinaryTextExtractor(ExtractedTextCache extractedTextCache, IndexDefinition definition, boolean reindex) {
         this.extractedTextCache = extractedTextCache;
@@ -202,8 +207,36 @@ public class BinaryTextExtractor {
     private boolean isSupportedMediaType(String type) {
         if (supportedMediaTypes == null) {
             supportedMediaTypes = getParser().getSupportedTypes(new ParseContext());
+            nonIndexedMediaType = getNonIndexedMediaTypes();
         }
-        return supportedMediaTypes.contains(MediaType.parse(type));
+        MediaType mediaType = MediaType.parse(type);
+        return supportedMediaTypes.contains(mediaType) && !nonIndexedMediaType.contains(mediaType);
+    }
+
+    private Set<MediaType> getNonIndexedMediaTypes() {
+        InputStream configStream = null;
+        String configSource = null;
+        try {
+            if (definition.hasCustomTikaConfig()) {
+                configSource = String.format("Custom config at %s", definition.getIndexPath());
+                configStream = definition.getTikaConfig();
+            } else {
+                URL configUrl = LuceneIndexEditorContext.class.getResource("tika-config.xml");
+                configSource = "Default : tika-config.xml";
+                if (configUrl != null) {
+                    configStream = configUrl.openStream();
+                }
+            }
+
+            if (configStream != null) {
+                return TikaParserConfig.getNonIndexedMediaTypes(configStream);
+            }
+        } catch (TikaException | IOException | SAXException e) {
+            log.warn("Tika configuration not available : " + configSource, e);
+        } finally {
+            IOUtils.closeQuietly(configStream);
+        }
+        return Collections.emptySet();
     }
 
     private static Parser initializeTikaParser(IndexDefinition definition) {

Added: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfig.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfig.java?rev=1800749&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfig.java (added)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfig.java Tue Jul  4 10:31:11 2017
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.index.lucene.binary;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
+
+import javax.xml.parsers.DocumentBuilder;
+
+import com.google.common.base.Strings;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+public class TikaParserConfig {
+    private static final String EMPTY_PARSER = "org.apache.tika.parser.EmptyParser";
+
+    /**
+     * Determines the set of MediaType which have been configured with an EmptyParser.
+     *
+     * @param configStream stream for tika config
+     * @return set of MediaTypes which are not indexed
+     */
+    public static Set<MediaType> getNonIndexedMediaTypes(InputStream configStream) throws
+            TikaException, IOException, SAXException {
+        Set<MediaType> result = new HashSet<>();
+        Element element = getBuilder().parse(configStream).getDocumentElement();
+        NodeList nodes = element.getElementsByTagName("parsers");
+        if (nodes.getLength() == 1) {
+            Node parentNode = nodes.item(0);
+            NodeList parsersNodes = parentNode.getChildNodes();
+            for (int i = 0; i < parsersNodes.getLength(); i++) {
+                Node node = parsersNodes.item(i);
+                if (node instanceof Element) {
+                    String className = ((Element) node).getAttribute("class");
+                    if (EMPTY_PARSER.equals(className)) {
+                        NodeList mimes = ((Element) node).getElementsByTagName("mime");
+                        parseMimeTypes(result, mimes);
+                    }
+                }
+            }
+        }
+        return result;
+    }
+
+
+    private static void parseMimeTypes(Set<MediaType> result, NodeList mimes) {
+        /*
+        <parser class="org.apache.tika.parser.EmptyParser">
+            <mime>application/x-archive</mime>
+            <mime>application/x-bzip</mime>
+            <mime>application/x-bzip2</mime>
+        </parser>
+        */
+        for (int j = 0; j < mimes.getLength(); j++) {
+            Node mime = mimes.item(j);
+            if (mime instanceof Element) {
+                String mimeValue = mime.getTextContent();
+                mimeValue = Strings.emptyToNull(mimeValue);
+                if (mimeValue != null) {
+                    MediaType mediaType = MediaType.parse(mimeValue.trim());
+                    if (mediaType != null) {
+                        result.add(mediaType);
+                    }
+                }
+            }
+        }
+    }
+
+    private static DocumentBuilder getBuilder() throws TikaException {
+        return new ParseContext().getDocumentBuilder();
+    }
+}

Propchange: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfig.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfigTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfigTest.java?rev=1800749&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfigTest.java (added)
+++ jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfigTest.java Tue Jul  4 10:31:11 2017
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.index.lucene.binary;
+
+import java.io.StringReader;
+import java.util.Set;
+
+import org.apache.commons.io.input.ReaderInputStream;
+import org.apache.tika.mime.MediaType;
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+
+public class TikaParserConfigTest {
+
+    @Test
+    public void emptyParser() throws Exception{
+        String xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
+                "<properties>\n" +
+                "  <detectors>\n" +
+                "    <detector class=\"org.apache.tika.detect.TypeDetector\"/>\n" +
+                "  </detectors>\n" +
+                "  <parsers>\n" +
+                "    <parser class=\"org.apache.tika.parser.DefaultParser\"/>\n" +
+                "    <parser class=\"org.apache.tika.parser.EmptyParser\">\n" +
+                "      <mime>application/x-archive</mime>\n" +
+                "      <mime>application/x-bzip</mime>\n" +
+                "    </parser>\n" +
+                "  </parsers>\n" +
+                "</properties>";
+
+        Set<MediaType> types = TikaParserConfig.getNonIndexedMediaTypes(
+                new ReaderInputStream(new StringReader(xml), "UTF-8"));
+        assertEquals(2, types.size());
+        assertTrue(types.contains(MediaType.parse("application/x-archive")));
+    }
+
+}
\ No newline at end of file

Propchange: jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfigTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: jackrabbit/oak/trunk/oak-parent/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-parent/pom.xml?rev=1800749&r1=1800748&r2=1800749&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-parent/pom.xml (original)
+++ jackrabbit/oak/trunk/oak-parent/pom.xml Tue Jul  4 10:31:11 2017
@@ -61,7 +61,7 @@
     <slf4j.version>1.7.6</slf4j.version> <!-- sync with logback version -->
     <logback.version>1.1.0</logback.version>
     <h2.version>1.4.194</h2.version>
-    <tika.version>1.5</tika.version>
+    <tika.version>1.15</tika.version>
     <findbugs.version>3.0.4</findbugs.version>
     <derby.version>10.13.1.1</derby.version>
     <java.version>1.8</java.version>