You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by ch...@apache.org on 2017/07/04 10:31:11 UTC
svn commit: r1800749 - in /jackrabbit/oak/trunk: oak-examples/standalone/
oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/
oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/
oak-parent/
Author: chetanm
Date: Tue Jul 4 10:31:11 2017
New Revision: 1800749
URL: http://svn.apache.org/viewvc?rev=1800749&view=rev
Log:
OAK-5048 - Upgrade to Tika 1.15 version
OAK-6414 - Use Tika config to determine non indexed mimeTypes
-- Update Tika to 1.15
-- Use TikaParserConfig to determine which mimetypes have been configured
with EmptyParser
-- oak-webapp - Need to exclude the httpcomponents from tika-parser,
as it brings in a transitive dependency on an old version of http components
which conflicts with the one used by htmlunit.
Added:
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfig.java (with props)
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfigTest.java (with props)
Modified:
jackrabbit/oak/trunk/oak-examples/standalone/pom.xml
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java
jackrabbit/oak/trunk/oak-parent/pom.xml
Modified: jackrabbit/oak/trunk/oak-examples/standalone/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-examples/standalone/pom.xml?rev=1800749&r1=1800748&r2=1800749&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-examples/standalone/pom.xml (original)
+++ jackrabbit/oak/trunk/oak-examples/standalone/pom.xml Tue Jul 4 10:31:11 2017
@@ -133,6 +133,10 @@
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
</exclusion>
+ <exclusion>
+ <groupId>org.apache.httpcomponents</groupId>
+ <artifactId>httpcore</artifactId>
+ </exclusion>
</exclusions>
</dependency>
<dependency>
Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java?rev=1800749&r1=1800748&r2=1800749&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java Tue Jul 4 10:31:11 2017
@@ -19,9 +19,11 @@
package org.apache.jackrabbit.oak.plugins.index.lucene.binary;
+import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
import java.util.Set;
@@ -39,6 +41,7 @@ import org.apache.jackrabbit.oak.plugins
import org.apache.jackrabbit.oak.spi.state.NodeState;
import org.apache.lucene.document.Field;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
@@ -47,6 +50,7 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.sax.WriteOutContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.xml.sax.SAXException;
import static org.apache.jackrabbit.JcrConstants.JCR_DATA;
import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory.newFulltextField;
@@ -65,6 +69,7 @@ public class BinaryTextExtractor {
* The media types supported by the parser used.
*/
private Set<MediaType> supportedMediaTypes;
+ private Set<MediaType> nonIndexedMediaType;
public BinaryTextExtractor(ExtractedTextCache extractedTextCache, IndexDefinition definition, boolean reindex) {
this.extractedTextCache = extractedTextCache;
@@ -202,8 +207,36 @@ public class BinaryTextExtractor {
private boolean isSupportedMediaType(String type) {
if (supportedMediaTypes == null) {
supportedMediaTypes = getParser().getSupportedTypes(new ParseContext());
+ nonIndexedMediaType = getNonIndexedMediaTypes();
}
- return supportedMediaTypes.contains(MediaType.parse(type));
+ MediaType mediaType = MediaType.parse(type);
+ return supportedMediaTypes.contains(mediaType) && !nonIndexedMediaType.contains(mediaType);
+ }
+
+ private Set<MediaType> getNonIndexedMediaTypes() {
+ InputStream configStream = null;
+ String configSource = null;
+ try {
+ if (definition.hasCustomTikaConfig()) {
+ configSource = String.format("Custom config at %s", definition.getIndexPath());
+ configStream = definition.getTikaConfig();
+ } else {
+ URL configUrl = LuceneIndexEditorContext.class.getResource("tika-config.xml");
+ configSource = "Default : tika-config.xml";
+ if (configUrl != null) {
+ configStream = configUrl.openStream();
+ }
+ }
+
+ if (configStream != null) {
+ return TikaParserConfig.getNonIndexedMediaTypes(configStream);
+ }
+ } catch (TikaException | IOException | SAXException e) {
+ log.warn("Tika configuration not available : " + configSource, e);
+ } finally {
+ IOUtils.closeQuietly(configStream);
+ }
+ return Collections.emptySet();
}
private static Parser initializeTikaParser(IndexDefinition definition) {
Added: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfig.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfig.java?rev=1800749&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfig.java (added)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfig.java Tue Jul 4 10:31:11 2017
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.index.lucene.binary;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Set;
+
+import javax.xml.parsers.DocumentBuilder;
+
+import com.google.common.base.Strings;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+public class TikaParserConfig {
+ private static final String EMPTY_PARSER = "org.apache.tika.parser.EmptyParser";
+
+ /**
+ * Determines the set of MediaType which have been configured with an EmptyParser.
+ *
+ * @param configStream stream for tika config
+ * @return set of MediaTypes which are not indexed
+ */
+ public static Set<MediaType> getNonIndexedMediaTypes(InputStream configStream) throws
+ TikaException, IOException, SAXException {
+ Set<MediaType> result = new HashSet<>();
+ Element element = getBuilder().parse(configStream).getDocumentElement();
+ NodeList nodes = element.getElementsByTagName("parsers");
+ if (nodes.getLength() == 1) {
+ Node parentNode = nodes.item(0);
+ NodeList parsersNodes = parentNode.getChildNodes();
+ for (int i = 0; i < parsersNodes.getLength(); i++) {
+ Node node = parsersNodes.item(i);
+ if (node instanceof Element) {
+ String className = ((Element) node).getAttribute("class");
+ if (EMPTY_PARSER.equals(className)) {
+ NodeList mimes = ((Element) node).getElementsByTagName("mime");
+ parseMimeTypes(result, mimes);
+ }
+ }
+ }
+ }
+ return result;
+ }
+
+
+ private static void parseMimeTypes(Set<MediaType> result, NodeList mimes) {
+ /*
+ <parser class="org.apache.tika.parser.EmptyParser">
+ <mime>application/x-archive</mime>
+ <mime>application/x-bzip</mime>
+ <mime>application/x-bzip2</mime>
+ </parser>
+ */
+ for (int j = 0; j < mimes.getLength(); j++) {
+ Node mime = mimes.item(j);
+ if (mime instanceof Element) {
+ String mimeValue = mime.getTextContent();
+ mimeValue = Strings.emptyToNull(mimeValue);
+ if (mimeValue != null) {
+ MediaType mediaType = MediaType.parse(mimeValue.trim());
+ if (mediaType != null) {
+ result.add(mediaType);
+ }
+ }
+ }
+ }
+ }
+
+ private static DocumentBuilder getBuilder() throws TikaException {
+ return new ParseContext().getDocumentBuilder();
+ }
+}
Propchange: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfig.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfigTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfigTest.java?rev=1800749&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfigTest.java (added)
+++ jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfigTest.java Tue Jul 4 10:31:11 2017
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.index.lucene.binary;
+
+import java.io.StringReader;
+import java.util.Set;
+
+import org.apache.commons.io.input.ReaderInputStream;
+import org.apache.tika.mime.MediaType;
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+
+public class TikaParserConfigTest {
+
+ @Test
+ public void emptyParser() throws Exception{
+ String xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
+ "<properties>\n" +
+ " <detectors>\n" +
+ " <detector class=\"org.apache.tika.detect.TypeDetector\"/>\n" +
+ " </detectors>\n" +
+ " <parsers>\n" +
+ " <parser class=\"org.apache.tika.parser.DefaultParser\"/>\n" +
+ " <parser class=\"org.apache.tika.parser.EmptyParser\">\n" +
+ " <mime>application/x-archive</mime>\n" +
+ " <mime>application/x-bzip</mime>\n" +
+ " </parser>\n" +
+ " </parsers>\n" +
+ "</properties>";
+
+ Set<MediaType> types = TikaParserConfig.getNonIndexedMediaTypes(
+ new ReaderInputStream(new StringReader(xml), "UTF-8"));
+ assertEquals(2, types.size());
+ assertTrue(types.contains(MediaType.parse("application/x-archive")));
+ }
+
+}
\ No newline at end of file
Propchange: jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/TikaParserConfigTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: jackrabbit/oak/trunk/oak-parent/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-parent/pom.xml?rev=1800749&r1=1800748&r2=1800749&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-parent/pom.xml (original)
+++ jackrabbit/oak/trunk/oak-parent/pom.xml Tue Jul 4 10:31:11 2017
@@ -61,7 +61,7 @@
<slf4j.version>1.7.6</slf4j.version> <!-- sync with logback version -->
<logback.version>1.1.0</logback.version>
<h2.version>1.4.194</h2.version>
- <tika.version>1.5</tika.version>
+ <tika.version>1.15</tika.version>
<findbugs.version>3.0.4</findbugs.version>
<derby.version>10.13.1.1</derby.version>
<java.version>1.8</java.version>