You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by ch...@apache.org on 2017/08/25 07:40:44 UTC

svn commit: r1806131 - in /jackrabbit/oak/trunk: oak-doc/src/site/markdown/query/ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/ oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/ oak-lucene/src/te...

Author: chetanm
Date: Fri Aug 25 07:40:44 2017
New Revision: 1806131

URL: http://svn.apache.org/viewvc?rev=1806131&view=rev
Log:
OAK-6587 - Provide a way to "force" Tika to treat binaries with a different mime type than the jcr:mimeType property

Applying slightly modified patch provided by Justin Edelson

This closes #65

Modified:
    jackrabbit/oak/trunk/oak-doc/src/site/markdown/query/lucene.md
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java
    jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java
    jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinitionTest.java

Modified: jackrabbit/oak/trunk/oak-doc/src/site/markdown/query/lucene.md
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-doc/src/site/markdown/query/lucene.md?rev=1806131&r1=1806130&r2=1806131&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-doc/src/site/markdown/query/lucene.md (original)
+++ jackrabbit/oak/trunk/oak-doc/src/site/markdown/query/lucene.md Fri Aug 25 07:40:44 2017
@@ -37,6 +37,7 @@
 * [LuceneIndexProvider Configuration](#osgi-config)
 * [Tika Config](#tika-config)
     * [Mime type usage](#mime-type-usage)
+    * [Mime type mapping](#mime-type-mapping)
 * [Non Root Index Definitions](#non-root-index)
 * [Native Query and Index Selection](#native-query)
 * [CopyOnRead](#copy-on-read)
@@ -969,6 +970,23 @@ and that is supported by Tika. By defaul
 instead of default `DefaultDetector` which relies on the `jcr:mimeType` to pick up the
 right parser. 
 
+#### <a name="mime-type-mapping"></a>Mime type mapping
+
+`@since Oak 1.7.7`
+
+In certain circumstances, it may be desired to pass a value other than the `jcr:mimeType` property
+into the Tika parser. For example, this would be necessary if a binary has an application-specific 
+mime type, but is parsable by the standard Tika parser for some generic type. To support these cases,
+create a node structure under the `tika/mimeTypes` node following the mime type structure, e.g.
+
+    + tika
+        + mimeTypes (nt:unstructured)
+          + application (nt:unstructured)
+            + vnd.mycompany-document (nt:unstructured)
+              - mappedType = application/pdf
+
+When this index is indexing a binary of type `application/vnd.mycompany-document` it will force Tika
+to treat it as a binary of type `application/pdf`.
 
 ### <a name="non-root-index"></a>Non Root Index Definitions
 

Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java?rev=1806131&r1=1806130&r2=1806131&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java Fri Aug 25 07:40:44 2017
@@ -228,6 +228,8 @@ public final class IndexDefinition imple
 
     private final boolean hasCustomTikaConfig;
 
+    private final Map<String, String> customTikaMimeTypeMappings;
+
     private final int maxFieldLength;
 
     private final int maxExtractLength;
@@ -376,6 +378,7 @@ public final class IndexDefinition imple
         this.analyzers = collectAnalyzers(defn);
         this.analyzer = createAnalyzer();
         this.hasCustomTikaConfig = getTikaConfigNode().exists();
+        this.customTikaMimeTypeMappings = buildMimeTypeMap(definition.getChildNode(TIKA).getChildNode(TIKA_MIME_TYPES));
         this.maxExtractLength = determineMaxExtractLength();
         this.suggesterUpdateFrequencyMinutes = evaluateSuggesterUpdateFrequencyMinutes(defn,
                 DEFAULT_SUGGESTER_UPDATE_FREQUENCY_MINUTES);
@@ -506,6 +509,10 @@ public final class IndexDefinition imple
         return ConfigUtil.getBlob(getTikaConfigNode(), TIKA_CONFIG).getNewStream();
     }
 
+    public String getTikaMappedMimeType(String type) {
+        return customTikaMimeTypeMappings.getOrDefault(type, type);
+    }
+
     public String getIndexName() {
         return indexName;
     }
@@ -1752,4 +1759,20 @@ public final class IndexDefinition imple
         return storedState.exists() ? storedState : defn;
     }
 
+    private static Map<String, String> buildMimeTypeMap(NodeState node) {
+        ImmutableMap.Builder<String, String> map = ImmutableMap.builder();
+        for (ChildNodeEntry child : node.getChildNodeEntries()) {
+            for (ChildNodeEntry subChild : child.getNodeState().getChildNodeEntries()) {
+                StringBuilder typeBuilder = new StringBuilder(child.getName())
+                        .append('/')
+                        .append(subChild.getName());
+                PropertyState property = subChild.getNodeState().getProperty(TIKA_MAPPED_TYPE);
+                if (property != null) {
+                    map.put(typeBuilder.toString(), property.getValue(Type.STRING));
+                }
+            }
+        }
+        return map.build();
+    }
+
 }

Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java?rev=1806131&r1=1806130&r2=1806131&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java Fri Aug 25 07:40:44 2017
@@ -247,6 +247,16 @@ public interface LuceneIndexConstants {
     String TIKA_MAX_EXTRACT_LENGTH = "maxExtractLength";
 
     /**
+     *  Config node under tika which defines mime type mappings
+     */
+    String TIKA_MIME_TYPES = "mimeTypes";
+
+    /**
+     * Property name within the mime type structure which defines a mime type mapping
+     */
+    String TIKA_MAPPED_TYPE = "mappedType";
+
+    /**
      * The maximum number of terms that will be indexed for a single field in a
      * document.  This limits the amount of memory required for indexing, so that
      * collections with very large files will not crash the indexing process by

Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java?rev=1806131&r1=1806130&r2=1806131&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java Fri Aug 25 07:40:44 2017
@@ -92,6 +92,7 @@ public class BinaryTextExtractor {
 
         //jcr:mimeType is mandatory for a binary to be indexed
         String type = state.getString(JcrConstants.JCR_MIMETYPE);
+        type = definition.getTikaMappedMimeType(type);
 
         if (type == null || !isSupportedMediaType(type)) {
             log.trace(

Modified: jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinitionTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinitionTest.java?rev=1806131&r1=1806130&r2=1806131&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinitionTest.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinitionTest.java Fri Aug 25 07:40:44 2017
@@ -615,6 +615,24 @@ public class IndexDefinitionTest {
     }
 
     @Test
+    public void customTikaMimeTypes() throws Exception{
+        NodeBuilder defnb = newLuceneIndexDefinition(builder.child(INDEX_DEFINITIONS_NAME),
+                "lucene", of(TYPENAME_STRING));
+        IndexDefinition defn = new IndexDefinition(root, defnb.getNodeState(), "/foo");
+        assertEquals("application/test", defn.getTikaMappedMimeType("application/test"));
+
+        NodeBuilder app =defnb.child(LuceneIndexConstants.TIKA)
+                .child(LuceneIndexConstants.TIKA_MIME_TYPES)
+                .child("application");
+        app.child("test").setProperty(LuceneIndexConstants.TIKA_MAPPED_TYPE, "text/plain");
+        app.child("test2").setProperty(LuceneIndexConstants.TIKA_MAPPED_TYPE, "text/plain");
+        defn = new IndexDefinition(root, defnb.getNodeState(), "/foo");
+        assertEquals("text/plain", defn.getTikaMappedMimeType("application/test"));
+        assertEquals("text/plain", defn.getTikaMappedMimeType("application/test2"));
+        assertEquals("application/test-unmapped", defn.getTikaMappedMimeType("application/test-unmapped"));
+    }
+
+    @Test
     public void maxExtractLength() throws Exception{
         NodeBuilder defnb = newLuceneIndexDefinition(builder.child(INDEX_DEFINITIONS_NAME),
                 "lucene", of(TYPENAME_STRING));