You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by ch...@apache.org on 2017/08/25 07:40:44 UTC
svn commit: r1806131 - in /jackrabbit/oak/trunk:
oak-doc/src/site/markdown/query/
oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/
oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/
oak-lucene/src/te...
Author: chetanm
Date: Fri Aug 25 07:40:44 2017
New Revision: 1806131
URL: http://svn.apache.org/viewvc?rev=1806131&view=rev
Log:
OAK-6587 - Provide a way to "force" Tika to treat binaries with a different mime type than the jcr:mimeType property
Applying slightly modified patch provided by Justin Edelson
This closes #65
Modified:
jackrabbit/oak/trunk/oak-doc/src/site/markdown/query/lucene.md
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java
jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java
jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinitionTest.java
Modified: jackrabbit/oak/trunk/oak-doc/src/site/markdown/query/lucene.md
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-doc/src/site/markdown/query/lucene.md?rev=1806131&r1=1806130&r2=1806131&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-doc/src/site/markdown/query/lucene.md (original)
+++ jackrabbit/oak/trunk/oak-doc/src/site/markdown/query/lucene.md Fri Aug 25 07:40:44 2017
@@ -37,6 +37,7 @@
* [LuceneIndexProvider Configuration](#osgi-config)
* [Tika Config](#tika-config)
* [Mime type usage](#mime-type-usage)
+ * [Mime type mapping](#mime-type-mapping)
* [Non Root Index Definitions](#non-root-index)
* [Native Query and Index Selection](#native-query)
* [CopyOnRead](#copy-on-read)
@@ -969,6 +970,23 @@ and that is supported by Tika. By defaul
instead of default `DefaultDetector` which relies on the `jcr:mimeType` to pick up the
right parser.
+#### <a name="mime-type-mapping"></a>Mime type mapping
+
+`@since Oak 1.7.7`
+
+In certain circumstances, it may be desirable to pass a value other than the binary's `jcr:mimeType`
+property to the Tika parser. For example, this is necessary if a binary has an application-specific
+mime type, but is parsable by the standard Tika parser for some generic type. To support these cases,
+create a node structure under the `tika/mimeTypes` node that mirrors the mime type (one node for the
+type, one child node for the subtype) and set the `mappedType` property on the subtype node, e.g.
+
+ + tika
+ + mimeTypes (nt:unstructured)
+ + application (nt:unstructured)
+ + vnd.mycompany-document (nt:unstructured)
+ - mappedType = application/pdf
+
+When this index is indexing a binary of type `application/vnd.mycompany-document` it will force Tika
+to treat it as a binary of type `application/pdf`. Mime types without a mapping are passed to Tika
+unchanged.
### <a name="non-root-index"></a>Non Root Index Definitions
Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java?rev=1806131&r1=1806130&r2=1806131&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinition.java Fri Aug 25 07:40:44 2017
@@ -228,6 +228,8 @@ public final class IndexDefinition imple
private final boolean hasCustomTikaConfig;
+ private final Map<String, String> customTikaMimeTypeMappings;
+
private final int maxFieldLength;
private final int maxExtractLength;
@@ -376,6 +378,7 @@ public final class IndexDefinition imple
this.analyzers = collectAnalyzers(defn);
this.analyzer = createAnalyzer();
this.hasCustomTikaConfig = getTikaConfigNode().exists();
+ this.customTikaMimeTypeMappings = buildMimeTypeMap(definition.getChildNode(TIKA).getChildNode(TIKA_MIME_TYPES));
this.maxExtractLength = determineMaxExtractLength();
this.suggesterUpdateFrequencyMinutes = evaluateSuggesterUpdateFrequencyMinutes(defn,
DEFAULT_SUGGESTER_UPDATE_FREQUENCY_MINUTES);
@@ -506,6 +509,10 @@ public final class IndexDefinition imple
return ConfigUtil.getBlob(getTikaConfigNode(), TIKA_CONFIG).getNewStream();
}
+ public String getTikaMappedMimeType(String type) {
+ return customTikaMimeTypeMappings.getOrDefault(type, type);
+ }
+
public String getIndexName() {
return indexName;
}
@@ -1752,4 +1759,20 @@ public final class IndexDefinition imple
return storedState.exists() ? storedState : defn;
}
+    /**
+     * Collects the mime type mappings defined below the given node.
+     * The node is read as a two level structure: each child is a type, each
+     * grandchild a subtype. A grandchild which has the
+     * {@code TIKA_MAPPED_TYPE} property contributes one entry, keyed by
+     * {@code <child name>/<grandchild name>} and valued with that property.
+     *
+     * @param node node whose children and grandchildren are scanned
+     * @return immutable map from mime type to mapped mime type; empty if
+     *         no grandchild defines the property
+     */
+    private static Map<String, String> buildMimeTypeMap(NodeState node) {
+        ImmutableMap.Builder<String, String> mappings = ImmutableMap.builder();
+        for (ChildNodeEntry typeEntry : node.getChildNodeEntries()) {
+            String typePrefix = typeEntry.getName() + '/';
+            for (ChildNodeEntry subTypeEntry : typeEntry.getNodeState().getChildNodeEntries()) {
+                PropertyState mappedType = subTypeEntry.getNodeState().getProperty(TIKA_MAPPED_TYPE);
+                if (mappedType == null) {
+                    // subtype node without a mapping - nothing to register
+                    continue;
+                }
+                // NOTE(review): presumably fails for a multi-valued property - confirm
+                mappings.put(typePrefix + subTypeEntry.getName(), mappedType.getValue(Type.STRING));
+            }
+        }
+        return mappings.build();
+    }
+
}
Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java?rev=1806131&r1=1806130&r2=1806131&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/LuceneIndexConstants.java Fri Aug 25 07:40:44 2017
@@ -247,6 +247,16 @@ public interface LuceneIndexConstants {
String TIKA_MAX_EXTRACT_LENGTH = "maxExtractLength";
/**
+ * Name of the config node under the tika node which defines mime type mappings.
+ * The mappings are stored as a two level node structure below it: one child
+ * node per type and, below that, one child node per subtype.
+ */
+ String TIKA_MIME_TYPES = "mimeTypes";
+
+ /**
+ * Property name within the mime type structure which defines a mime type mapping.
+ * It is set on a subtype node and holds the mime type to use instead,
+ * e.g. {@code application/pdf}.
+ */
+ String TIKA_MAPPED_TYPE = "mappedType";
+
+ /**
* The maximum number of terms that will be indexed for a single field in a
* document. This limits the amount of memory required for indexing, so that
* collections with very large files will not crash the indexing process by
Modified: jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java?rev=1806131&r1=1806130&r2=1806131&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/main/java/org/apache/jackrabbit/oak/plugins/index/lucene/binary/BinaryTextExtractor.java Fri Aug 25 07:40:44 2017
@@ -92,6 +92,7 @@ public class BinaryTextExtractor {
//jcr:mimeType is mandatory for a binary to be indexed
String type = state.getString(JcrConstants.JCR_MIMETYPE);
+ type = definition.getTikaMappedMimeType(type);
if (type == null || !isSupportedMediaType(type)) {
log.trace(
Modified: jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinitionTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinitionTest.java?rev=1806131&r1=1806130&r2=1806131&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinitionTest.java (original)
+++ jackrabbit/oak/trunk/oak-lucene/src/test/java/org/apache/jackrabbit/oak/plugins/index/lucene/IndexDefinitionTest.java Fri Aug 25 07:40:44 2017
@@ -615,6 +615,24 @@ public class IndexDefinitionTest {
}
@Test
+    // Verifies that mime types resolve to themselves unless a mapping is
+    // configured under tika/mimeTypes of the index definition.
+    public void customTikaMimeTypes() throws Exception{
+        NodeBuilder indexBuilder = newLuceneIndexDefinition(builder.child(INDEX_DEFINITIONS_NAME),
+                "lucene", of(TYPENAME_STRING));
+
+        // Without any mapping configured the type is returned as is
+        IndexDefinition withoutMappings = new IndexDefinition(root, indexBuilder.getNodeState(), "/foo");
+        assertEquals("application/test", withoutMappings.getTikaMappedMimeType("application/test"));
+
+        // Map application/test and application/test2 to text/plain
+        NodeBuilder mimeTypes = indexBuilder.child(LuceneIndexConstants.TIKA)
+                .child(LuceneIndexConstants.TIKA_MIME_TYPES);
+        NodeBuilder application = mimeTypes.child("application");
+        application.child("test").setProperty(LuceneIndexConstants.TIKA_MAPPED_TYPE, "text/plain");
+        application.child("test2").setProperty(LuceneIndexConstants.TIKA_MAPPED_TYPE, "text/plain");
+
+        IndexDefinition withMappings = new IndexDefinition(root, indexBuilder.getNodeState(), "/foo");
+        assertEquals("text/plain", withMappings.getTikaMappedMimeType("application/test"));
+        assertEquals("text/plain", withMappings.getTikaMappedMimeType("application/test2"));
+        assertEquals("application/test-unmapped", withMappings.getTikaMappedMimeType("application/test-unmapped"));
+    }
+
+ @Test
public void maxExtractLength() throws Exception{
NodeBuilder defnb = newLuceneIndexDefinition(builder.child(INDEX_DEFINITIONS_NAME),
"lucene", of(TYPENAME_STRING));