You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by un...@apache.org on 2012/12/03 15:51:53 UTC
svn commit: r1416552 - in /jackrabbit/trunk/jackrabbit-core/src: main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java test/java/org/apache/jackrabbit/core/query/lucene/TextExtractionQueryTest.java

Author: unico
Date: Mon Dec  3 14:51:52 2012
New Revision: 1416552

URL: http://svn.apache.org/viewvc?rev=1416552&view=rev
Log:
JCR-3476 only extract binary values when parser supports extracting them

Modified:
    jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java
    jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/TextExtractionQueryTest.java

Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java?rev=1416552&r1=1416551&r2=1416552&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java Mon Dec  3 14:51:52 2012
@@ -46,6 +46,7 @@ import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.Parser;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -98,6 +99,11 @@ public class NodeIndexer {
     private final Parser parser;
 
     /**
+     * The media types supported by the parser used.
+     */
+    private Set<MediaType> supportedMediaTypes;
+
+    /**
      * The indexing configuration or <code>null</code> if none is available.
      */
     protected IndexingConfiguration indexingConfig;
@@ -421,7 +427,7 @@ public class NodeIndexer {
      * <p/>
      * This implementation checks if this {@link #node} is of type nt:resource
      * and if that is the case, tries to extract text from the binary property
-     * using the {@link #extractor}.
+     * using the {@link #parser}.
      *
      * @param doc           The document to which to add the field
      * @param fieldName     The name of the field to add
@@ -439,7 +445,7 @@ public class NodeIndexer {
             }
 
             InternalValue type = getValue(NameConstants.JCR_MIMETYPE);
-            if (type != null) {
+            if (type != null && isSupportedMediaType(type.getString())) {
                 Metadata metadata = new Metadata();
                 metadata.set(Metadata.CONTENT_TYPE, type.getString());
 
@@ -654,7 +660,7 @@ public class NodeIndexer {
      * @param doc           The document to which to add the field
      * @param fieldName     The name of the field to add
      * @param internalValue The value for the field to add to the document.
-     * @deprecated Use {@link #addStringValue(Document, String, Object, boolean)
+     * @deprecated Use {@link #addStringValue(Document, String, String, boolean)
      *             addStringValue(Document, String, Object, boolean)} instead.
      */
     protected void addStringValue(Document doc, String fieldName, String internalValue) {
@@ -692,7 +698,7 @@ public class NodeIndexer {
      *                           tokenized and added to the node scope fulltext
      *                           index.
      * @param boost              the boost value for this string field.
-     * @deprecated use {@link #addStringValue(Document, String, Object, boolean, boolean, float, boolean)} instead.
+     * @deprecated use {@link #addStringValue(Document, String, String, boolean, boolean, float, boolean)} instead.
      */
     protected void addStringValue(Document doc, String fieldName,
                                   String internalValue, boolean tokenized,
@@ -917,6 +923,20 @@ public class NodeIndexer {
     }
 
     /**
+     * Returns <code>true</code> if the provided type is among the types
+     * supported by the Tika parser we are using.
+     *
+     * @param type  the type to check.
+     * @return whether the type is supported by the Tika parser we are using.
+     */
+    protected boolean isSupportedMediaType(final String type) {
+        if (supportedMediaTypes == null) {
+            supportedMediaTypes = parser.getSupportedTypes(null);
+        }
+        return supportedMediaTypes.contains(MediaType.parse(type));
+    }
+
+    /**
      * Returns the boost value for the given property name.
      *
      * @param propertyName the name of a property.

Modified: jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/TextExtractionQueryTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/TextExtractionQueryTest.java?rev=1416552&r1=1416551&r2=1416552&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/TextExtractionQueryTest.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/TextExtractionQueryTest.java Mon Dec  3 14:51:52 2012
@@ -37,7 +37,7 @@ public class TextExtractionQueryTest ext
     public void testFileContains() throws Exception {
         assertFileContains("test.txt", "text/plain",
                 "AE502DBEA2C411DEBD340AD156D89593");
-        assertFileContains("test.rtf", "text/rtf", "quick brown fox");
+        assertFileContains("test.rtf", "application/rtf", "quick brown fox");
     }
 
     public void testNtFile() throws RepositoryException, IOException {