You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jackrabbit.apache.org by un...@apache.org on 2012/12/03 15:51:53 UTC
svn commit: r1416552 - in /jackrabbit/trunk/jackrabbit-core/src:
main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java
test/java/org/apache/jackrabbit/core/query/lucene/TextExtractionQueryTest.java
Author: unico
Date: Mon Dec 3 14:51:52 2012
New Revision: 1416552
URL: http://svn.apache.org/viewvc?rev=1416552&view=rev
Log:
JCR-3476 only extract binary values when parser supports extracting them
Modified:
jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java
jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/TextExtractionQueryTest.java
Modified: jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java?rev=1416552&r1=1416551&r2=1416552&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/NodeIndexer.java Mon Dec 3 14:51:52 2012
@@ -46,6 +46,7 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.index.FieldInfo;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.Parser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -98,6 +99,11 @@ public class NodeIndexer {
private final Parser parser;
/**
+ * The media types supported by the parser used.
+ */
+ private Set<MediaType> supportedMediaTypes;
+
+ /**
* The indexing configuration or <code>null</code> if none is available.
*/
protected IndexingConfiguration indexingConfig;
@@ -421,7 +427,7 @@ public class NodeIndexer {
* <p/>
* This implementation checks if this {@link #node} is of type nt:resource
* and if that is the case, tries to extract text from the binary property
- * using the {@link #extractor}.
+ * using the {@link #parser}.
*
* @param doc The document to which to add the field
* @param fieldName The name of the field to add
@@ -439,7 +445,7 @@ public class NodeIndexer {
}
InternalValue type = getValue(NameConstants.JCR_MIMETYPE);
- if (type != null) {
+ if (type != null && isSupportedMediaType(type.getString())) {
Metadata metadata = new Metadata();
metadata.set(Metadata.CONTENT_TYPE, type.getString());
@@ -654,7 +660,7 @@ public class NodeIndexer {
* @param doc The document to which to add the field
* @param fieldName The name of the field to add
* @param internalValue The value for the field to add to the document.
- * @deprecated Use {@link #addStringValue(Document, String, Object, boolean)
+ * @deprecated Use {@link #addStringValue(Document, String, String, boolean)
* addStringValue(Document, String, Object, boolean)} instead.
*/
protected void addStringValue(Document doc, String fieldName, String internalValue) {
@@ -692,7 +698,7 @@ public class NodeIndexer {
* tokenized and added to the node scope fulltext
* index.
* @param boost the boost value for this string field.
- * @deprecated use {@link #addStringValue(Document, String, Object, boolean, boolean, float, boolean)} instead.
+ * @deprecated use {@link #addStringValue(Document, String, String, boolean, boolean, float, boolean)} instead.
*/
protected void addStringValue(Document doc, String fieldName,
String internalValue, boolean tokenized,
@@ -917,6 +923,20 @@ public class NodeIndexer {
}
/**
+ * Returns <code>true</code> if the provided type is among the types
+ * supported by the Tika parser we are using.
+ *
+ * @param type the type to check.
+ * @return whether the type is supported by the Tika parser we are using.
+ */
+ protected boolean isSupportedMediaType(final String type) {
+ if (supportedMediaTypes == null) {
+ supportedMediaTypes = parser.getSupportedTypes(null);
+ }
+ return supportedMediaTypes.contains(MediaType.parse(type));
+ }
+
+ /**
* Returns the boost value for the given property name.
*
* @param propertyName the name of a property.
Modified: jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/TextExtractionQueryTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/TextExtractionQueryTest.java?rev=1416552&r1=1416551&r2=1416552&view=diff
==============================================================================
--- jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/TextExtractionQueryTest.java (original)
+++ jackrabbit/trunk/jackrabbit-core/src/test/java/org/apache/jackrabbit/core/query/lucene/TextExtractionQueryTest.java Mon Dec 3 14:51:52 2012
@@ -37,7 +37,7 @@ public class TextExtractionQueryTest ext
public void testFileContains() throws Exception {
assertFileContains("test.txt", "text/plain",
"AE502DBEA2C411DEBD340AD156D89593");
- assertFileContains("test.rtf", "text/rtf", "quick brown fox");
+ assertFileContains("test.rtf", "application/rtf", "quick brown fox");
}
public void testNtFile() throws RepositoryException, IOException {