You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/04/28 14:22:39 UTC
svn commit: r938966 - in
/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika:
config/TikaConfig.java mime/MediaType.java parser/CompositeParser.java
Author: jukka
Date: Wed Apr 28 12:22:38 2010
New Revision: 938966
URL: http://svn.apache.org/viewvc?rev=938966&view=rev
Log:
TIKA-298: CompositeParser.getParser() should use mimetype hierarchy when falling back
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=938966&r1=938965&r2=938966&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Wed Apr 28 12:22:38 2010
@@ -47,7 +47,8 @@ import org.xml.sax.SAXException;
*/
public class TikaConfig {
- private final Map<String, Parser> parsers = new HashMap<String, Parser>();
+ private final Map<MediaType, Parser> parsers =
+ new HashMap<MediaType, Parser>();
private final MimeTypes mimeTypes;
@@ -118,12 +119,19 @@ public class TikaConfig {
NodeList mimes = node.getElementsByTagName("mime");
if (mimes.getLength() > 0) {
for (int j = 0; j < mimes.getLength(); j++) {
- parsers.put(getText(mimes.item(j)).trim(), parser);
+ String mime = getText(mimes.item(j));
+ MediaType type = MediaType.parse(mime);
+ if (type != null) {
+ parsers.put(type, parser);
+ } else {
+ throw new TikaException(
+ "Invalid media type name: " + mime);
+ }
}
} else {
ParseContext context = new ParseContext();
for (MediaType type : parser.getSupportedTypes(context)) {
- parsers.put(type.toString(), parser);
+ parsers.put(type, parser);
}
}
} catch (ClassNotFoundException e) {
@@ -146,7 +154,7 @@ public class TikaConfig {
while (iterator.hasNext()) {
Parser parser = iterator.next();
for (MediaType type : parser.getSupportedTypes(context)) {
- parsers.put(type.toString(), parser);
+ parsers.put(type, parser);
}
}
mimeTypes = MimeTypesFactory.create("tika-mimetypes.xml");
@@ -187,7 +195,7 @@ public class TikaConfig {
return parsers.get(mimeType);
}
- public Map<String, Parser> getParsers() {
+ public Map<MediaType, Parser> getParsers() {
return parsers;
}
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java?rev=938966&r1=938965&r2=938966&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java Wed Apr 28 12:22:38 2010
@@ -182,6 +182,17 @@ public final class MediaType {
return subtype;
}
+ /**
+ * Checks whether this media type contains parameters.
+ *
+ * @since Apache Tika 0.8
+ * @return <code>true</code> if this type has one or more parameters,
+ * <code>false</code> otherwise
+ */
+ public boolean hasParameters() {
+ return !parameters.isEmpty();
+ }
+
public Map<String, String> getParameters() {
return Collections.unmodifiableMap(parameters);
}
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java?rev=938966&r1=938965&r2=938966&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java Wed Apr 28 12:22:38 2010
@@ -18,9 +18,7 @@ package org.apache.tika.parser;
import java.io.IOException;
import java.io.InputStream;
-import java.util.Collections;
import java.util.HashMap;
-import java.util.HashSet;
import java.util.Map;
import java.util.Set;
@@ -43,7 +41,7 @@ public class CompositeParser implements
/**
* Set of component parsers, keyed by the supported media types.
*/
- private Map<String, Parser> parsers = new HashMap<String, Parser>();
+ private Map<MediaType, Parser> parsers = new HashMap<MediaType, Parser>();
/**
* The fallback parser, used when no better parser is available.
@@ -55,7 +53,7 @@ public class CompositeParser implements
*
* @return component parsers, keyed by media type
*/
- public Map<String, Parser> getParsers() {
+ public Map<MediaType, Parser> getParsers() {
return parsers;
}
@@ -64,7 +62,7 @@ public class CompositeParser implements
*
* @param parsers component parsers, keyed by media type
*/
- public void setParsers(Map<String, Parser> parsers) {
+ public void setParsers(Map<MediaType, Parser> parsers) {
this.parsers = parsers;
}
@@ -98,19 +96,31 @@ public class CompositeParser implements
* @return matching parser
*/
protected Parser getParser(Metadata metadata) {
- Parser parser = parsers.get(metadata.get(Metadata.CONTENT_TYPE));
- if (parser == null) {
- parser = fallback;
+ MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
+ if (type != null) {
+ Parser parser = parsers.get(type);
+
+ if (parser == null && type.hasParameters()) {
+ type = type.getBaseType();
+ parser = parsers.get(type);
+ }
+
+ if (parser != null) {
+ return parser;
+ } else {
+ for (MediaType parserType : parsers.keySet()) {
+ if (parserType != null
+ && type.isSpecializationOf(parserType)) {
+ return parsers.get(parserType);
+ }
+ }
+ }
}
- return parser;
+ return fallback;
}
public Set<MediaType> getSupportedTypes(ParseContext context) {
- Set<MediaType> supportedTypes = new HashSet<MediaType>();
- for (String type : parsers.keySet()) {
- supportedTypes.add(MediaType.parse(type));
- }
- return Collections.unmodifiableSet(supportedTypes);
+ return parsers.keySet();
}
/**