You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/04/28 14:22:39 UTC

svn commit: r938966 - in /lucene/tika/trunk/tika-core/src/main/java/org/apache/tika: config/TikaConfig.java mime/MediaType.java parser/CompositeParser.java

Author: jukka
Date: Wed Apr 28 12:22:38 2010
New Revision: 938966

URL: http://svn.apache.org/viewvc?rev=938966&view=rev
Log:
TIKA-298: CompositeParser.getParser() should use mimetype hierarchy when falling back

Modified:
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=938966&r1=938965&r2=938966&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Wed Apr 28 12:22:38 2010
@@ -47,7 +47,8 @@ import org.xml.sax.SAXException;
  */
 public class TikaConfig {
 
-    private final Map<String, Parser> parsers = new HashMap<String, Parser>();
+    private final Map<MediaType, Parser> parsers =
+        new HashMap<MediaType, Parser>();
 
     private final MimeTypes mimeTypes;
 
@@ -118,12 +119,19 @@ public class TikaConfig {
                 NodeList mimes = node.getElementsByTagName("mime");
                 if (mimes.getLength() > 0) {
                     for (int j = 0; j < mimes.getLength(); j++) {
-                        parsers.put(getText(mimes.item(j)).trim(), parser);
+                        String mime = getText(mimes.item(j));
+                        MediaType type = MediaType.parse(mime);
+                        if (type != null) {
+                            parsers.put(type, parser);
+                        } else {
+                            throw new TikaException(
+                                    "Invalid media type name: " + mime);
+                        }
                     }
                 } else {
                     ParseContext context = new ParseContext();
                     for (MediaType type : parser.getSupportedTypes(context)) {
-                        parsers.put(type.toString(), parser);
+                        parsers.put(type, parser);
                     }
                 }
             } catch (ClassNotFoundException e) {
@@ -146,7 +154,7 @@ public class TikaConfig {
         while (iterator.hasNext()) {
             Parser parser = iterator.next();
             for (MediaType type : parser.getSupportedTypes(context)) {
-                parsers.put(type.toString(), parser);
+                parsers.put(type, parser);
             }
         }
         mimeTypes = MimeTypesFactory.create("tika-mimetypes.xml");
@@ -187,7 +195,7 @@ public class TikaConfig {
         return parsers.get(mimeType);
     }
 
-    public Map<String, Parser> getParsers() {
+    public Map<MediaType, Parser> getParsers() {
         return parsers;
     }
 

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java?rev=938966&r1=938965&r2=938966&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java Wed Apr 28 12:22:38 2010
@@ -182,6 +182,17 @@ public final class MediaType {
         return subtype;
     }
 
+    /**
+     * Checks whether this media type contains parameters.
+     *
+     * @since Apache Tika 0.8
+     * @return <code>true</code> if this type has one or more parameters,
+     *         <code>false</code> otherwise
+     */
+    public boolean hasParameters() {
+        return !parameters.isEmpty();
+    }
+
     public Map<String, String> getParameters() {
         return Collections.unmodifiableMap(parameters);
     }

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java?rev=938966&r1=938965&r2=938966&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java Wed Apr 28 12:22:38 2010
@@ -18,9 +18,7 @@ package org.apache.tika.parser;
 
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.Collections;
 import java.util.HashMap;
-import java.util.HashSet;
 import java.util.Map;
 import java.util.Set;
 
@@ -43,7 +41,7 @@ public class CompositeParser implements 
     /**
      * Set of component parsers, keyed by the supported media types.
      */
-    private Map<String, Parser> parsers = new HashMap<String, Parser>();
+    private Map<MediaType, Parser> parsers = new HashMap<MediaType, Parser>();
 
     /**
      * The fallback parser, used when no better parser is available.
@@ -55,7 +53,7 @@ public class CompositeParser implements 
      *
      * @return component parsers, keyed by media type
      */
-    public Map<String, Parser> getParsers() {
+    public Map<MediaType, Parser> getParsers() {
         return parsers;
     }
 
@@ -64,7 +62,7 @@ public class CompositeParser implements 
      *
      * @param parsers component parsers, keyed by media type
      */
-    public void setParsers(Map<String, Parser> parsers) {
+    public void setParsers(Map<MediaType, Parser> parsers) {
         this.parsers = parsers;
     }
 
@@ -98,19 +96,31 @@ public class CompositeParser implements 
      * @return matching parser
      */
     protected Parser getParser(Metadata metadata) {
-        Parser parser = parsers.get(metadata.get(Metadata.CONTENT_TYPE));
-        if (parser == null) {
-            parser = fallback;
+        MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
+        if (type != null) {
+            Parser parser = parsers.get(type);
+
+            if (parser == null && type.hasParameters()) {
+                type = type.getBaseType();
+                parser = parsers.get(type);
+            }
+
+            if (parser != null) {
+                return parser;
+            } else {
+                for (MediaType parserType : parsers.keySet()) {
+                    if (parserType != null
+                            && type.isSpecializationOf(parserType)) {
+                        return parsers.get(parserType);
+                    }
+                }
+            }
         }
-        return parser;
+        return fallback;
     }
 
     public Set<MediaType> getSupportedTypes(ParseContext context) {
-        Set<MediaType> supportedTypes = new HashSet<MediaType>();
-        for (String type : parsers.keySet()) {
-            supportedTypes.add(MediaType.parse(type));
-        }
-        return Collections.unmodifiableSet(supportedTypes);
+        return parsers.keySet();
     }
 
     /**