You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/06/18 14:03:09 UTC
svn commit: r955959 - in /tika/trunk:
tika-core/src/main/java/org/apache/tika/config/
tika-core/src/main/java/org/apache/tika/detect/
tika-core/src/main/java/org/apache/tika/mime/
tika-core/src/main/java/org/apache/tika/parser/ tika-core/src/test/java/...
Author: jukka
Date: Fri Jun 18 12:03:08 2010
New Revision: 955959
URL: http://svn.apache.org/viewvc?rev=955959&view=rev
Log:
TIKA-308: Improve supertype handling in type registry
Move all supertype information into MediaTypeRegistry and leverage it in CompositeParser and elsewhere
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
tika/trunk/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Patterns.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/PatternsTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=955959&r1=955958&r2=955959&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Fri Jun 18 12:03:08 2010
@@ -31,6 +31,7 @@ import javax.xml.parsers.ParserConfigura
import org.apache.tika.exception.TikaException;
import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.mime.MimeTypesFactory;
@@ -251,6 +252,10 @@ public class TikaConfig {
return mimeTypes;
}
+ public MediaTypeRegistry getMediaTypeRegistry() {
+ return mimeTypes.getMediaTypeRegistry();
+ }
+
/**
* Provides a default configuration (TikaConfig). Currently creates a
* new instance each time it's called; we may be able to have it
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java?rev=955959&r1=955958&r2=955959&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java Fri Jun 18 12:03:08 2010
@@ -23,20 +23,34 @@ import java.util.List;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
/**
* Content type detector that combines multiple different detection mechanisms.
*/
public class CompositeDetector implements Detector {
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = 5980683158436430252L;
+
+ private final MediaTypeRegistry registry;
+
private final List<Detector> detectors;
- public CompositeDetector(List<Detector> detectors) {
+ public CompositeDetector(
+ MediaTypeRegistry registry, List<Detector> detectors) {
+ this.registry = registry;
this.detectors = detectors;
}
+ public CompositeDetector(List<Detector> detectors) {
+ this(new MediaTypeRegistry(), detectors);
+ }
+
public CompositeDetector(Detector... detectors) {
- this.detectors = Arrays.asList(detectors);
+ this(Arrays.asList(detectors));
}
public MediaType detect(InputStream input, Metadata metadata)
@@ -44,7 +58,7 @@ public class CompositeDetector implement
MediaType type = MediaType.OCTET_STREAM;
for (Detector detector : detectors) {
MediaType detected = detector.detect(input, metadata);
- if (detected.isSpecializationOf(type)) {
+ if (registry.isSpecializationOf(detected, type)) {
type = detected;
}
}
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java?rev=955959&r1=955958&r2=955959&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaType.java Fri Jun 18 12:03:08 2010
@@ -205,23 +205,6 @@ public final class MediaType implements
return Collections.unmodifiableMap(parameters);
}
- public boolean isSpecializationOf(MediaType that) {
- if (OCTET_STREAM.equals(that)) {
- return true;
- } else if (!type.equals(that.type)) {
- return false;
- } else if (!parameters.entrySet().containsAll(that.parameters.entrySet())) {
- return false;
- } else if (TEXT_PLAIN.equals(that.getBaseType())) {
- return true;
- } else if (APPLICATION_XML.equals(that.getBaseType())
- && subtype.endsWith("+xml")) {
- return true;
- } else {
- return subtype.equals(that.subtype);
- }
- }
-
public String toString() {
StringBuilder builder = new StringBuilder();
builder.append(type);
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java?rev=955959&r1=955958&r2=955959&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java Fri Jun 18 12:03:08 2010
@@ -18,6 +18,8 @@ package org.apache.tika.mime;
import java.util.HashMap;
import java.util.Map;
+import java.util.SortedSet;
+import java.util.TreeSet;
/**
* Registry of known Internet media types.
@@ -25,8 +27,7 @@ import java.util.Map;
public class MediaTypeRegistry {
/**
- * Registry of known media types, including type aliases. All the types
- * in this map are base types, i.e. they have no parameters. A canonical
+ * Registry of known media types, including type aliases. A canonical
* media type is handled as an identity mapping, while an alias is stored
* as a mapping from the alias to the corresponding canonical type.
*/
@@ -35,31 +36,49 @@ public class MediaTypeRegistry {
/**
* Known type inheritance relationships. The mapping is from a media type
- * to the closest supertype. All types in this map are canonical and have
- * no parameters.
+ * to the closest supertype.
*/
private final Map<MediaType, MediaType> inheritance =
new HashMap<MediaType, MediaType>();
- public void addType(MediaType type) {
- if (type == null || type.hasParameters()) {
- throw new IllegalArgumentException();
- } else if (registry.containsKey(type)) {
- throw new IllegalStateException();
- } else {
- registry.put(type, type);
+ /**
+ * Returns the set of all known canonical media types. Type aliases are
+ * not included in the returned set.
+ *
+ * @since Apache Tika 0.8
+ * @return canonical media types
+ */
+ public SortedSet<MediaType> getTypes() {
+ return new TreeSet<MediaType>(registry.values());
+ }
+
+ /**
+ * Returns the set of known aliases of the given canonical media type.
+ *
+ * @since Apache Tika 0.8
+ * @param type canonical media type
+ * @return known aliases
+ */
+ public SortedSet<MediaType> getAliases(MediaType type) {
+ SortedSet<MediaType> aliases = new TreeSet<MediaType>();
+ for (Map.Entry<MediaType, MediaType> entry : registry.entrySet()) {
+ if (entry.getValue().equals(type) && !entry.getKey().equals(type)) {
+ aliases.add(entry.getKey());
+ }
}
+ return aliases;
+ }
+
+ public void addType(MediaType type) {
+ registry.put(type, type);
}
public void addAlias(MediaType type, MediaType alias) {
- if (type == null || alias == null
- || type.hasParameters() || alias.hasParameters()) {
- throw new IllegalArgumentException();
- } else if (!registry.containsKey(type) || registry.containsKey(alias)) {
- throw new IllegalStateException();
- } else {
- registry.put(alias, type);
- }
+ registry.put(alias, type);
+ }
+
+ public void addSuperType(MediaType type, MediaType supertype) {
+ inheritance.put(type, supertype);
}
public MediaType normalize(MediaType type) {
@@ -74,6 +93,20 @@ public class MediaTypeRegistry {
}
/**
+ * Checks whether the given media type a is a specialization of a more
+ * generic type b.
+ *
+ * @param a media type
+ * @param b suspected supertype
+ * @return <code>true</code> if b is a supertype of a,
+ * <code>false</code> otherwise
+ */
+ public boolean isSpecializationOf(MediaType a, MediaType b) {
+ MediaType x = getSuperType(a);
+ return x != null && (x.equals(b) || isSpecializationOf(x, b));
+ }
+
+ /**
* Returns the supertype of the given type. If the given type has any
* parameters, then the respective base type is returned. Otherwise
* built-in heuristics like text/... -> text/plain and
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java?rev=955959&r1=955958&r2=955959&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeType.java Fri Jun 18 12:03:08 2010
@@ -68,11 +68,6 @@ public final class MimeType implements C
}
/**
- * The media type registry that contains this type.
- */
- private final MimeTypes registry;
-
- /**
* The normalized media type name.
*/
private final MediaType type;
@@ -82,12 +77,6 @@ public final class MimeType implements C
*/
private String description = "";
- /**
- * The parent type of this media type, or <code>null</code> if this
- * is a top-level type.
- */
- private MimeType superType = null;
-
/** The magics associated to this Mime-Type */
private final ArrayList<Magic> magics = new ArrayList<Magic>();
@@ -104,17 +93,12 @@ public final class MimeType implements C
* {@link MimeTypes#forName(String)} to keep the media type registry
* up to date.
*
- * @param registry the media type registry that contains this type
* @param type normalized media type name
*/
- MimeType(MimeTypes registry, MediaType type) {
- if (registry == null) {
- throw new IllegalArgumentException("Registry is missing");
- }
+ MimeType(MediaType type) {
if (type == null) {
throw new IllegalArgumentException("Media type name is missing");
}
- this.registry = registry;
this.type = type;
}
@@ -137,54 +121,6 @@ public final class MimeType implements C
}
/**
- * Returns the parent of this media type.
- *
- * @return parent media type, or <code>null</code>
- */
- public MimeType getSuperType() {
- return superType;
- }
-
- public void setSuperType(MimeType type) throws MimeTypeException {
- if (type == null) {
- throw new IllegalArgumentException("MimeType is missing");
- }
- if (type.registry != registry) {
- throw new IllegalArgumentException("MimeType is from a different registry");
- }
- if (this.isDescendantOf(type)) {
- // ignore, already a descendant of the given type
- } else if (this == type) {
- throw new MimeTypeException(
- "Media type can not inherit itself: " + type);
- } else if (type.isDescendantOf(this)) {
- throw new MimeTypeException(
- "Media type can not inherit its descendant: " + type);
- } else if (superType == null) {
- superType = type;
- } else if (type.isDescendantOf(superType)) {
- superType = type;
- } else {
- throw new MimeTypeException(
- "Conflicting media type inheritance: " + type);
- }
- }
-
- public boolean isDescendantOf(MimeType type) {
- if (type == null) {
- throw new IllegalArgumentException("MimeType is missing");
- }
- synchronized (registry) {
- for (MimeType t = superType; t != null; t = t.superType) {
- if (t == type) {
- return true;
- }
- }
- return false;
- }
- }
-
- /**
* Returns the description of this media type.
*
* @return media type description
@@ -206,19 +142,6 @@ public final class MimeType implements C
}
/**
- * Adds an alias name for this media type.
- *
- * @param alias media type alias (case insensitive)
- * @throws MimeTypeException if the alias is invalid or
- * already registered for another media type
- */
- public void addAlias(MediaType alias) throws MimeTypeException {
- if (!alias.isSpecializationOf(type)) {
- registry.addAlias(this, alias);
- }
- }
-
- /**
* Add some rootXML info to this mime-type
*
* @param namespaceURI
@@ -363,21 +286,7 @@ public final class MimeType implements C
//----------------------------------------------------------< Comparable >
public int compareTo(MimeType mime) {
- if (mime == null) {
- throw new IllegalArgumentException("MimeType is missing");
- } else if (mime == this) {
- return 0;
- } else if (this.isDescendantOf(mime)) {
- return 1;
- } else if (mime.isDescendantOf(this)) {
- return -1;
- } else if (superType != null) {
- return superType.compareTo(mime);
- } else if (mime.superType != null) {
- return compareTo(mime.superType);
- } else {
- return type.compareTo(mime.type);
- }
+ return type.compareTo(mime.type);
}
//--------------------------------------------------------------< Object >
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=955959&r1=955958&r2=955959&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java Fri Jun 18 12:03:08 2010
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -125,13 +125,18 @@ public final class MimeTypes implements
* xml type, application/xml
*/
private final MimeType xmlMimeType;
-
- /** All the registered MimeTypes indexed on their name */
+
+ /**
+ * Registered media types and their aliases.
+ */
+ private final MediaTypeRegistry registry = new MediaTypeRegistry();
+
+ /** All the registered MimeTypes indexed on their canonical names */
private final Map<MediaType, MimeType> types =
new HashMap<MediaType, MimeType>();
/** The patterns matcher */
- private Patterns patterns = new Patterns();
+ private Patterns patterns = new Patterns(registry);
/** List of all registered magics */
private SortedSet<Magic> magics = new TreeSet<Magic>();
@@ -142,20 +147,13 @@ public final class MimeTypes implements
private transient XmlRootExtractor xmlRootExtractor = null;
public MimeTypes() {
- rootMimeType = new MimeType(this, MediaType.OCTET_STREAM);
- textMimeType = new MimeType(this, MediaType.TEXT_PLAIN);
- xmlMimeType = new MimeType(this, MediaType.APPLICATION_XML);
+ rootMimeType = new MimeType(MediaType.OCTET_STREAM);
+ textMimeType = new MimeType(MediaType.TEXT_PLAIN);
+ xmlMimeType = new MimeType(MediaType.APPLICATION_XML);
- try {
- textMimeType.setSuperType(rootMimeType);
- xmlMimeType.setSuperType(rootMimeType);
- } catch (MimeTypeException e) {
- throw new IllegalStateException("Error in MimeType logic", e);
- }
-
- types.put(rootMimeType.getType(), rootMimeType);
- types.put(textMimeType.getType(), textMimeType);
- types.put(xmlMimeType.getType(), xmlMimeType);
+ add(rootMimeType);
+ add(textMimeType);
+ add(xmlMimeType);
}
/**
@@ -411,16 +409,10 @@ public final class MimeTypes implements
throws MimeTypeException {
MediaType type = MediaType.parse(name);
if (type != null) {
- MimeType mime = types.get(type);
+ MimeType mime = types.get(registry.normalize(type));
if (mime == null) {
- mime = new MimeType(this, type);
- if ("text".equals(type.getType())) {
- mime.setSuperType(textMimeType);
- } else if (type.getSubtype().endsWith("+xml")) {
- mime.setSuperType(xmlMimeType);
- } else {
- mime.setSuperType(rootMimeType);
- }
+ mime = new MimeType(type);
+ add(mime);
types.put(type, mime);
}
return mime;
@@ -429,22 +421,19 @@ public final class MimeTypes implements
}
}
+ public synchronized void setSuperType(MimeType type, MediaType parent) {
+ registry.addSuperType(type.getType(), parent);
+ }
+
/**
* Adds an alias for the given media type. This method should only
* be called from {@link MimeType#addAlias(String)}.
*
* @param type media type
* @param alias media type alias (normalized to lower case)
- * @throws MimeTypeException if the alias already exists
*/
- synchronized void addAlias(MimeType type, MediaType alias)
- throws MimeTypeException {
- if (!types.containsKey(alias)) {
- types.put(alias, type);
- } else {
- throw new MimeTypeException(
- "Media type alias already exists: " + alias);
- }
+ synchronized void addAlias(MimeType type, MediaType alias) {
+ registry.addAlias(type.getType(), alias);
}
/**
@@ -486,6 +475,10 @@ public final class MimeTypes implements
patterns.add(pattern, isRegex, type);
}
+ public MediaTypeRegistry getMediaTypeRegistry() {
+ return registry;
+ }
+
/**
* Return the minimum length of data to provide to analyzing methods based
* on the document's content in order to check all the known MimeTypes.
@@ -507,6 +500,9 @@ public final class MimeTypes implements
* is the mime-type to add.
*/
void add(MimeType type) {
+ registry.addType(type.getType());
+ types.put(type.getType(), type);
+
// Update the magics index...
if (type.hasMagic()) {
magics.addAll(Arrays.asList(type.getMagics()));
@@ -533,14 +529,14 @@ public final class MimeTypes implements
*/
public MediaType detect(InputStream input, Metadata metadata)
throws IOException {
- MimeType type = rootMimeType;
+ MediaType type = MediaType.OCTET_STREAM;
// Get type based on magic prefix
if (input != null) {
input.mark(getMinLength());
try {
byte[] prefix = readMagicHeader(input);
- type = getMimeType(prefix);
+ type = getMimeType(prefix).getType();
} finally {
input.reset();
}
@@ -566,8 +562,8 @@ public final class MimeTypes implements
}
if (name != null) {
- MimeType hint = getMimeType(name);
- if (hint.isDescendantOf(type)) {
+ MediaType hint = getMimeType(name).getType();
+ if (registry.isSpecializationOf(hint, type)) {
type = hint;
}
}
@@ -577,8 +573,8 @@ public final class MimeTypes implements
String typeName = metadata.get(Metadata.CONTENT_TYPE);
if (typeName != null) {
try {
- MimeType hint = forName(typeName);
- if (hint.isDescendantOf(type)) {
+ MediaType hint = forName(typeName).getType();
+ if (registry.isSpecializationOf(hint, type)) {
type = hint;
}
} catch (MimeTypeException e) {
@@ -586,7 +582,7 @@ public final class MimeTypes implements
}
}
- return MediaType.parse(type.getName());
+ return type;
}
}
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java?rev=955959&r1=955958&r2=955959&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java Fri Jun 18 12:03:08 2010
@@ -154,7 +154,7 @@ final class MimeTypesReader implements M
String alias = nodeElement.getAttribute(ALIAS_TYPE_ATTR);
MediaType aliasType = MediaType.parse(alias);
if (aliasType != null) {
- type.addAlias(aliasType);
+ types.addAlias(type, aliasType);
} else {
throw new MimeTypeException(
"Invalid media type alias: " + alias);
@@ -163,7 +163,7 @@ final class MimeTypesReader implements M
readRootXML(nodeElement, type);
} else if (nodeElement.getTagName().equals(SUB_CLASS_OF_TAG)) {
String parent = nodeElement.getAttribute(SUB_CLASS_TYPE_ATTR);
- type.setSuperType(types.forName(parent));
+ types.setSuperType(type, MediaType.parse(parent));
}
}
}
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Patterns.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Patterns.java?rev=955959&r1=955958&r2=955959&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Patterns.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/Patterns.java Fri Jun 18 12:03:08 2010
@@ -33,6 +33,8 @@ class Patterns implements Serializable {
*/
private static final long serialVersionUID = -5778015347278111140L;
+ private final MediaTypeRegistry registry;
+
/**
* Index of exact name patterns.
*/
@@ -72,6 +74,10 @@ class Patterns implements Serializable {
}
+ public Patterns(MediaTypeRegistry registry) {
+ this.registry = registry;
+ }
+
public void add(String pattern, MimeType type) throws MimeTypeException {
this.add(pattern, false, type);
}
@@ -103,9 +109,11 @@ class Patterns implements Serializable {
private void addName(String name, MimeType type) throws MimeTypeException {
MimeType previous = names.get(name);
- if (previous == null || previous.isDescendantOf(type)) {
+ if (previous == null
+ || registry.isSpecializationOf(previous.getType(), type.getType())) {
names.put(name, type);
- } else if (previous == type || type.isDescendantOf(previous)) {
+ } else if (previous == type
+ || registry.isSpecializationOf(type.getType(), previous.getType())) {
// do nothing
} else {
throw new MimeTypeException("Conflicting name pattern: " + name);
@@ -115,12 +123,14 @@ class Patterns implements Serializable {
private void addExtension(String extension, MimeType type)
throws MimeTypeException {
MimeType previous = extensions.get(extension);
- if (previous == null || previous.isDescendantOf(type)) {
+ if (previous == null
+ || registry.isSpecializationOf(previous.getType(), type.getType())) {
extensions.put(extension, type);
int length = extension.length();
minExtensionLength = Math.min(minExtensionLength, length);
maxExtensionLength = Math.max(maxExtensionLength, length);
- } else if (previous == type || type.isDescendantOf(previous)) {
+ } else if (previous == type
+ || registry.isSpecializationOf(type.getType(), previous.getType())) {
// do nothing
} else {
throw new MimeTypeException(
@@ -131,9 +141,11 @@ class Patterns implements Serializable {
private void addGlob(String glob, MimeType type)
throws MimeTypeException {
MimeType previous = globs.get(glob);
- if (previous == null || previous.isDescendantOf(type)) {
+ if (previous == null
+ || registry.isSpecializationOf(previous.getType(), type.getType())) {
globs.put(glob, type);
- } else if (previous == type || type.isDescendantOf(previous)) {
+ } else if (previous == type
+ || registry.isSpecializationOf(type.getType(), previous.getType())) {
// do nothing
} else {
throw new MimeTypeException("Conflicting glob pattern: " + glob);
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=955959&r1=955958&r2=955959&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java Fri Jun 18 12:03:08 2010
@@ -33,6 +33,11 @@ import org.xml.sax.SAXException;
public class AutoDetectParser extends CompositeParser {
/**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = 6110455808615143122L;
+
+ /**
* The type detector used by this parser to auto-detect the type
* of a document.
*/
@@ -47,7 +52,7 @@ public class AutoDetectParser extends Co
}
public AutoDetectParser(Detector detector) {
- setParsers(TikaConfig.getDefaultConfig().getParsers());
+ this(TikaConfig.getDefaultConfig());
setDetector(detector);
}
@@ -58,6 +63,7 @@ public class AutoDetectParser extends Co
public void setConfig(TikaConfig config) {
setParsers(config.getParsers());
setDetector(config.getMimeRepository());
+ setMediaTypeRegistry(config.getMediaTypeRegistry());
}
/**
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java?rev=955959&r1=955958&r2=955959&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java Fri Jun 18 12:03:08 2010
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -26,6 +26,7 @@ import org.apache.tika.exception.TikaExc
import org.apache.tika.io.TaggedInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.sax.TaggedContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -39,6 +40,16 @@ import org.xml.sax.SAXException;
public class CompositeParser implements Parser {
/**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = 5613173903360405824L;
+
+ /**
+ * Media type registry.
+ */
+ private MediaTypeRegistry registry = new MediaTypeRegistry();
+
+ /**
* Set of component parsers, keyed by the supported media types.
*/
private Map<MediaType, Parser> parsers = new HashMap<MediaType, Parser>();
@@ -49,6 +60,26 @@ public class CompositeParser implements
private Parser fallback = new EmptyParser();
/**
+ * Returns the media type registry used to infer type relationships.
+ *
+ * @since Apache Tika 0.8
+ * @return media type registry
+ */
+ public MediaTypeRegistry getMediaTypeRegistry() {
+ return registry;
+ }
+
+ /**
+ * Sets the media type registry used to infer type relationships.
+ *
+ * @since Apache Tika 0.8
+ * @param registry media type registry
+ */
+ public void setMediaTypeRegistry(MediaTypeRegistry registry) {
+ this.registry = registry;
+ }
+
+ /**
* Returns the component parsers.
*
* @return component parsers, keyed by media type
@@ -110,7 +141,7 @@ public class CompositeParser implements
} else {
for (MediaType parserType : parsers.keySet()) {
if (parserType != null
- && type.isSpecializationOf(parserType)) {
+ && registry.isSpecializationOf(type, parserType)) {
return parsers.get(parserType);
}
}
Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java?rev=955959&r1=955958&r2=955959&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java Fri Jun 18 12:03:08 2010
@@ -29,12 +29,14 @@ public class MimeDetectionTest extends T
private MimeTypes mimeTypes;
+ private MediaTypeRegistry registry;
+
/** @inheritDoc */
@Override
protected void setUp() throws Exception {
super.setUp();
this.mimeTypes = TikaConfig.getDefaultConfig().getMimeRepository();
- //this.mimeTypes = MimeTypesFactory.create("/org/apache/tika/mime/tika-mimetypes-minimal.xml");
+ this.registry = mimeTypes.getMediaTypeRegistry();
}
public void testDetection() throws Exception {
@@ -74,13 +76,38 @@ public class MimeDetectionTest extends T
new Metadata()));
}
- public void testAutosetSupertype() throws MimeTypeException {
- MimeTypes types = new MimeTypes();
- MimeType type = types.forName("application/something+xml");
- assertEquals("application/xml", type.getSuperType().getName());
-
- type = types.forName("text/something");
- assertEquals("text/plain", type.getSuperType().getName());
+ public void testSuperTypes() {
+ assertTrue(registry.isSpecializationOf(
+ MediaType.parse("text/something; charset=UTF-8"),
+ MediaType.parse("text/something")));
+
+ assertTrue(registry.isSpecializationOf(
+ MediaType.parse("text/something; charset=UTF-8"),
+ MediaType.TEXT_PLAIN));
+
+ assertTrue(registry.isSpecializationOf(
+ MediaType.parse("text/something; charset=UTF-8"),
+ MediaType.OCTET_STREAM));
+
+ assertTrue(registry.isSpecializationOf(
+ MediaType.parse("text/something"),
+ MediaType.TEXT_PLAIN));
+
+ assertTrue(registry.isSpecializationOf(
+ MediaType.parse("application/something+xml"),
+ MediaType.APPLICATION_XML));
+
+ assertTrue(registry.isSpecializationOf(
+ MediaType.parse("application/something+zip"),
+ MediaType.APPLICATION_ZIP));
+
+ assertTrue(registry.isSpecializationOf(
+ MediaType.APPLICATION_XML,
+ MediaType.TEXT_PLAIN));
+
+ assertTrue(registry.isSpecializationOf(
+ MediaType.parse("application/vnd.apple.keynote"),
+ MediaType.APPLICATION_ZIP));
}
private void testUrl(String expected, String url, String file) throws IOException{
@@ -92,7 +119,7 @@ public class MimeDetectionTest extends T
InputStream in = getClass().getResourceAsStream(filename);
testStream(expected, filename, in);
}
-
+
private void testStream(String expected, String urlOrFileName, InputStream in) throws IOException{
assertNotNull("Test stream: ["+urlOrFileName+"] is null!", in);
if (!in.markSupported()) {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java?rev=955959&r1=955958&r2=955959&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypeTest.java Fri Jun 18 12:03:08 2010
@@ -30,18 +30,9 @@ public class MimeTypeTest extends TestCa
/** Test MimeType constructor */
public void testConstrctor() {
-
- // Missing registry
- try {
- new MimeType(null, MediaType.TEXT_PLAIN);
- fail("Expected IllegalArgumentException");
- } catch (IllegalArgumentException e) {
- // expected result
- }
-
// Missing name
try {
- new MimeType(types, null);
+ new MimeType(null);
fail("Expected IllegalArgumentException");
} catch (IllegalArgumentException e) {
// expected result
@@ -81,14 +72,4 @@ public class MimeTypeTest extends TestCa
}
}
- /** Test MimeType setSuperType() */
- public void testSetSuperType() throws MimeTypeException {
- try {
- text.setSuperType(null);
- fail("Expected IllegalArgumentException");
- } catch (IllegalArgumentException e) {
- // expected result
- }
- }
-
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java?rev=955959&r1=955958&r2=955959&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/MimeTypesTest.java Fri Jun 18 12:03:08 2010
@@ -16,6 +16,9 @@
*/
package org.apache.tika.mime;
+import static org.apache.tika.mime.MediaType.OCTET_STREAM;
+import static org.apache.tika.mime.MediaType.TEXT_PLAIN;
+
import java.io.IOException;
import java.io.InputStream;
@@ -25,6 +28,8 @@ public class MimeTypesTest extends TestC
private MimeTypes types;
+ private MediaTypeRegistry registry;
+
private MimeType binary;
private MimeType text;
@@ -33,11 +38,12 @@ public class MimeTypesTest extends TestC
protected void setUp() throws MimeTypeException {
types = new MimeTypes();
+ registry = types.getMediaTypeRegistry();
binary = types.forName("application/octet-stream");
text = types.forName("text/plain");
- text.addAlias(MediaType.parse("text/x-plain"));
+ types.addAlias(text, MediaType.parse("text/x-plain"));
html = types.forName("text/html");
- html.setSuperType(text);
+ types.setSuperType(html, TEXT_PLAIN);
}
public void testForName() throws MimeTypeException {
@@ -53,52 +59,38 @@ public class MimeTypesTest extends TestC
}
public void testSuperType() throws MimeTypeException {
- assertNull(binary.getSuperType());
- assertEquals(binary, text.getSuperType());
- assertEquals(text, html.getSuperType());
+ assertNull(registry.getSuperType(OCTET_STREAM));
+ assertEquals(OCTET_STREAM, registry.getSuperType(TEXT_PLAIN));
+ assertEquals(TEXT_PLAIN, registry.getSuperType(html.getType()));
}
public void testIsDescendantOf() {
- assertFalse(binary.isDescendantOf(binary));
- assertFalse(text.isDescendantOf(text));
- assertFalse(html.isDescendantOf(html));
-
- assertTrue(text.isDescendantOf(binary));
- assertFalse(binary.isDescendantOf(text));
-
- assertTrue(html.isDescendantOf(binary));
- assertFalse(binary.isDescendantOf(html));
+ assertFalse(registry.isSpecializationOf(OCTET_STREAM, OCTET_STREAM));
+ assertFalse(registry.isSpecializationOf(TEXT_PLAIN, TEXT_PLAIN));
+ assertFalse(registry.isSpecializationOf(html.getType(), html.getType()));
- assertTrue(html.isDescendantOf(text));
- assertFalse(text.isDescendantOf(html));
+ assertTrue(registry.isSpecializationOf(html.getType(), OCTET_STREAM));
+ assertFalse(registry.isSpecializationOf(OCTET_STREAM, html.getType()));
- try {
- binary.isDescendantOf(null);
- fail("Expected IllegalArgumentException");
- } catch (IllegalArgumentException e) {
- // expected result
- }
+ assertTrue(registry.isSpecializationOf(html.getType(), TEXT_PLAIN));
+ assertFalse(registry.isSpecializationOf(TEXT_PLAIN, html.getType()));
+
+ assertTrue(registry.isSpecializationOf(TEXT_PLAIN, OCTET_STREAM));
+ assertFalse(registry.isSpecializationOf(OCTET_STREAM, TEXT_PLAIN));
}
public void testCompareTo() {
assertTrue(binary.compareTo(binary) == 0);
- assertTrue(binary.compareTo(text) < 0);
- assertTrue(binary.compareTo(html) < 0);
+ assertTrue(binary.compareTo(text) != 0);
+ assertTrue(binary.compareTo(html) != 0);
- assertTrue(text.compareTo(binary) > 0);
+ assertTrue(text.compareTo(binary) != 0);
assertTrue(text.compareTo(text) == 0);
- assertTrue(text.compareTo(html) < 0);
+ assertTrue(text.compareTo(html) != 0);
- assertTrue(html.compareTo(binary) > 0);
- assertTrue(html.compareTo(text) > 0);
+ assertTrue(html.compareTo(binary) != 0);
+ assertTrue(html.compareTo(text) != 0);
assertTrue(html.compareTo(html) == 0);
-
- try {
- binary.compareTo(null);
- fail("Expected IllegalArgumentException");
- } catch (IllegalArgumentException e) {
- // expected result
- }
}
/** Test getMimeType(byte[]) */
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/PatternsTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/PatternsTest.java?rev=955959&r1=955958&r2=955959&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/PatternsTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/PatternsTest.java Fri Jun 18 12:03:08 2010
@@ -16,9 +16,6 @@
*/
package org.apache.tika.mime;
-import java.io.IOException;
-import java.io.InputStream;
-
import junit.framework.TestCase;
/**
@@ -33,7 +30,7 @@ public class PatternsTest extends TestCa
private MimeType text;
protected void setUp() throws MimeTypeException {
- patterns = new Patterns();
+ patterns = new Patterns(new MediaTypeRegistry());
types = new MimeTypes();
text = types.forName("text/plain");
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=955959&r1=955958&r2=955959&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/mime/TestMimeTypes.java Fri Jun 18 12:03:08 2010
@@ -257,7 +257,7 @@ public class TestMimeTypes extends TestC
* @since TIKA-194
*/
public void testJavaRegex() throws Exception{
- MimeType testType = new MimeType(this.repo, MediaType.parse("foo/bar"));
+ MimeType testType = new MimeType(MediaType.parse("foo/bar"));
this.repo.add(testType);
assertNotNull(repo.forName("foo/bar"));
String pattern = "rtg_sst_grb_0\\.5\\.\\d{8}";
@@ -265,8 +265,8 @@ public class TestMimeTypes extends TestC
String testFileName = "rtg_sst_grb_0.5.12345678";
assertNotNull(this.repo.getMimeType(testFileName));
assertEquals(this.repo.getMimeType(testFileName).getName(), "foo/bar");
-
- MimeType testType2 = new MimeType(this.repo, MediaType.parse("foo/bar2"));
+
+ MimeType testType2 = new MimeType(MediaType.parse("foo/bar2"));
this.repo.add(testType2);
assertNotNull(repo.forName("foo/bar2"));
this.repo.addPattern(testType2, pattern, false);