You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2007/11/04 10:12:32 UTC

svn commit: r591743 - in /incubator/tika/trunk/src: main/java/org/apache/tika/mime/ main/java/org/apache/tika/parser/ main/resources/mime/ test/java/org/apache/tika/mime/

Author: jukka
Date: Sun Nov  4 01:12:30 2007
New Revision: 591743

URL: http://svn.apache.org/viewvc?rev=591743&view=rev
Log:
TIKA-87 - MimeTypes should allow modification of MIME types
    - MimeType.addAlias(String) can now be used to add new aliases
    - MimeType.addPattern(String) can now be used to add new patterns
    - MimeTypes.forName(String) validates the name
    - MimeTypes.forName(String) creates and registers the type if needed
    - Simplified type name handling and validation
    - New test cases

Added:
    incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypeTest.java   (with props)
    incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java   (with props)
Modified:
    incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java
    incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
    incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
    incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
    incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java?rev=591743&r1=591742&r2=591743&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java Sun Nov  4 01:12:30 2007
@@ -18,53 +18,82 @@
 
 // JDK imports
 import java.util.ArrayList;
-import java.util.Collections;
+import java.util.SortedSet;
+import java.util.TreeSet;
 import java.util.regex.Pattern;
 
 import org.apache.tika.utils.StringUtil;
 
 /**
- * Defines a Mime Content Type.
- * 
- * 
+ * Internet media type.
  */
 public final class MimeType implements Comparable<MimeType> {
 
-    /** The primary and sub types separator */
-    private final static String SEPARATOR = "/";
-
-    /** The parameters separator */
-    private final static String PARAMS_SEP = ";";
-
-    /** Special characters not allowed in content types. */
-    private final static String SPECIALS = "()<>@,;:\\\"/[]?=";
-
-    /** The Mime-Type full name */
-    private String name = null;
+    /**
+     * Checks that the given string is a valid Internet media type name
+     * based on rules from RFC 2045 section 5.1. For validation purposes the
+     * rules can be simplified to the following:
+     * <pre>
+     * name := token "/" token
+     * token := 1*&lt;any (US-ASCII) CHAR except SPACE, CTLs, or tspecials&gt;
+     * tspecials :=  "(" / ")" / "&lt;" / "&gt;" / "@" / "," / ";" / ":" /
+     *               "\" / &lt;"&gt; / "/" / "[" / "]" / "?" / "="
+     * </pre>
+     *
+     * @param name name string
+     * @return <code>true</code> if the string is a valid media type name,
+     *         <code>false</code> otherwise
+     */
+    public static boolean isValid(String name) {
+        assert name != null;
+
+        boolean slash = false;
+        for (int i = 0; i < name.length(); i++) {
+            char ch = name.charAt(i);
+            if (ch <= ' ' || ch >= 127 || ch == '(' || ch == ')' ||
+                    ch == '<' || ch == '>' || ch == '@' || ch == ',' ||
+                    ch == ';' || ch == ':' || ch == '\\' || ch == '"' ||
+                    ch == '[' || ch == ']' || ch == '?' || ch == '=') {
+                return false;
+            } else if (ch == '/') {
+                if (slash || i == 0 || i + 1 == name.length()) {
+                    return false;
+                }
+                slash = true;
+            }
+        }
+        return slash;
+    }
 
-    /** The Mime-Type primary type */
-    private String primary = null;
+    /**
+     * The media type registry that contains this type.
+     */
+    private final MimeTypes registry;
 
-    /** The Mime-Type sub type */
-    private String sub = null;
+    /**
+     * Lower case name of this media type.
+     */
+    private final String name;
 
     /** The Mime-Type description */
     private String description = null;
 
     /** The Mime-Type associated recognition patterns */
-    private Patterns patterns = null;
+    private final Patterns patterns = new Patterns();
 
     /** The magics associated to this Mime-Type */
-    private ArrayList<Magic> magics = null;
+    private final ArrayList<Magic> magics = new ArrayList<Magic>();
 
-    /** The aliases Mime-Types for this one */
-    private ArrayList<String> aliases = null;
+    /**
+     * Lower case alias names of this media type.
+     */
+    private final SortedSet<String> aliases = new TreeSet<String>();
 
     /** The root-XML associated to this Mime-Type */
-    private ArrayList<RootXML> rootXML = null;
+    private final ArrayList<RootXML> rootXML = new ArrayList<RootXML>();
 
     /** The sub-class-of associated to this Mime-Type */
-    private ArrayList<String> superTypes = null;
+    private final ArrayList<String> superTypes = new ArrayList<String>();
 
     /** The mime-type level (regarding its subTypes) */
     private int level = 0;
@@ -73,167 +102,32 @@
     private int minLength = 0;
 
     /**
-     * Creates a MimeType from a String.
-     * 
-     * @param name
-     *            the MIME content type String.
-     */
-    public MimeType(String name) throws MimeTypeException {
-
-        if (name == null || name.length() <= 0) {
-            throw new MimeTypeException("The type can not be null or empty");
-        }
-
-        // Split the two parts of the Mime Content Type
-        String[] parts = name.split(SEPARATOR, 2);
-
-        // Checks validity of the parts
-        if (parts.length != 2) {
-            throw new MimeTypeException("Invalid Content Type " + name);
-        }
-        init(parts[0], parts[1]);
+     * Creates a media type with the given name and containing media type
+     * registry. The name is expected to be valid and normalized to lower
+     * case. This constructor should only be called by
+     * {@link MimeTypes#forName(String)} to keep the media type mapping
+     * up to date.
+     *
+     * @param registry the media type registry that contains this type
+     * @param name media type name
+     */
+    MimeType(MimeTypes registry, String name) {
+        assert registry != null;
+        assert isValid(name) && name.equals(name.toLowerCase());
+        this.registry = registry;
+        this.name = name;
     }
 
     /**
-     * Creates a MimeType with the given primary type and sub type.
+     * Returns the name of this Internet media type.
      * 
-     * @param primary
-     *            the content type primary type.
-     * @param sub
-     *            the content type sub type.
-     */
-    public MimeType(String primary, String sub) throws MimeTypeException {
-        init(primary, sub);
-    }
-
-    /** Init method used by constructors. */
-    private void init(String primary, String sub) throws MimeTypeException {
-
-        // Preliminary checks...
-        if ((primary == null) || (primary.length() <= 0) || (!isValid(primary))) {
-            throw new MimeTypeException("Invalid Primary Type " + primary);
-        }
-        // Remove optional parameters from the sub type
-        String clearedSub = null;
-        if (sub != null) {
-            clearedSub = sub.split(PARAMS_SEP)[0];
-        }
-        if ((clearedSub == null) || (clearedSub.length() <= 0)
-                || (!isValid(clearedSub))) {
-            throw new MimeTypeException("Invalid Sub Type " + clearedSub);
-        }
-
-        // All is ok, assign values
-        this.primary = primary.toLowerCase().trim();
-        this.sub = clearedSub.toLowerCase().trim();
-        this.name = this.primary + SEPARATOR + this.sub;
-        this.patterns = new Patterns();
-        this.magics = new ArrayList<Magic>();
-        this.aliases = new ArrayList<String>();
-        this.rootXML = new ArrayList<RootXML>();
-        this.superTypes = new ArrayList<String>();
-    }
-
-    /**
-     * Cleans a content-type. This method cleans a content-type by removing its
-     * optional parameters and returning only its
-     * <code>primary-type/sub-type</code>.
-     * 
-     * @param type
-     *            is the content-type to clean.
-     * @return the cleaned version of the specified content-type.
-     * @throws MimeTypeException
-     *             if something wrong occurs during the parsing/cleaning of the
-     *             specified type.
-     */
-    public final static String clean(String type) throws MimeTypeException {
-        return (new MimeType(type)).getName();
-    }
-
-    /**
-     * Return the name of this mime-type.
-     * 
-     * @return the name of this mime-type.
+     * @return media type name (lower case)
      */
     public String getName() {
         return name;
     }
 
     /**
-     * Return the primary type of this mime-type.
-     * 
-     * @return the primary type of this mime-type.
-     */
-    public String getPrimaryType() {
-        return primary;
-    }
-
-    /**
-     * Return the sub type of this mime-type.
-     * 
-     * @return the sub type of this mime-type.
-     */
-    public String getSubType() {
-        return sub;
-    }
-
-    // Inherited Javadoc
-    public String toString() {
-        StringBuffer buf = new StringBuffer();
-        buf.append(name).append(" -- ").append(getDescription()).append("\n")
-                .append("Aliases: ");
-        if (aliases.size() < 1) {
-            buf.append(" NONE");
-        }
-        buf.append("\n");
-        for (int i = 0; i < aliases.size(); i++) {
-            buf.append("\t").append(aliases.get(i)).append("\n");
-        }
-        buf.append("Patterns:");
-        String[] patterns = this.patterns.getPatterns();
-        if (patterns.length < 1) {
-            buf.append(" NONE");
-        }
-        buf.append("\n");
-        for (int i = 0; i < patterns.length; i++) {
-            buf.append("\t").append(patterns[i]).append("\n");
-        }
-        buf.append("Magics:  ");
-        if (magics.size() < 1) {
-            buf.append(" NONE");
-        }
-        buf.append("\n");
-        for (int i = 0; i < magics.size(); i++) {
-            buf.append("\t").append(magics.get(i)).append("\n");
-        }
-
-        return buf.toString();
-    }
-
-    /**
-     * Indicates if an object is equal to this mime-type. The specified object
-     * is equal to this mime-type if it is not null, and it is an instance of
-     * MimeType and its name is equals to this mime-type.
-     * 
-     * @param object
-     *            the reference object with which to compare.
-     * @return <code>true</code> if this mime-type is equal to the object
-     *         argument; <code>false</code> otherwise.
-     */
-    public boolean equals(Object object) {
-        try {
-            return ((MimeType) object).getName().equals(this.name);
-        } catch (Exception e) {
-            return false;
-        }
-    }
-
-    // Inherited Javadoc
-    public int hashCode() {
-        return name.hashCode();
-    }
-
-    /**
      * Return the description of this mime-type.
      * 
      * @return the description of this mime-type.
@@ -253,32 +147,51 @@
     }
 
     /**
-     * Add a supported file-naming pattern.
-     * 
-     * @param pattern
-     *            to add to the list of recognition pattern for this mime-type.
+     * Adds a file name pattern for this media type.
+     *
+     * @param pattern file name pattern
      */
-    void addPattern(String pattern) {
+    public synchronized void addPattern(String pattern) {
+        registry.addPattern(this, pattern);
         patterns.add(pattern, this);
     }
 
     /**
-     * Return the recogition patterns for this mime-type
+     * Returns the file name patterns for this media type.
      * 
-     * @return the recoginition patterns associated to this mime-type.
+     * @return file name patterns
      */
-    String[] getPatterns() {
+    public synchronized String[] getPatterns() {
         return patterns.getPatterns();
     }
 
     /**
-     * Add an alias to this mime-type
-     * 
-     * @param alias
-     *            to add to this mime-type.
-     */
-    void addAlias(String alias) {
-        aliases.add(alias);
+     * Returns the aliases of this media type. The returned set is
+     * newly allocated and can be freely modified by the client.
+     *
+     * @return media type aliases
+     */
+    public synchronized SortedSet<String> getAliases() {
+        return new TreeSet<String>(aliases);
+    }
+
+    /**
+     * Adds an alias name for this media type.
+     *
+     * @param alias media type alias (case insensitive)
+     * @throws MimeTypeException if the alias is invalid
+     *                           or already registered for another media type
+     */
+    public synchronized void addAlias(String alias) throws MimeTypeException {
+        if (isValid(alias)) {
+            alias = alias.toLowerCase();
+            if (!name.equals(alias) && !aliases.contains(alias)) {
+                registry.addAlias(this, alias);
+                aliases.add(alias);
+            }
+        } else {
+            throw new MimeTypeException("Invalid media type alias: " + alias);
+        }
     }
 
     /**
@@ -336,15 +249,6 @@
         this.level = level;
     }
 
-    /**
-     * Return the recogition patterns for this mime-type
-     * 
-     * @return the recoginition patterns associated to this mime-type.
-     */
-    public String[] getAliases() {
-        return aliases.toArray(new String[aliases.size()]);
-    }
-
     Magic[] getMagics() {
         return magics.toArray(new Magic[magics.size()]);
     }
@@ -382,26 +286,6 @@
         return matchesXML(data) || matchesMagic(data);
     }
 
-    /** Checks if the specified primary or sub type is valid. */
-    private boolean isValid(String type) {
-        return (type != null) && (type.trim().length() > 0)
-                && !hasCtrlOrSpecials(type);
-    }
-
-    /** Checks if the specified string contains some special characters. */
-    private boolean hasCtrlOrSpecials(String type) {
-        int len = type.length();
-        int i = 0;
-        while (i < len) {
-            char c = type.charAt(i);
-            if (c <= '\032' || SPECIALS.indexOf(c) > 0) {
-                return true;
-            }
-            i++;
-        }
-        return false;
-    }
-
     /**
      * Defines a RootXML description. RootXML is made of a localName and/or a
      * namespaceURI.
@@ -468,12 +352,25 @@
         }
     }
 
+    //----------------------------------------------------------< Comparable >
+
     public int compareTo(MimeType o) {
         int diff = level - o.level;
         if (diff == 0) {
             diff = name.compareTo(o.name);
         }
         return diff;
+    }
+
+    //--------------------------------------------------------------< Object >
+
+    /**
+     * Returns the name of this Internet media type.
+     *
+     * @return media type name
+     */
+    public String toString() {
+        return name;
     }
 
 }

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=591743&r1=591742&r2=591743&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java Sun Nov  4 01:12:30 2007
@@ -48,8 +48,10 @@
     /** The default <code>application/octet-stream</code> MimeType */
     public final static String DEFAULT = "application/octet-stream";
 
+    private final MimeType root;
+
     /** All the registered MimeTypes indexed on their name */
-    private Map<String, MimeType> types = new HashMap<String, MimeType>();
+    private final Map<String, MimeType> types = new HashMap<String, MimeType>();
 
     /** The patterns matcher */
     private Patterns patterns = new Patterns();
@@ -66,6 +68,11 @@
     /** The minimum length of data to provide to check all MimeTypes */
     private int minLength = 0;
 
+    public MimeTypes() {
+        root = new MimeType(this, DEFAULT);
+        types.put(root.getName(), root);
+    }
+
     /**
      * Find the Mime Content Type of a file.
      * 
@@ -103,7 +110,7 @@
         if (type != null)
             return type;
         // if it's null here, then return the default type
-        return forName(DEFAULT);
+        return root;
     }
 
     /**
@@ -183,30 +190,21 @@
     }
 
     public String getType(String typeName, String url, byte[] data) {
-        MimeType type = null;
-        try {
-            typeName = MimeType.clean(typeName);
-            type = typeName == null ? null : forName(typeName);
-        } catch (MimeTypeException mte) {
-            // Seems to be a malformed mime type name...
-        }
-
-        if (typeName == null || type == null || !type.matches(url)) {
-            // If no mime-type header, or cannot find a corresponding registered
-            // mime-type, or the one found doesn't match the url pattern
-            // it shouldbe, then guess a mime-type from the url pattern
-            type = getMimeType(url);
-            typeName = type == null ? typeName : type.getName();
-        }
-        // if (typeName == null || type == null ||
-        // (this.magic && type.hasMagic() && !type.matches(data))) {
-        // If no mime-type already found, or the one found doesn't match
-        // the magic bytes it should be, then, guess a mime-type from the
-        // document content (magic bytes)
-        type = getMimeType(data);
-        typeName = type == null ? typeName : type.getName();
-        // }
-        return typeName;
+        MimeType type = getMimeType(url, data);
+
+        if (type == null && typeName != null) {
+            try {
+                type = forName(typeName);
+            } catch (MimeTypeException e) {
+                // Invalid type name hint
+            }
+        }
+
+        if (type == null) {
+            type = root;
+        }
+
+        return type.getName();
     }
 
     /**
@@ -274,15 +272,56 @@
     }
 
     /**
-     * Find a Mime Content Type from its name.
-     * 
-     * @param name
-     *            is the content type name
-     * @return the MimeType for the specified name, or <code>null</code> if no
-     *         MimeType is registered for this name.
+     * Returns the registered media type with the given name (or alias).
+     * The named media type is automatically registered (and returned) if
+     * it doesn't already exist.
+     *
+     * @param name media type name (case-insensitive)
+     * @return the registered media type with the given name or alias
+     * @throws MimeTypeException if the given media type name is invalid
+     */
+    public synchronized MimeType forName(String name)
+            throws MimeTypeException {
+        if (MimeType.isValid(name)) {
+            name = name.toLowerCase();
+            MimeType type = types.get(name);
+            if (type == null) {
+                type = new MimeType(this, name);
+                types.put(name, type);
+            }
+            return type;
+        } else {
+            throw new MimeTypeException("Invalid media type name: " + name);
+        }
+    }
+
+    /**
+     * Adds an alias for the given media type. This method should only
+     * be called from {@link MimeType#addAlias(String)}.
+     *
+     * @param type media type
+     * @param alias media type alias (normalized to lower case)
+     * @throws MimeTypeException if the alias already exists
+     */
+    synchronized void addAlias(MimeType type, String alias)
+            throws MimeTypeException {
+        if (!types.containsKey(alias)) {
+            types.put(alias, type);
+        } else {
+            throw new MimeTypeException(
+                    "Media type alias already exists: " + alias);
+        }
+    }
+
+    /**
+     * Adds a file name pattern for the given media type. This method should
+     * only be called from {@link MimeType#addPattern(String)}.
+     *
+     * @param type media type
+     * @param pattern file name pattern
      */
-    public MimeType forName(String name) {
-        return types.get(name);
+    synchronized void addPattern(MimeType type, String pattern) {
+        patterns.add(pattern, type);
     }
 
     /**
@@ -351,8 +390,6 @@
 
         // Update minLentgth
         minLength = Math.max(minLength, type.getMinLength());
-        // Update the extensions index...
-        patterns.add(type.getPatterns(), type);
         // Update the magics index...
         if (type.hasMagic()) {
             magics.addAll(Arrays.asList(type.getMagics()));

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java?rev=591743&r1=591742&r2=591743&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java Sun Nov  4 01:12:30 2007
@@ -130,79 +130,60 @@
     void read(Document document) {
         Element element = document.getDocumentElement();
         if (element != null && element.getTagName().equals("mime-info")) {
-            readMimeInfo(element);
-        }
-    }
-
-    /** Read Element named mime-info. */
-    private MimeType[] readMimeInfo(Element element) {
-        ArrayList<MimeType> types = new ArrayList<MimeType>();
-        NodeList nodes = element.getChildNodes();
-        for (int i = 0; i < nodes.getLength(); i++) {
-            Node node = nodes.item(i);
-            if (node.getNodeType() == Node.ELEMENT_NODE) {
-                Element nodeElement = (Element) node;
-                if (nodeElement.getTagName().equals("mime-type")) {
-                    readMimeType(nodeElement);
+            NodeList nodes = element.getChildNodes();
+            for (int i = 0; i < nodes.getLength(); i++) {
+                Node node = nodes.item(i);
+                if (node.getNodeType() == Node.ELEMENT_NODE) {
+                    Element child = (Element) node;
+                    if (child.getTagName().equals("mime-type")) {
+                        readMimeType(child);
+                    }
                 }
             }
+        } else {
+            logger.warn("Not a <mime-info/> configuration document");
         }
-        return types.toArray(new MimeType[types.size()]);
     }
 
     /** Read Element named mime-type. */
     private void readMimeType(Element element) {
-
-        MimeType type = null;
-
+        String name = element.getAttribute("type");
         try {
-            type = new MimeType(element.getAttribute("type"));
-        } catch (MimeTypeException mte) {
-            // Mime Type not valid... just ignore it
-            if (logger.isInfoEnabled()) {
-                logger.info(mte.toString() + " ... Ignoring!");
-            }
-            return;
-        }
+            MimeType type = types.forName(name);
 
-        NodeList nodes = element.getChildNodes();
-        for (int i = 0; i < nodes.getLength(); i++) {
-            Node node = nodes.item(i);
-            if (node.getNodeType() == Node.ELEMENT_NODE) {
-                Element nodeElement = (Element) node;
-                if (nodeElement.getTagName().equals("_comment")) {
-                    type.setDescription(nodeElement.getFirstChild()
-                            .getNodeValue());
-                } else if (nodeElement.getTagName().equals("glob")) {
-                    readGlob(nodeElement, type);
-                } else if (nodeElement.getTagName().equals("magic")) {
-                    readMagic(nodeElement, type);
-                } else if (nodeElement.getTagName().equals("alias")) {
-                    readAlias(nodeElement, type);
-                } else if (nodeElement.getTagName().equals("root-XML")) {
-                    readRootXML(nodeElement, type);
-                } else if (nodeElement.getTagName().equals("sub-class-of")) {
-                    readSubClassOf(nodeElement, type);
+            NodeList nodes = element.getChildNodes();
+            for (int i = 0; i < nodes.getLength(); i++) {
+                Node node = nodes.item(i);
+                if (node.getNodeType() == Node.ELEMENT_NODE) {
+                    Element nodeElement = (Element) node;
+                    if (nodeElement.getTagName().equals("_comment")) {
+                        type.setDescription(
+                                nodeElement.getFirstChild().getNodeValue());
+                    } else if (nodeElement.getTagName().equals("glob")) {
+                        type.addPattern(nodeElement.getAttribute("pattern"));
+                    } else if (nodeElement.getTagName().equals("magic")) {
+                        readMagic(nodeElement, type);
+                    } else if (nodeElement.getTagName().equals("alias")) {
+                        String alias = nodeElement.getAttribute("type");
+                        try {
+                            type.addAlias(alias);
+                        } catch (MimeTypeException e) {
+                            logger.warn("Invalid media type alias: " + alias, e);
+                        }
+                    } else if (nodeElement.getTagName().equals("root-XML")) {
+                        readRootXML(nodeElement, type);
+                    } else if (nodeElement.getTagName().equals("sub-class-of")) {
+                        readSubClassOf(nodeElement, type);
+                    }
                 }
             }
+        } catch (MimeTypeException e) {
+            logger.warn("Invalid media type configuration entry: " + name, e);
         }
-
-        types.add(type);
-    }
-
-    /** Read Element named glob. */
-    private void readGlob(Element element, MimeType type) {
-        type.addPattern(element.getAttribute("pattern"));
-    }
-
-    /** Read Element named alias. */
-    private void readAlias(Element element, MimeType type) {
-        type.addAlias(element.getAttribute("type"));
     }
 
     /** Read Element named magic. */
     private void readMagic(Element element, MimeType mimeType) {
-
         Magic magic = null;
         try {
             magic = new Magic(Integer
@@ -296,83 +277,6 @@
     private void readSubClassOf(Element element, MimeType mimeType) {
 
         mimeType.addSuperType(element.getAttribute("type"));
-    }
-
-    /** Prints the specified node, then prints all of its children. */
-    public static void printDOM(Node node) {
-        int type = node.getNodeType();
-        switch (type) {
-        // print the document element
-        case Node.DOCUMENT_NODE: {
-            System.out.println("&lt;?xml version=\"1.0\" ?>");
-            printDOM(((Document) node).getDocumentElement());
-            break;
-        }
-
-            // print element with attributes
-        case Node.ELEMENT_NODE: {
-            System.out.print("<");
-            System.out.print(node.getNodeName());
-            NamedNodeMap attrs = node.getAttributes();
-            for (int i = 0; i < attrs.getLength(); i++) {
-                Node attr = attrs.item(i);
-                System.out.print(" " + attr.getNodeName().trim() + "=\""
-                        + attr.getNodeValue().trim() + "\"");
-            }
-            System.out.println(">");
-
-            NodeList children = node.getChildNodes();
-            if (children != null) {
-                int len = children.getLength();
-                for (int i = 0; i < len; i++)
-                    printDOM(children.item(i));
-            }
-
-            break;
-        }
-
-            // handle entity reference nodes
-        case Node.ENTITY_REFERENCE_NODE: {
-            System.out.print("&");
-            System.out.print(node.getNodeName().trim());
-            System.out.print(";");
-            break;
-        }
-
-            // print cdata sections
-        case Node.CDATA_SECTION_NODE: {
-            System.out.print("<![CDATA[");
-            System.out.print(node.getNodeValue().trim());
-            System.out.print("]]>");
-            break;
-        }
-
-            // print text
-        case Node.TEXT_NODE: {
-            System.out.print(node.getNodeValue().trim());
-            break;
-        }
-
-            // print processing instruction
-        case Node.PROCESSING_INSTRUCTION_NODE: {
-            System.out.print("<?");
-            System.out.print(node.getNodeName().trim());
-            String data = node.getNodeValue().trim();
-            {
-                System.out.print(" ");
-                System.out.print(data);
-            }
-            System.out.print("?>");
-            break;
-        }
-        }
-
-        if (type == Node.ELEMENT_NODE) {
-            System.out.println();
-            System.out.print("</");
-            System.out.print(node.getNodeName().trim());
-            System.out.print('>');
-        }
     }
 
 }

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=591743&r1=591742&r2=591743&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java Sun Nov  4 01:12:30 2007
@@ -110,7 +110,6 @@
         String typename = metadata.get(Metadata.CONTENT_TYPE);
         if (typename != null) {
             try {
-                typename = MimeType.clean(typename);
                 type = types.forName(typename);
             } catch (MimeTypeException e) {
                 // Malformed type name, ignore
@@ -140,7 +139,11 @@
 
         // Finally, use the default type if no matches found
         if (type == null) {
-            type = types.forName(MimeTypes.DEFAULT);
+            try {
+                type = types.forName(MimeTypes.DEFAULT);
+            } catch (MimeTypeException e) {
+                // Should never happen
+            }
         }
 
         return type;

Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=591743&r1=591742&r2=591743&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Sun Nov  4 01:12:30 2007
@@ -523,7 +523,7 @@
 		</mime-type> -->
 
 	<mime-type type="application/x-ms-dos-executable">
-		<alias type="application/x-dosexec;exe" />
+		<alias type="application/x-dosexec" />
 	</mime-type>
 
 	<mime-type type="application/ogg">

Added: incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypeTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypeTest.java?rev=591743&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypeTest.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypeTest.java Sun Nov  4 01:12:30 2007
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+import junit.framework.TestCase;
+
+public class MimeTypeTest extends TestCase {
+
+    /**
+     * Verifies which type names {@code MimeType.isValid} accepts and
+     * which ones it rejects.
+     */
+    public void testIsValidName() {
+        // Plain "type/subtype" names
+        expectValid("application/octet-stream");
+        expectValid("text/plain");
+        expectValid("foo/bar");
+        expectValid("a/b");
+
+        // Missing type or subtype, or a doubled separator
+        expectInvalid("application");
+        expectInvalid("application/");
+        expectInvalid("/");
+        expectInvalid("/octet-stream");
+        expectInvalid("application//octet-stream");
+
+        // '=' or non-ASCII characters in the name, and trailing parameters
+        expectInvalid("application/octet=stream");
+        expectInvalid("application/\u00f6ctet-stream");
+        expectInvalid("text/plain;");
+        expectInvalid("text/plain; charset=UTF-8");
+    }
+
+    /** Asserts that {@code MimeType.isValid} returns true for the name. */
+    private static void expectValid(String name) {
+        assertTrue(MimeType.isValid(name));
+    }
+
+    /** Asserts that {@code MimeType.isValid} returns false for the name. */
+    private static void expectInvalid(String name) {
+        assertFalse(MimeType.isValid(name));
+    }
+
+}

Propchange: incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypeTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java?rev=591743&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java Sun Nov  4 01:12:30 2007
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for type lookup by name and alias registration in MimeTypes.
+ */
+public class MimeTypesTest extends TestCase {
+
+    /** Registry under test; replaced with a new instance in setUp(). */
+    private MimeTypes types;
+
+    /** Creates a new MimeTypes instance for the test about to run. */
+    protected void setUp() {
+        types = new MimeTypes();
+    }
+
+    /**
+     * Looks up "text/plain" in lower and upper case, expecting the lower-case
+     * name back, and checks that a name without a subtype is rejected.
+     */
+    public void testForName() throws MimeTypeException {
+        assertNotNull(types.forName("text/plain"));
+        assertEquals("text/plain", types.forName("text/plain").getName());
+        assertEquals("text/plain", types.forName("TEXT/PLAIN").getName());
+
+        try {
+            types.forName("invalid");
+            fail("MimeTypeException not thrown on invalid type name");
+        } catch (MimeTypeException e) {
+            // expected
+        }
+    }
+
+    /**
+     * Registers an alias for "text/plain", checks that the alias resolves
+     * to that type, and checks that a malformed alias name is rejected.
+     * The "test" prefix in the method name is required: the JUnit 3 runner
+     * only executes public methods whose names start with it.
+     */
+    public void testAddAlias() throws MimeTypeException {
+        types.addAlias(types.forName("text/plain"), "foo/bar");
+        assertNotNull(types.forName("foo/bar"));
+        assertEquals("text/plain", types.forName("foo/bar").getName());
+
+        try {
+            types.addAlias(types.forName("text/plain"), "invalid");
+            fail("MimeTypeException not thrown on invalid alias name");
+        } catch (MimeTypeException e) {
+            // expected
+        }
+    }
+}

Propchange: incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=591743&r1=591742&r2=591743&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java Sun Nov  4 01:12:30 2007
@@ -68,7 +68,7 @@
         assertEquals(repo.getMimeType("test.pdF"), type);
     }
 
-    public void testLoadMimeTypes() {
+    public void testLoadMimeTypes() throws MimeTypeException {
         assertNotNull(repo.forName("application/octet-stream"));
         assertNotNull(repo.forName("text/x-tex"));
     }
@@ -136,6 +136,5 @@
 
         return type;
     }
-
 
 }