You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2007/11/06 12:00:39 UTC

svn commit: r592371 - in /incubator/tika/trunk/src: main/java/org/apache/tika/mime/ main/java/org/apache/tika/parser/ main/resources/mime/ test/java/org/apache/tika/parser/

Author: jukka
Date: Tue Nov  6 03:00:38 2007
New Revision: 592371

URL: http://svn.apache.org/viewvc?rev=592371&view=rev
Log:
TIKA-87 - MimeTypes should allow modification of MIME types
    - Streamlined pattern handling

Modified:
    incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java
    incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
    incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java
    incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java
    incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
    incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
    incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java?rev=592371&r1=592370&r2=592371&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java Tue Nov  6 03:00:38 2007
@@ -92,9 +92,6 @@
      */
     private final SortedSet<MimeType> subTypes = new TreeSet<MimeType>();
 
-    /** The Mime-Type associated recognition patterns */
-    private final Patterns patterns = new Patterns();
-
     /** The magics associated to this Mime-Type */
     private final ArrayList<Magic> magics = new ArrayList<Magic>();
 
@@ -203,25 +200,6 @@
     }
 
     /**
-     * Adds a file name pattern for this media type.
-     *
-     * @param pattern file name pattern
-     */
-    public void addPattern(String pattern) {
-        registry.addPattern(this, pattern);
-        patterns.add(pattern, this);
-    }
-
-    /**
-     * Returns the file name patterns for this media type.
-     * 
-     * @return file name patterns
-     */
-    public String[] getPatterns() {
-        return patterns.getPatterns();
-    }
-
-    /**
      * Returns the aliases of this media type. The returned set is
      * newly allocated and can be freely modified by the client.
      *
@@ -297,10 +275,6 @@
 
     public boolean hasMagic() {
         return (magics.size() > 0);
-    }
-
-    public boolean matches(String url) {
-        return (patterns.matches(url) == this);
     }
 
     public boolean matchesMagic(byte[] data) {

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=592371&r1=592370&r2=592371&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java Tue Nov  6 03:00:38 2007
@@ -94,15 +94,19 @@
      * 
      * @param name
      *            of the document to analyze.
-     * @return the Mime Content Type of the specified document name, or
-     *         <code>null</code> if none is found.
+     * @return the Mime Content Type of the specified document name
      */
     public MimeType getMimeType(String name) {
-        MimeType type = patterns.matches(name.toLowerCase());
-        if (type != null)
+        MimeType type = patterns.matches(name);
+        if (type != null) {
+            return type;
+        }
+        type = patterns.matches(name.toLowerCase());
+        if (type != null) {
             return type;
-        // if it's null here, then return the default type
-        return root;
+        } else {
+            return root;
+        }
     }
 
     /**
@@ -307,13 +311,14 @@
     }
 
     /**
-     * Adds a file name pattern for the given media type. This method should
-     * only be called from {@link MimeType#addPattern(String)}.
+     * Adds a file name pattern for the given media type.
      *
      * @param type media type
      * @param pattern file name pattern
+     * @throws MimeTypeException if the pattern conflicts with existing ones
      */
-    void addPattern(MimeType type, String pattern) {
+    public void addPattern(MimeType type, String pattern)
+            throws MimeTypeException {
         patterns.add(pattern, type);
     }
 
@@ -328,21 +333,6 @@
     public int getMinLength() {
         return 1024;
         // return minLength;
-    }
-
-    /**
-     * Add the specified mime-types in the repository.
-     * 
-     * @param types
-     *            are the mime-types to add.
-     */
-    void add(MimeType[] types) {
-        if (types == null) {
-            return;
-        }
-        for (int i = 0; i < types.length; i++) {
-            add(types[i]);
-        }
     }
 
     /**

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java?rev=592371&r1=592370&r2=592371&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java Tue Nov  6 03:00:38 2007
@@ -159,7 +159,7 @@
                         type.setDescription(
                                 nodeElement.getFirstChild().getNodeValue());
                     } else if (nodeElement.getTagName().equals("glob")) {
-                        type.addPattern(nodeElement.getAttribute("pattern"));
+                        types.addPattern(type, nodeElement.getAttribute("pattern"));
                     } else if (nodeElement.getTagName().equals("magic")) {
                         readMagic(nodeElement, type);
                     } else if (nodeElement.getTagName().equals("alias")) {

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java?rev=592371&r1=592370&r2=592371&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java Tue Nov  6 03:00:38 2007
@@ -17,80 +17,100 @@
 package org.apache.tika.mime;
 
 // JDK imports
-import java.util.ArrayList;
+import java.util.Comparator;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
 
 /**
  * Defines a MimeType pattern.
  */
 class Patterns {
 
-    private static Map<Character, String> escapeMap =
-        new HashMap<Character, String>();
+    /**
+     * Index of exact name patterns.
+     */
+    private final Map<String, MimeType> names = new HashMap<String, MimeType>();
 
-    static {
-        escapeMap.put('\\', "\\\\");
-        escapeMap.put('?', "\\?");
-        escapeMap.put('[', "\\[");
-        escapeMap.put(']', "\\]");
-        escapeMap.put('^', "\\^");
-        escapeMap.put('.', "\\.");
-        escapeMap.put('-', "\\-");
-        escapeMap.put('$', "\\$");
-        escapeMap.put('+', "\\+");
-        escapeMap.put('(', "\\(");
-        escapeMap.put(')', "\\)");
-        escapeMap.put('{', "\\{");
-        escapeMap.put('}', "\\}");
-        escapeMap.put('|', "\\|");
-        escapeMap.put('*', ".*");
-    }
+    /**
+     * Index of extension patterns of the form "*extension".
+     */
+    private final Map<String, MimeType> extensions =
+        new HashMap<String, MimeType>();
 
-    /** Gathers all the patterns */
-    private ArrayList<String> patterns = new ArrayList<String>();
+    private int minExtensionLength = Integer.MAX_VALUE;
 
-    /** An index of exact matching patterns */
-    private Map<String, MimeType> exactIdx = new HashMap<String, MimeType>();
+    private int maxExtensionLength = 0;
 
-    /** An index of the patterns of the form "*.ext" */
-    private Map<String, MimeType> extIdx = new HashMap<String, MimeType>();
+    /**
+     * Index of generic glob patterns, sorted by length.
+     */
+    private final SortedMap<String, MimeType> globs =
+        new TreeMap<String, MimeType>(new Comparator<String>() {
+            public int compare(String a, String b) {
+                int diff = b.length() - a.length();
+                if (diff == 0) {
+                    diff = a.compareTo(b);
+                }
+                return diff;
+            }
+        });
+
+    public void add(String pattern, MimeType type) throws MimeTypeException {
+        assert pattern != null && type != null;
 
-    /** A list of other patterns */
-    private Map<String, MimeType> others = new HashMap<String, MimeType>();
-
-    void add(String[] patterns, MimeType type) {
-        // Some preliminary checks
-        if ((patterns == null) || (type == null)) {
-            return;
-        }
-        // All is ok, so add the patterns
-        for (String pattern : patterns) {
-            add(pattern, type);
+        if (pattern.indexOf('*') == -1
+                && pattern.indexOf('?') == -1
+                && pattern.indexOf('[') == -1) {
+            addName(pattern, type);
+        } else if (pattern.startsWith("*")
+                && pattern.indexOf('*', 1) == -1
+                && pattern.indexOf('?') == -1
+                && pattern.indexOf('[') == -1) {
+            addExtension(pattern.substring(1), type);
+        } else {
+            addGlob(compile(pattern), type);
         }
     }
 
-    void add(String pattern, MimeType type) {
-        // Some preliminary checks
-        if ((pattern == null) || (type == null)) {
-            return;
+    private void addName(String name, MimeType type) throws MimeTypeException {
+        MimeType previous = names.get(name); // type currently registered for this exact name, if any
+        if (previous == null || previous.isDescendantOf(type)) {
+            names.put(name, type); // first mapping, or type replaces its descendant (presumably the more general type wins; confirm isDescendantOf)
+        } else if (previous == type || type.isDescendantOf(previous)) {
+            // do nothing: the mapping already registered for this name is kept
+        } else {
+            throw new MimeTypeException("Conflicting name pattern: " + name);
         }
+    }
 
-        // Add the pattern in the good index
-        if ((pattern.indexOf('*') == -1) && (pattern.indexOf('?') == -1)
-                && (pattern.indexOf('[') == -1)) {
-            exactIdx.put(pattern, type);
-        } else if (pattern.startsWith("*.")) {
-            extIdx.put(pattern.substring(2), type);
+    private void addExtension(String extension, MimeType type)
+            throws MimeTypeException {
+        MimeType previous = extensions.get(extension);
+        if (previous == null || previous.isDescendantOf(type)) {
+            extensions.put(extension, type);
+            int length = extension.length();
+            minExtensionLength = Math.min(minExtensionLength, length);
+            maxExtensionLength = Math.max(maxExtensionLength, length);
+        } else if (previous == type || type.isDescendantOf(previous)) {
+            // do nothing
         } else {
-            others.put(escape(pattern), type);
+            throw new MimeTypeException(
+                    "Conflicting extension pattern: " + extension);
         }
-        // Add the pattern in the list of patterns
-        patterns.add(pattern);
     }
 
-    String[] getPatterns() {
-        return patterns.toArray(new String[patterns.size()]);
+    private void addGlob(String glob, MimeType type)
+            throws MimeTypeException {
+        MimeType previous = globs.get(glob); // glob is the regex string produced by compile()
+        if (previous == null || previous.isDescendantOf(type)) {
+            globs.put(glob, type); // store in globs: the map read by the lookup above and scanned by matches()
+        } else if (previous == type || type.isDescendantOf(previous)) {
+            // do nothing: the mapping already registered for this glob is kept
+        } else {
+            throw new MimeTypeException("Conflicting glob pattern: " + glob);
+        }
     }
 
     /**
@@ -107,78 +127,51 @@
      * special characters (`*?[') are matched before other wildcarded patterns
      * (since this covers the majority of the patterns).
      */
-    MimeType matches(String resourceName) {
-
-        // Preliminary check...
-        if (resourceName == null) {
-            return null;
-        }
+    public MimeType matches(String name) {
+        assert name != null;
 
         // First, try exact match of the provided resource name
-        MimeType type = exactIdx.get(resourceName);
-        if (type != null) {
-            return type;
-        }
-
-        // Then try exact match with only the resource name
-        String str = last(resourceName, '/');
-        if (str != null) {
-            type = exactIdx.get(str);
-            if (type != null) {
-                return type;
-            }
-        }
-        str = last(resourceName, '\\');
-        if (str != null) {
-            type = exactIdx.get(str);
-            if (type != null) {
-                return type;
-            }
+        if (names.containsKey(name)) {
+            return names.get(name);
         }
 
         // Then try "extension" (*.xxx) matching
-        int idx = resourceName.indexOf('.', 0);
-        while (idx != -1) {
-            type = extIdx.get(resourceName.substring(idx + 1));
-            if (type != null) {
-                return type;
+        int maxLength = Math.min(maxExtensionLength, name.length());
+        for (int n = maxLength; n >= minExtensionLength; n--) {
+            String extension = name.substring(name.length() - n);
+            if (extensions.containsKey(extension)) {
+                return extensions.get(extension);
             }
-            idx = resourceName.indexOf('.', idx + 1);
         }
 
         // And finally, try complex regexp matching
-        String longest = null;
-        for (String pattern : others.keySet()) {
-            if ((resourceName.matches(pattern))
-                    && (pattern.length() > longest.length())) {
-                longest = pattern;
+        for (Map.Entry<String, MimeType> entry : globs.entrySet()) {
+            if (name.matches(entry.getKey())) {
+                return entry.getValue();
             }
         }
-        if (longest != null) {
-            type = others.get(longest);
-        }
-        return type;
-    }
 
-    private final static String last(String str, char c) {
-        if (str == null) {
-            return null;
-        }
-        int idx = str.lastIndexOf(c);
-        if ((idx < 0) || (idx >= (str.length() - 1))) {
-            return null;
-        }
-        return str.substring(idx + 1);
+        return null;
     }
 
-    private final static String escape(String str) {
-        StringBuffer result = new StringBuffer(str.length());
-        for (int i = 0; i < str.length(); i++) {
-            String charAt = String.valueOf(str.charAt(i));
-            String replace = escapeMap.get(charAt);
-            result.append((replace != null) ? replace : charAt);
+    private String compile(String glob) {
+        StringBuilder pattern = new StringBuilder();
+        pattern.append("\\A");
+        for (int i = 0; i < glob.length(); i++) {
+            char ch = glob.charAt(i);
+            if (ch == '?') {
+                pattern.append('.');
+            } else if (ch == '*') {
+                pattern.append(".*");
+            } else if ("\\[]^.-$+(){}|".indexOf(ch) != -1) {
+                pattern.append('\\');
+                pattern.append(ch);
+            } else {
+                pattern.append(ch);
+            }
         }
-        return result.toString();
+        pattern.append("\\z");
+        return pattern.toString();
     }
 
 }

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=592371&r1=592370&r2=592371&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java Tue Nov  6 03:00:38 2007
@@ -104,49 +104,45 @@
     private MimeType getMimeType(InputStream stream, Metadata metadata)
             throws IOException {
         MimeTypes types = config.getMimeRepository();
-        MimeType type = null;
 
-        // Get type based on metadata hint (if available)
-        String typename = metadata.get(Metadata.CONTENT_TYPE);
-        if (typename != null) {
-            try {
-                type = types.forName(typename);
-            } catch (MimeTypeException e) {
-                // Malformed type name, ignore
-            }
-        }
-
-        // Get (or verify) type based on resourceName hint (if available)
-        String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
-        if (resourceName != null) {
-            MimeType match = types.getMimeType(resourceName);
-            if (match != null && (type == null || !type.matches(resourceName))) {
-                type = match;
-            }
-        }
-
-        // Get (or verify) type based on magic prefix
+        // Get type based on magic prefix
         stream.mark(types.getMinLength());
         try {
             byte[] prefix = getPrefix(stream, types.getMinLength());
-            MimeType match = types.getMimeType(prefix);
-            if (match != null && (type == null || !type.matches(prefix))) {
-                type = match;
+            MimeType type = types.getMimeType(prefix);
+            if (type != null) {
+                return type;
             }
         } finally {
             stream.reset();
         }
 
-        // Finally, use the default type if no matches found
-        if (type == null) {
+        // Get type based on resourceName hint (if available)
+        String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
+        if (resourceName != null) {
+            MimeType type = types.getMimeType(resourceName);
+            if (type != null) {
+                return type;
+            }
+        }
+
+        // Get type based on metadata hint (if available)
+        String typename = metadata.get(Metadata.CONTENT_TYPE);
+        if (typename != null) {
             try {
-                type = types.forName(MimeTypes.DEFAULT);
+                return types.forName(typename);
             } catch (MimeTypeException e) {
-                // Should never happen
+                // Malformed type name, ignore
             }
         }
 
-        return type;
+        // Finally, use the default type if no matches found
+        try {
+            return types.forName(MimeTypes.DEFAULT);
+        } catch (MimeTypeException e) {
+            // Should never happen
+            return null;
+        }
     }
 
     /**

Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=592371&r1=592370&r2=592371&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Tue Nov  6 03:00:38 2007
@@ -92,7 +92,6 @@
 		<glob pattern="*.rng" />
 		<glob pattern="*.rnx" />
 		<glob pattern="*.roles" />
-		<glob pattern="*.rss" />
 		<glob pattern="*.sh" />
 		<glob pattern="*.sql" />
 		<glob pattern="*.svg" />

Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java?rev=592371&r1=592370&r2=592371&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java Tue Nov  6 03:00:38 2007
@@ -49,7 +49,7 @@
      * @param tp the parameters encapsulated in a TestParams instance
      * @throws IOException
      */
-    private void assertAutoDetect(TestParams tp) throws IOException {
+    private void assertAutoDetect(TestParams tp) throws Exception {
 
         InputStream input =
             AutoDetectParserTest.class.getResourceAsStream(tp.resourceRealName);
@@ -72,9 +72,6 @@
 
             assertTrue("Expected content not found: " + tp,
                     writer.toString().contains(tp.expectedContentFragment));
-        } catch (Throwable t) {
-            fail("Test error asserting auto detect for parameters: " + t
-                    + "\nParameters: " + tp);
         } finally {
             input.close();
         }