You are viewing a plain text version of this content. The canonical link to the original was not preserved in this text version.
Posted to commits@tika.apache.org by ju...@apache.org on 2007/11/06 12:00:39 UTC
svn commit: r592371 - in /incubator/tika/trunk/src:
main/java/org/apache/tika/mime/ main/java/org/apache/tika/parser/
main/resources/mime/ test/java/org/apache/tika/parser/
Author: jukka
Date: Tue Nov 6 03:00:38 2007
New Revision: 592371
URL: http://svn.apache.org/viewvc?rev=592371&view=rev
Log:
TIKA-87 - MimeTypes should allow modification of MIME types
- Streamlined pattern handling
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java
incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java
incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java?rev=592371&r1=592370&r2=592371&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java Tue Nov 6 03:00:38 2007
@@ -92,9 +92,6 @@
*/
private final SortedSet<MimeType> subTypes = new TreeSet<MimeType>();
- /** The Mime-Type associated recognition patterns */
- private final Patterns patterns = new Patterns();
-
/** The magics associated to this Mime-Type */
private final ArrayList<Magic> magics = new ArrayList<Magic>();
@@ -203,25 +200,6 @@
}
/**
- * Adds a file name pattern for this media type.
- *
- * @param pattern file name pattern
- */
- public void addPattern(String pattern) {
- registry.addPattern(this, pattern);
- patterns.add(pattern, this);
- }
-
- /**
- * Returns the file name patterns for this media type.
- *
- * @return file name patterns
- */
- public String[] getPatterns() {
- return patterns.getPatterns();
- }
-
- /**
* Returns the aliases of this media type. The returned set is
* newly allocated and can be freely modified by the client.
*
@@ -297,10 +275,6 @@
public boolean hasMagic() {
return (magics.size() > 0);
- }
-
- public boolean matches(String url) {
- return (patterns.matches(url) == this);
}
public boolean matchesMagic(byte[] data) {
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=592371&r1=592370&r2=592371&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java Tue Nov 6 03:00:38 2007
@@ -94,15 +94,19 @@
*
* @param name
* of the document to analyze.
- * @return the Mime Content Type of the specified document name, or
- * <code>null</code> if none is found.
+ * @return the Mime Content Type of the specified document name
*/
public MimeType getMimeType(String name) {
- MimeType type = patterns.matches(name.toLowerCase());
- if (type != null)
+ MimeType type = patterns.matches(name);
+ if (type != null) {
+ return type;
+ }
+ type = patterns.matches(name.toLowerCase());
+ if (type != null) {
return type;
- // if it's null here, then return the default type
- return root;
+ } else {
+ return root;
+ }
}
/**
@@ -307,13 +311,14 @@
}
/**
- * Adds a file name pattern for the given media type. This method should
- * only be called from {@link MimeType#addPattern(String)}.
+ * Adds a file name pattern for the given media type.
*
* @param type media type
* @param pattern file name pattern
+ * @throws MimeTypeException if the pattern conflicts with existing ones
*/
- void addPattern(MimeType type, String pattern) {
+ public void addPattern(MimeType type, String pattern)
+ throws MimeTypeException {
patterns.add(pattern, type);
}
@@ -328,21 +333,6 @@
public int getMinLength() {
return 1024;
// return minLength;
- }
-
- /**
- * Add the specified mime-types in the repository.
- *
- * @param types
- * are the mime-types to add.
- */
- void add(MimeType[] types) {
- if (types == null) {
- return;
- }
- for (int i = 0; i < types.length; i++) {
- add(types[i]);
- }
}
/**
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java?rev=592371&r1=592370&r2=592371&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java Tue Nov 6 03:00:38 2007
@@ -159,7 +159,7 @@
type.setDescription(
nodeElement.getFirstChild().getNodeValue());
} else if (nodeElement.getTagName().equals("glob")) {
- type.addPattern(nodeElement.getAttribute("pattern"));
+ types.addPattern(type, nodeElement.getAttribute("pattern"));
} else if (nodeElement.getTagName().equals("magic")) {
readMagic(nodeElement, type);
} else if (nodeElement.getTagName().equals("alias")) {
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java?rev=592371&r1=592370&r2=592371&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/Patterns.java Tue Nov 6 03:00:38 2007
@@ -17,80 +17,100 @@
package org.apache.tika.mime;
// JDK imports
-import java.util.ArrayList;
+import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
/**
* Defines a MimeType pattern.
*/
class Patterns {
- private static Map<Character, String> escapeMap =
- new HashMap<Character, String>();
+ /**
+ * Index of exact name patterns.
+ */
+ private final Map<String, MimeType> names = new HashMap<String, MimeType>();
- static {
- escapeMap.put('\\', "\\\\");
- escapeMap.put('?', "\\?");
- escapeMap.put('[', "\\[");
- escapeMap.put(']', "\\]");
- escapeMap.put('^', "\\^");
- escapeMap.put('.', "\\.");
- escapeMap.put('-', "\\-");
- escapeMap.put('$', "\\$");
- escapeMap.put('+', "\\+");
- escapeMap.put('(', "\\(");
- escapeMap.put(')', "\\)");
- escapeMap.put('{', "\\{");
- escapeMap.put('}', "\\}");
- escapeMap.put('|', "\\|");
- escapeMap.put('*', ".*");
- }
+ /**
+ * Index of extension patterns of the form "*extension".
+ */
+ private final Map<String, MimeType> extensions =
+ new HashMap<String, MimeType>();
- /** Gathers all the patterns */
- private ArrayList<String> patterns = new ArrayList<String>();
+ private int minExtensionLength = Integer.MAX_VALUE;
- /** An index of exact matching patterns */
- private Map<String, MimeType> exactIdx = new HashMap<String, MimeType>();
+ private int maxExtensionLength = 0;
- /** An index of the patterns of the form "*.ext" */
- private Map<String, MimeType> extIdx = new HashMap<String, MimeType>();
+ /**
+ * Index of generic glob patterns, sorted by length.
+ */
+ private final SortedMap<String, MimeType> globs =
+ new TreeMap<String, MimeType>(new Comparator<String>() {
+ public int compare(String a, String b) {
+ int diff = b.length() - a.length();
+ if (diff == 0) {
+ diff = a.compareTo(b);
+ }
+ return diff;
+ }
+ });
+
+ public void add(String pattern, MimeType type) throws MimeTypeException {
+ assert pattern != null && type != null;
- /** A list of other patterns */
- private Map<String, MimeType> others = new HashMap<String, MimeType>();
-
- void add(String[] patterns, MimeType type) {
- // Some preliminary checks
- if ((patterns == null) || (type == null)) {
- return;
- }
- // All is ok, so add the patterns
- for (String pattern : patterns) {
- add(pattern, type);
+ if (pattern.indexOf('*') == -1
+ && pattern.indexOf('?') == -1
+ && pattern.indexOf('[') == -1) {
+ addName(pattern, type);
+ } else if (pattern.startsWith("*")
+ && pattern.indexOf('*', 1) == -1
+ && pattern.indexOf('?') == -1
+ && pattern.indexOf('[') == -1) {
+ addExtension(pattern.substring(1), type);
+ } else {
+ addGlob(compile(pattern), type);
}
}
- void add(String pattern, MimeType type) {
- // Some preliminary checks
- if ((pattern == null) || (type == null)) {
- return;
+ private void addName(String name, MimeType type) throws MimeTypeException {
+ MimeType previous = names.get(name);
+ if (previous == null || previous.isDescendantOf(type)) {
+ names.put(name, type);
+ } else if (previous == type || type.isDescendantOf(previous)) {
+ // do nothing
+ } else {
+ throw new MimeTypeException("Conflicting name pattern: " + name);
}
+ }
- // Add the pattern in the good index
- if ((pattern.indexOf('*') == -1) && (pattern.indexOf('?') == -1)
- && (pattern.indexOf('[') == -1)) {
- exactIdx.put(pattern, type);
- } else if (pattern.startsWith("*.")) {
- extIdx.put(pattern.substring(2), type);
+ private void addExtension(String extension, MimeType type)
+ throws MimeTypeException {
+ MimeType previous = extensions.get(extension);
+ if (previous == null || previous.isDescendantOf(type)) {
+ extensions.put(extension, type);
+ int length = extension.length();
+ minExtensionLength = Math.min(minExtensionLength, length);
+ maxExtensionLength = Math.max(maxExtensionLength, length);
+ } else if (previous == type || type.isDescendantOf(previous)) {
+ // do nothing
} else {
- others.put(escape(pattern), type);
+ throw new MimeTypeException(
+ "Conflicting extension pattern: " + extension);
}
- // Add the pattern in the list of patterns
- patterns.add(pattern);
}
- String[] getPatterns() {
- return patterns.toArray(new String[patterns.size()]);
+ private void addGlob(String glob, MimeType type)
+ throws MimeTypeException {
+ MimeType previous = globs.get(glob);
+ if (previous == null || previous.isDescendantOf(type)) {
+ extensions.put(glob, type);
+ } else if (previous == type || type.isDescendantOf(previous)) {
+ // do nothing
+ } else {
+ throw new MimeTypeException("Conflicting glob pattern: " + glob);
+ }
}
/**
@@ -107,78 +127,51 @@
* special characters (`*?[') are matched before other wildcarded patterns
* (since this covers the majority of the patterns).
*/
- MimeType matches(String resourceName) {
-
- // Preliminary check...
- if (resourceName == null) {
- return null;
- }
+ public MimeType matches(String name) {
+ assert name != null;
// First, try exact match of the provided resource name
- MimeType type = exactIdx.get(resourceName);
- if (type != null) {
- return type;
- }
-
- // Then try exact match with only the resource name
- String str = last(resourceName, '/');
- if (str != null) {
- type = exactIdx.get(str);
- if (type != null) {
- return type;
- }
- }
- str = last(resourceName, '\\');
- if (str != null) {
- type = exactIdx.get(str);
- if (type != null) {
- return type;
- }
+ if (names.containsKey(name)) {
+ return names.get(name);
}
// Then try "extension" (*.xxx) matching
- int idx = resourceName.indexOf('.', 0);
- while (idx != -1) {
- type = extIdx.get(resourceName.substring(idx + 1));
- if (type != null) {
- return type;
+ int maxLength = Math.min(maxExtensionLength, name.length());
+ for (int n = maxLength; n >= minExtensionLength; n--) {
+ String extension = name.substring(name.length() - n);
+ if (extensions.containsKey(extension)) {
+ return extensions.get(extension);
}
- idx = resourceName.indexOf('.', idx + 1);
}
// And finally, try complex regexp matching
- String longest = null;
- for (String pattern : others.keySet()) {
- if ((resourceName.matches(pattern))
- && (pattern.length() > longest.length())) {
- longest = pattern;
+ for (Map.Entry<String, MimeType> entry : globs.entrySet()) {
+ if (name.matches(entry.getKey())) {
+ return entry.getValue();
}
}
- if (longest != null) {
- type = others.get(longest);
- }
- return type;
- }
- private final static String last(String str, char c) {
- if (str == null) {
- return null;
- }
- int idx = str.lastIndexOf(c);
- if ((idx < 0) || (idx >= (str.length() - 1))) {
- return null;
- }
- return str.substring(idx + 1);
+ return null;
}
- private final static String escape(String str) {
- StringBuffer result = new StringBuffer(str.length());
- for (int i = 0; i < str.length(); i++) {
- String charAt = String.valueOf(str.charAt(i));
- String replace = escapeMap.get(charAt);
- result.append((replace != null) ? replace : charAt);
+ private String compile(String glob) {
+ StringBuilder pattern = new StringBuilder();
+ pattern.append("\\A");
+ for (int i = 0; i < glob.length(); i++) {
+ char ch = glob.charAt(i);
+ if (ch == '?') {
+ pattern.append('.');
+ } else if (ch == '*') {
+ pattern.append(".*");
+ } else if ("\\[]^.-$+(){}|".indexOf(ch) != -1) {
+ pattern.append('\\');
+ pattern.append(ch);
+ } else {
+ pattern.append(ch);
+ }
}
- return result.toString();
+ pattern.append("\\z");
+ return pattern.toString();
}
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=592371&r1=592370&r2=592371&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java Tue Nov 6 03:00:38 2007
@@ -104,49 +104,45 @@
private MimeType getMimeType(InputStream stream, Metadata metadata)
throws IOException {
MimeTypes types = config.getMimeRepository();
- MimeType type = null;
- // Get type based on metadata hint (if available)
- String typename = metadata.get(Metadata.CONTENT_TYPE);
- if (typename != null) {
- try {
- type = types.forName(typename);
- } catch (MimeTypeException e) {
- // Malformed type name, ignore
- }
- }
-
- // Get (or verify) type based on resourceName hint (if available)
- String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
- if (resourceName != null) {
- MimeType match = types.getMimeType(resourceName);
- if (match != null && (type == null || !type.matches(resourceName))) {
- type = match;
- }
- }
-
- // Get (or verify) type based on magic prefix
+ // Get type based on magic prefix
stream.mark(types.getMinLength());
try {
byte[] prefix = getPrefix(stream, types.getMinLength());
- MimeType match = types.getMimeType(prefix);
- if (match != null && (type == null || !type.matches(prefix))) {
- type = match;
+ MimeType type = types.getMimeType(prefix);
+ if (type != null) {
+ return type;
}
} finally {
stream.reset();
}
- // Finally, use the default type if no matches found
- if (type == null) {
+ // Get type based on resourceName hint (if available)
+ String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
+ if (resourceName != null) {
+ MimeType type = types.getMimeType(resourceName);
+ if (type != null) {
+ return type;
+ }
+ }
+
+ // Get type based on metadata hint (if available)
+ String typename = metadata.get(Metadata.CONTENT_TYPE);
+ if (typename != null) {
try {
- type = types.forName(MimeTypes.DEFAULT);
+ return types.forName(typename);
} catch (MimeTypeException e) {
- // Should never happen
+ // Malformed type name, ignore
}
}
- return type;
+ // Finally, use the default type if no matches found
+ try {
+ return types.forName(MimeTypes.DEFAULT);
+ } catch (MimeTypeException e) {
+ // Should never happen
+ return null;
+ }
}
/**
Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=592371&r1=592370&r2=592371&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Tue Nov 6 03:00:38 2007
@@ -92,7 +92,6 @@
<glob pattern="*.rng" />
<glob pattern="*.rnx" />
<glob pattern="*.roles" />
- <glob pattern="*.rss" />
<glob pattern="*.sh" />
<glob pattern="*.sql" />
<glob pattern="*.svg" />
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java?rev=592371&r1=592370&r2=592371&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java Tue Nov 6 03:00:38 2007
@@ -49,7 +49,7 @@
* @param tp the parameters encapsulated in a TestParams instance
* @throws IOException
*/
- private void assertAutoDetect(TestParams tp) throws IOException {
+ private void assertAutoDetect(TestParams tp) throws Exception {
InputStream input =
AutoDetectParserTest.class.getResourceAsStream(tp.resourceRealName);
@@ -72,9 +72,6 @@
assertTrue("Expected content not found: " + tp,
writer.toString().contains(tp.expectedContentFragment));
- } catch (Throwable t) {
- fail("Test error asserting auto detect for parameters: " + t
- + "\nParameters: " + tp);
} finally {
input.close();
}