You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2007/11/04 10:12:32 UTC
svn commit: r591743 - in /incubator/tika/trunk/src:
main/java/org/apache/tika/mime/ main/java/org/apache/tika/parser/
main/resources/mime/ test/java/org/apache/tika/mime/
Author: jukka
Date: Sun Nov 4 01:12:30 2007
New Revision: 591743
URL: http://svn.apache.org/viewvc?rev=591743&view=rev
Log:
TIKA-87 - MimeTypes should allow modification of MIME types
- MimeType.addAlias(String) can now be used to add new aliases
- MimeType.addPattern(String) can now be used to add new patterns
- MimeTypes.forName(String) validates the name
- MimeTypes.forName(String) creates and registers the type if needed
- Simplified type name handling and validation
- New test cases
Added:
incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypeTest.java (with props)
incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java (with props)
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java
incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java
incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java?rev=591743&r1=591742&r2=591743&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeType.java Sun Nov 4 01:12:30 2007
@@ -18,53 +18,82 @@
// JDK imports
import java.util.ArrayList;
-import java.util.Collections;
+import java.util.SortedSet;
+import java.util.TreeSet;
import java.util.regex.Pattern;
import org.apache.tika.utils.StringUtil;
/**
- * Defines a Mime Content Type.
- *
- *
+ * Internet media type.
*/
public final class MimeType implements Comparable<MimeType> {
- /** The primary and sub types separator */
- private final static String SEPARATOR = "/";
-
- /** The parameters separator */
- private final static String PARAMS_SEP = ";";
-
- /** Special characters not allowed in content types. */
- private final static String SPECIALS = "()<>@,;:\\\"/[]?=";
-
- /** The Mime-Type full name */
- private String name = null;
+ /**
+ * Checks that the given string is a valid Internet media type name
+ * based on rules from RFC 2045 section 5.1. For validation purposes the
+ * rules can be simplified to the following:
+ * <pre>
+ * name := token "/" token
+ * token := 1*&lt;any (US-ASCII) CHAR except SPACE, CTLs, or tspecials&gt;
+ * tspecials := "(" / ")" / "&lt;" / "&gt;" / "@" / "," / ";" / ":" /
+ * "\" / &lt;"&gt; / "/" / "[" / "]" / "?" / "="
+ * </pre>
+ *
+ * @param name name string
+ * @return <code>true</code> if the string is a valid media type name,
+ * <code>false</code> otherwise
+ */
+ public static boolean isValid(String name) {
+ assert name != null;
+
+ boolean slash = false;
+ for (int i = 0; i < name.length(); i++) {
+ char ch = name.charAt(i);
+ if (ch <= ' ' || ch >= 127 || ch == '(' || ch == ')' ||
+ ch == '<' || ch == '>' || ch == '@' || ch == ',' ||
+ ch == ';' || ch == ':' || ch == '\\' || ch == '"' ||
+ ch == '[' || ch == ']' || ch == '?' || ch == '=') {
+ return false;
+ } else if (ch == '/') {
+ if (slash || i == 0 || i + 1 == name.length()) {
+ return false;
+ }
+ slash = true;
+ }
+ }
+ return slash;
+ }
- /** The Mime-Type primary type */
- private String primary = null;
+ /**
+ * The media type registry that contains this type.
+ */
+ private final MimeTypes registry;
- /** The Mime-Type sub type */
- private String sub = null;
+ /**
+ * Lower case name of this media type.
+ */
+ private final String name;
/** The Mime-Type description */
private String description = null;
/** The Mime-Type associated recognition patterns */
- private Patterns patterns = null;
+ private final Patterns patterns = new Patterns();
/** The magics associated to this Mime-Type */
- private ArrayList<Magic> magics = null;
+ private final ArrayList<Magic> magics = new ArrayList<Magic>();
- /** The aliases Mime-Types for this one */
- private ArrayList<String> aliases = null;
+ /**
+ * Lower case alias names of this media type.
+ */
+ private final SortedSet<String> aliases = new TreeSet<String>();
/** The root-XML associated to this Mime-Type */
- private ArrayList<RootXML> rootXML = null;
+ private final ArrayList<RootXML> rootXML = new ArrayList<RootXML>();
/** The sub-class-of associated to this Mime-Type */
- private ArrayList<String> superTypes = null;
+ private final ArrayList<String> superTypes = new ArrayList<String>();
/** The mime-type level (regarding its subTypes) */
private int level = 0;
@@ -73,167 +102,32 @@
private int minLength = 0;
/**
- * Creates a MimeType from a String.
- *
- * @param name
- * the MIME content type String.
- */
- public MimeType(String name) throws MimeTypeException {
-
- if (name == null || name.length() <= 0) {
- throw new MimeTypeException("The type can not be null or empty");
- }
-
- // Split the two parts of the Mime Content Type
- String[] parts = name.split(SEPARATOR, 2);
-
- // Checks validity of the parts
- if (parts.length != 2) {
- throw new MimeTypeException("Invalid Content Type " + name);
- }
- init(parts[0], parts[1]);
+ * Creates a media type with the given name and containing media type
+ * registry. The name is expected to be valid and normalized to lower
+ * case. This constructor should only be called by
+ * {@link MimeTypes#forName(String)} to keep the media type mapping
+ * up to date.
+ *
+ * @param registry the media type registry that contains this type
+ * @param name media type name
+ */
+ MimeType(MimeTypes registry, String name) {
+ assert registry != null;
+ assert isValid(name) && name.equals(name.toLowerCase());
+ this.registry = registry;
+ this.name = name;
}
/**
- * Creates a MimeType with the given primary type and sub type.
+ * Returns the name of this Internet media type.
*
- * @param primary
- * the content type primary type.
- * @param sub
- * the content type sub type.
- */
- public MimeType(String primary, String sub) throws MimeTypeException {
- init(primary, sub);
- }
-
- /** Init method used by constructors. */
- private void init(String primary, String sub) throws MimeTypeException {
-
- // Preliminary checks...
- if ((primary == null) || (primary.length() <= 0) || (!isValid(primary))) {
- throw new MimeTypeException("Invalid Primary Type " + primary);
- }
- // Remove optional parameters from the sub type
- String clearedSub = null;
- if (sub != null) {
- clearedSub = sub.split(PARAMS_SEP)[0];
- }
- if ((clearedSub == null) || (clearedSub.length() <= 0)
- || (!isValid(clearedSub))) {
- throw new MimeTypeException("Invalid Sub Type " + clearedSub);
- }
-
- // All is ok, assign values
- this.primary = primary.toLowerCase().trim();
- this.sub = clearedSub.toLowerCase().trim();
- this.name = this.primary + SEPARATOR + this.sub;
- this.patterns = new Patterns();
- this.magics = new ArrayList<Magic>();
- this.aliases = new ArrayList<String>();
- this.rootXML = new ArrayList<RootXML>();
- this.superTypes = new ArrayList<String>();
- }
-
- /**
- * Cleans a content-type. This method cleans a content-type by removing its
- * optional parameters and returning only its
- * <code>primary-type/sub-type</code>.
- *
- * @param type
- * is the content-type to clean.
- * @return the cleaned version of the specified content-type.
- * @throws MimeTypeException
- * if something wrong occurs during the parsing/cleaning of the
- * specified type.
- */
- public final static String clean(String type) throws MimeTypeException {
- return (new MimeType(type)).getName();
- }
-
- /**
- * Return the name of this mime-type.
- *
- * @return the name of this mime-type.
+ * @return media type name (lower case)
*/
public String getName() {
return name;
}
/**
- * Return the primary type of this mime-type.
- *
- * @return the primary type of this mime-type.
- */
- public String getPrimaryType() {
- return primary;
- }
-
- /**
- * Return the sub type of this mime-type.
- *
- * @return the sub type of this mime-type.
- */
- public String getSubType() {
- return sub;
- }
-
- // Inherited Javadoc
- public String toString() {
- StringBuffer buf = new StringBuffer();
- buf.append(name).append(" -- ").append(getDescription()).append("\n")
- .append("Aliases: ");
- if (aliases.size() < 1) {
- buf.append(" NONE");
- }
- buf.append("\n");
- for (int i = 0; i < aliases.size(); i++) {
- buf.append("\t").append(aliases.get(i)).append("\n");
- }
- buf.append("Patterns:");
- String[] patterns = this.patterns.getPatterns();
- if (patterns.length < 1) {
- buf.append(" NONE");
- }
- buf.append("\n");
- for (int i = 0; i < patterns.length; i++) {
- buf.append("\t").append(patterns[i]).append("\n");
- }
- buf.append("Magics: ");
- if (magics.size() < 1) {
- buf.append(" NONE");
- }
- buf.append("\n");
- for (int i = 0; i < magics.size(); i++) {
- buf.append("\t").append(magics.get(i)).append("\n");
- }
-
- return buf.toString();
- }
-
- /**
- * Indicates if an object is equal to this mime-type. The specified object
- * is equal to this mime-type if it is not null, and it is an instance of
- * MimeType and its name is equals to this mime-type.
- *
- * @param object
- * the reference object with which to compare.
- * @return <code>true</code> if this mime-type is equal to the object
- * argument; <code>false</code> otherwise.
- */
- public boolean equals(Object object) {
- try {
- return ((MimeType) object).getName().equals(this.name);
- } catch (Exception e) {
- return false;
- }
- }
-
- // Inherited Javadoc
- public int hashCode() {
- return name.hashCode();
- }
-
- /**
* Return the description of this mime-type.
*
* @return the description of this mime-type.
@@ -253,32 +147,51 @@
}
/**
- * Add a supported file-naming pattern.
- *
- * @param pattern
- * to add to the list of recognition pattern for this mime-type.
+ * Adds a file name pattern for this media type.
+ *
+ * @param pattern file name pattern
*/
- void addPattern(String pattern) {
+ public synchronized void addPattern(String pattern) {
+ registry.addPattern(this, pattern);
patterns.add(pattern, this);
}
/**
- * Return the recogition patterns for this mime-type
+ * Returns the file name patterns for this media type.
*
- * @return the recoginition patterns associated to this mime-type.
+ * @return file name patterns
*/
- String[] getPatterns() {
+ public synchronized String[] getPatterns() {
return patterns.getPatterns();
}
/**
- * Add an alias to this mime-type
- *
- * @param alias
- * to add to this mime-type.
- */
- void addAlias(String alias) {
- aliases.add(alias);
+ * Returns the aliases of this media type. The returned set is
+ * newly allocated and can be freely modified by the client.
+ *
+ * @return media type aliases
+ */
+ public synchronized SortedSet<String> getAliases() {
+ return new TreeSet<String>(aliases);
+ }
+
+ /**
+ * Adds an alias name for this media type.
+ *
+ * @param alias media type alias (case insensitive)
+ * @throws MimeTypeException if the alias is invalid
+ * or already registered for another media type
+ */
+ public synchronized void addAlias(String alias) throws MimeTypeException {
+ if (isValid(alias)) {
+ alias = alias.toLowerCase();
+ if (!name.equals(alias) && !aliases.contains(alias)) {
+ registry.addAlias(this, alias);
+ aliases.add(alias);
+ }
+ } else {
+ throw new MimeTypeException("Invalid media type alias: " + alias);
+ }
}
/**
@@ -336,15 +249,6 @@
this.level = level;
}
- /**
- * Return the recogition patterns for this mime-type
- *
- * @return the recoginition patterns associated to this mime-type.
- */
- public String[] getAliases() {
- return aliases.toArray(new String[aliases.size()]);
- }
-
Magic[] getMagics() {
return magics.toArray(new Magic[magics.size()]);
}
@@ -382,26 +286,6 @@
return matchesXML(data) || matchesMagic(data);
}
- /** Checks if the specified primary or sub type is valid. */
- private boolean isValid(String type) {
- return (type != null) && (type.trim().length() > 0)
- && !hasCtrlOrSpecials(type);
- }
-
- /** Checks if the specified string contains some special characters. */
- private boolean hasCtrlOrSpecials(String type) {
- int len = type.length();
- int i = 0;
- while (i < len) {
- char c = type.charAt(i);
- if (c <= '\032' || SPECIALS.indexOf(c) > 0) {
- return true;
- }
- i++;
- }
- return false;
- }
-
/**
* Defines a RootXML description. RootXML is made of a localName and/or a
* namespaceURI.
@@ -468,12 +352,25 @@
}
}
+ //----------------------------------------------------------< Comparable >
+
public int compareTo(MimeType o) {
int diff = level - o.level;
if (diff == 0) {
diff = name.compareTo(o.name);
}
return diff;
+ }
+
+ //--------------------------------------------------------------< Object >
+
+ /**
+ * Returns the name of this Internet media type.
+ *
+ * @return media type name
+ */
+ public String toString() {
+ return name;
}
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=591743&r1=591742&r2=591743&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypes.java Sun Nov 4 01:12:30 2007
@@ -48,8 +48,10 @@
/** The default <code>application/octet-stream</code> MimeType */
public final static String DEFAULT = "application/octet-stream";
+ private final MimeType root;
+
/** All the registered MimeTypes indexed on their name */
- private Map<String, MimeType> types = new HashMap<String, MimeType>();
+ private final Map<String, MimeType> types = new HashMap<String, MimeType>();
/** The patterns matcher */
private Patterns patterns = new Patterns();
@@ -66,6 +68,11 @@
/** The minimum length of data to provide to check all MimeTypes */
private int minLength = 0;
+ public MimeTypes() {
+ root = new MimeType(this, DEFAULT);
+ types.put(root.getName(), root);
+ }
+
/**
* Find the Mime Content Type of a file.
*
@@ -103,7 +110,7 @@
if (type != null)
return type;
// if it's null here, then return the default type
- return forName(DEFAULT);
+ return root;
}
/**
@@ -183,30 +190,21 @@
}
public String getType(String typeName, String url, byte[] data) {
- MimeType type = null;
- try {
- typeName = MimeType.clean(typeName);
- type = typeName == null ? null : forName(typeName);
- } catch (MimeTypeException mte) {
- // Seems to be a malformed mime type name...
- }
-
- if (typeName == null || type == null || !type.matches(url)) {
- // If no mime-type header, or cannot find a corresponding registered
- // mime-type, or the one found doesn't match the url pattern
- // it shouldbe, then guess a mime-type from the url pattern
- type = getMimeType(url);
- typeName = type == null ? typeName : type.getName();
- }
- // if (typeName == null || type == null ||
- // (this.magic && type.hasMagic() && !type.matches(data))) {
- // If no mime-type already found, or the one found doesn't match
- // the magic bytes it should be, then, guess a mime-type from the
- // document content (magic bytes)
- type = getMimeType(data);
- typeName = type == null ? typeName : type.getName();
- // }
- return typeName;
+ MimeType type = getMimeType(url, data);
+
+ if (type == null && typeName != null) {
+ try {
+ type = forName(typeName);
+ } catch (MimeTypeException e) {
+ // Invalid type name hint
+ }
+ }
+
+ if (type == null) {
+ type = root;
+ }
+
+ return type.getName();
}
/**
@@ -274,15 +272,56 @@
}
/**
- * Find a Mime Content Type from its name.
- *
- * @param name
- * is the content type name
- * @return the MimeType for the specified name, or <code>null</code> if no
- * MimeType is registered for this name.
+ * Returns the registered media type with the given name (or alias).
+ * The named media type is automatically registered (and returned) if
+ * it doesn't already exist.
+ *
+ * @param name media type name (case-insensitive)
+ * @return the registered media type with the given name or alias
+ * @throws MimeTypeException if the given media type name is invalid
+ */
+ public synchronized MimeType forName(String name)
+ throws MimeTypeException {
+ if (MimeType.isValid(name)) {
+ name = name.toLowerCase();
+ MimeType type = types.get(name);
+ if (type == null) {
+ type = new MimeType(this, name);
+ types.put(name, type);
+ }
+ return type;
+ } else {
+ throw new MimeTypeException("Invalid media type name: " + name);
+ }
+ }
+
+ /**
+ * Adds an alias for the given media type. This method should only
+ * be called from {@link MimeType#addAlias(String)}.
+ *
+ * @param type media type
+ * @param alias media type alias (normalized to lower case)
+ * @throws MimeTypeException if the alias already exists
+ */
+ synchronized void addAlias(MimeType type, String alias)
+ throws MimeTypeException {
+ if (!types.containsKey(alias)) {
+ types.put(alias, type);
+ } else {
+ throw new MimeTypeException(
+ "Media type alias already exists: " + alias);
+ }
+ }
+
+ /**
+ * Adds a file name pattern for the given media type. This method should
+ * only be called from {@link MimeType#addPattern(String)}.
+ *
+ * @param type media type
+ * @param pattern file name pattern
*/
- public MimeType forName(String name) {
- return types.get(name);
+ synchronized void addPattern(MimeType type, String pattern) {
+ patterns.add(pattern, type);
}
/**
@@ -351,8 +390,6 @@
// Update minLentgth
minLength = Math.max(minLength, type.getMinLength());
- // Update the extensions index...
- patterns.add(type.getPatterns(), type);
// Update the magics index...
if (type.hasMagic()) {
magics.addAll(Arrays.asList(type.getMagics()));
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java?rev=591743&r1=591742&r2=591743&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/mime/MimeTypesReader.java Sun Nov 4 01:12:30 2007
@@ -130,79 +130,60 @@
void read(Document document) {
Element element = document.getDocumentElement();
if (element != null && element.getTagName().equals("mime-info")) {
- readMimeInfo(element);
- }
- }
-
- /** Read Element named mime-info. */
- private MimeType[] readMimeInfo(Element element) {
- ArrayList<MimeType> types = new ArrayList<MimeType>();
- NodeList nodes = element.getChildNodes();
- for (int i = 0; i < nodes.getLength(); i++) {
- Node node = nodes.item(i);
- if (node.getNodeType() == Node.ELEMENT_NODE) {
- Element nodeElement = (Element) node;
- if (nodeElement.getTagName().equals("mime-type")) {
- readMimeType(nodeElement);
+ NodeList nodes = element.getChildNodes();
+ for (int i = 0; i < nodes.getLength(); i++) {
+ Node node = nodes.item(i);
+ if (node.getNodeType() == Node.ELEMENT_NODE) {
+ Element child = (Element) node;
+ if (child.getTagName().equals("mime-type")) {
+ readMimeType(child);
+ }
}
}
+ } else {
+ logger.warn("Not a <mime-info/> configuration document");
}
- return types.toArray(new MimeType[types.size()]);
}
/** Read Element named mime-type. */
private void readMimeType(Element element) {
-
- MimeType type = null;
-
+ String name = element.getAttribute("type");
try {
- type = new MimeType(element.getAttribute("type"));
- } catch (MimeTypeException mte) {
- // Mime Type not valid... just ignore it
- if (logger.isInfoEnabled()) {
- logger.info(mte.toString() + " ... Ignoring!");
- }
- return;
- }
+ MimeType type = types.forName(name);
- NodeList nodes = element.getChildNodes();
- for (int i = 0; i < nodes.getLength(); i++) {
- Node node = nodes.item(i);
- if (node.getNodeType() == Node.ELEMENT_NODE) {
- Element nodeElement = (Element) node;
- if (nodeElement.getTagName().equals("_comment")) {
- type.setDescription(nodeElement.getFirstChild()
- .getNodeValue());
- } else if (nodeElement.getTagName().equals("glob")) {
- readGlob(nodeElement, type);
- } else if (nodeElement.getTagName().equals("magic")) {
- readMagic(nodeElement, type);
- } else if (nodeElement.getTagName().equals("alias")) {
- readAlias(nodeElement, type);
- } else if (nodeElement.getTagName().equals("root-XML")) {
- readRootXML(nodeElement, type);
- } else if (nodeElement.getTagName().equals("sub-class-of")) {
- readSubClassOf(nodeElement, type);
+ NodeList nodes = element.getChildNodes();
+ for (int i = 0; i < nodes.getLength(); i++) {
+ Node node = nodes.item(i);
+ if (node.getNodeType() == Node.ELEMENT_NODE) {
+ Element nodeElement = (Element) node;
+ if (nodeElement.getTagName().equals("_comment")) {
+ type.setDescription(
+ nodeElement.getFirstChild().getNodeValue());
+ } else if (nodeElement.getTagName().equals("glob")) {
+ type.addPattern(nodeElement.getAttribute("pattern"));
+ } else if (nodeElement.getTagName().equals("magic")) {
+ readMagic(nodeElement, type);
+ } else if (nodeElement.getTagName().equals("alias")) {
+ String alias = nodeElement.getAttribute("type");
+ try {
+ type.addAlias(alias);
+ } catch (MimeTypeException e) {
+ logger.warn("Invalid media type alias: " + alias, e);
+ }
+ } else if (nodeElement.getTagName().equals("root-XML")) {
+ readRootXML(nodeElement, type);
+ } else if (nodeElement.getTagName().equals("sub-class-of")) {
+ readSubClassOf(nodeElement, type);
+ }
}
}
+ } catch (MimeTypeException e) {
+ logger.warn("Invalid media type configuration entry: " + name, e);
}
-
- types.add(type);
- }
-
- /** Read Element named glob. */
- private void readGlob(Element element, MimeType type) {
- type.addPattern(element.getAttribute("pattern"));
- }
-
- /** Read Element named alias. */
- private void readAlias(Element element, MimeType type) {
- type.addAlias(element.getAttribute("type"));
}
/** Read Element named magic. */
private void readMagic(Element element, MimeType mimeType) {
-
Magic magic = null;
try {
magic = new Magic(Integer
@@ -296,83 +277,6 @@
private void readSubClassOf(Element element, MimeType mimeType) {
mimeType.addSuperType(element.getAttribute("type"));
- }
-
- /** Prints the specified node, then prints all of its children. */
- public static void printDOM(Node node) {
- int type = node.getNodeType();
- switch (type) {
- // print the document element
- case Node.DOCUMENT_NODE: {
- System.out.println("<?xml version=\"1.0\" ?>");
- printDOM(((Document) node).getDocumentElement());
- break;
- }
-
- // print element with attributes
- case Node.ELEMENT_NODE: {
- System.out.print("<");
- System.out.print(node.getNodeName());
- NamedNodeMap attrs = node.getAttributes();
- for (int i = 0; i < attrs.getLength(); i++) {
- Node attr = attrs.item(i);
- System.out.print(" " + attr.getNodeName().trim() + "=\""
- + attr.getNodeValue().trim() + "\"");
- }
- System.out.println(">");
-
- NodeList children = node.getChildNodes();
- if (children != null) {
- int len = children.getLength();
- for (int i = 0; i < len; i++)
- printDOM(children.item(i));
- }
-
- break;
- }
-
- // handle entity reference nodes
- case Node.ENTITY_REFERENCE_NODE: {
- System.out.print("&");
- System.out.print(node.getNodeName().trim());
- System.out.print(";");
- break;
- }
-
- // print cdata sections
- case Node.CDATA_SECTION_NODE: {
- System.out.print("<![CDATA[");
- System.out.print(node.getNodeValue().trim());
- System.out.print("]]>");
- break;
- }
-
- // print text
- case Node.TEXT_NODE: {
- System.out.print(node.getNodeValue().trim());
- break;
- }
-
- // print processing instruction
- case Node.PROCESSING_INSTRUCTION_NODE: {
- System.out.print("<?");
- System.out.print(node.getNodeName().trim());
- String data = node.getNodeValue().trim();
- {
- System.out.print(" ");
- System.out.print(data);
- }
- System.out.print("?>");
- break;
- }
- }
-
- if (type == Node.ELEMENT_NODE) {
- System.out.println();
- System.out.print("</");
- System.out.print(node.getNodeName().trim());
- System.out.print('>');
- }
}
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=591743&r1=591742&r2=591743&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java (original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/parser/AutoDetectParser.java Sun Nov 4 01:12:30 2007
@@ -110,7 +110,6 @@
String typename = metadata.get(Metadata.CONTENT_TYPE);
if (typename != null) {
try {
- typename = MimeType.clean(typename);
type = types.forName(typename);
} catch (MimeTypeException e) {
// Malformed type name, ignore
@@ -140,7 +139,11 @@
// Finally, use the default type if no matches found
if (type == null) {
- type = types.forName(MimeTypes.DEFAULT);
+ try {
+ type = types.forName(MimeTypes.DEFAULT);
+ } catch (MimeTypeException e) {
+ // Should never happen
+ }
}
return type;
Modified: incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml?rev=591743&r1=591742&r2=591743&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml (original)
+++ incubator/tika/trunk/src/main/resources/mime/tika-mimetypes.xml Sun Nov 4 01:12:30 2007
@@ -523,7 +523,7 @@
</mime-type> -->
<mime-type type="application/x-ms-dos-executable">
- <alias type="application/x-dosexec;exe" />
+ <alias type="application/x-dosexec" />
</mime-type>
<mime-type type="application/ogg">
Added: incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypeTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypeTest.java?rev=591743&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypeTest.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypeTest.java Sun Nov 4 01:12:30 2007
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+import junit.framework.TestCase;
+
+public class MimeTypeTest extends TestCase {
+
+ public void testIsValidName() {
+ assertTrue(MimeType.isValid("application/octet-stream"));
+ assertTrue(MimeType.isValid("text/plain"));
+ assertTrue(MimeType.isValid("foo/bar"));
+ assertTrue(MimeType.isValid("a/b"));
+
+ assertFalse(MimeType.isValid("application"));
+ assertFalse(MimeType.isValid("application/"));
+ assertFalse(MimeType.isValid("/"));
+ assertFalse(MimeType.isValid("/octet-stream"));
+ assertFalse(MimeType.isValid("application//octet-stream"));
+ assertFalse(MimeType.isValid("application/octet=stream"));
+ assertFalse(MimeType.isValid("application/\u00f6ctet-stream"));
+ assertFalse(MimeType.isValid("text/plain;"));
+ assertFalse(MimeType.isValid("text/plain; charset=UTF-8"));
+ }
+
+}
Propchange: incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypeTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java?rev=591743&view=auto
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java (added)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java Sun Nov 4 01:12:30 2007
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.mime;
+
+import junit.framework.TestCase;
+
+public class MimeTypesTest extends TestCase {
+
+ private MimeTypes types;
+
+ protected void setUp() {
+ types = new MimeTypes();
+ }
+
+ public void testForName() throws MimeTypeException {
+ assertNotNull(types.forName("text/plain"));
+ assertEquals("text/plain", types.forName("text/plain").getName());
+ assertEquals("text/plain", types.forName("TEXT/PLAIN").getName());
+
+ try {
+ types.forName("invalid");
+ fail("MimeTypeException not thrown on invalid type name");
+ } catch (MimeTypeException e) {
+ // expected
+ }
+ }
+
+ public void addAlias() throws MimeTypeException {
+ types.addAlias(types.forName("text/plain"), "foo/bar");
+ assertNotNull(types.forName("foo/bar"));
+ assertEquals("text/plain", types.forName("foo/bar").getName());
+
+ try {
+ types.addAlias(types.forName("text/plain"), "invalid");
+ fail("MimeTypeException not thrown on invalid alias name");
+ } catch (MimeTypeException e) {
+ // expected
+ }
+ }
+}
Propchange: incubator/tika/trunk/src/test/java/org/apache/tika/mime/MimeTypesTest.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java
URL: http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java?rev=591743&r1=591742&r2=591743&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java (original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/mime/TestMimeTypes.java Sun Nov 4 01:12:30 2007
@@ -68,7 +68,7 @@
assertEquals(repo.getMimeType("test.pdF"), type);
}
- public void testLoadMimeTypes() {
+ public void testLoadMimeTypes() throws MimeTypeException {
assertNotNull(repo.forName("application/octet-stream"));
assertNotNull(repo.forName("text/x-tex"));
}
@@ -136,6 +136,5 @@
return type;
}
-
}