You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/12/05 13:38:21 UTC

svn commit: r1042335 - in /tika/trunk/tika-core/src/main/java/org/apache/tika: Tika.java mime/MimeTypes.java

Author: jukka
Date: Sun Dec  5 12:38:21 2010
New Revision: 1042335

URL: http://svn.apache.org/viewvc?rev=1042335&view=rev
Log:
TIKA-566: Better convenience methods for type detection

Add a few new Tika.detect() convenience methods.
Deprecate the MimeTypes.getMimeType() methods in favor of the Tika facade.
Fix the detection logic in getMimeType(String, byte[])

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java?rev=1042335&r1=1042334&r2=1042335&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java Sun Dec  5 12:38:21 2010
@@ -138,6 +138,30 @@ public class Tika {
 
     /**
      * Detects the media type of the given document. The type detection is
+     * based on the content of the given document stream and the name of the
+     * document.
+     * <p>
+     * If the document stream supports the
+     * {@link InputStream#markSupported() mark feature}, then the stream is
+     * marked and reset to the original position before this method returns.
+     * Only a limited number of bytes are read from the stream.
+     * <p>
+     * The given document stream is <em>not</em> closed by this method.
+     *
+     * @since Apache Tika 0.9
+     * @param stream the document stream
+     * @param name document name
+     * @return detected media type
+     * @throws IOException if the stream can not be read
+     */
+    public String detect(InputStream stream, String name) throws IOException {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.RESOURCE_NAME_KEY, name); // carry the document name as metadata
+        return detect(stream, metadata); // delegate to the (stream, metadata) overload
+    }
+
+    /**
+     * Detects the media type of the given document. The type detection is
      * based on the content of the given document stream.
      * <p>
      * If the document stream supports the
@@ -156,6 +180,49 @@ public class Tika {
     }
 
     /**
+     * Detects the media type of the given document. The type detection is
+     * based on the first few bytes of a document and the document name.
+     * <p>
+     * For best results at least a few kilobytes of the document data
+     * are needed. See also the other detect() methods for better
+     * alternatives when you have more than just the document prefix
+     * available for type detection.
+     *
+     * @since Apache Tika 0.9
+     * @param prefix first few bytes of the document
+     * @param name document name
+     * @return detected media type
+     */
+    public String detect(byte[] prefix, String name) {
+        try {
+            return detect(TikaInputStream.get(prefix), name); // wrap the bytes, reuse the stream+name path
+        } catch (IOException e) { // this signature declares no IOException, so rethrow unchecked
+            throw new IllegalStateException("Unexpected IOException", e);
+        }
+    }
+
+    /**
+     * Detects the media type of the given document. The type detection is
+     * based on the first few bytes of a document.
+     * <p>
+     * For best results at least a few kilobytes of the document data
+     * are needed. See also the other detect() methods for better
+     * alternatives when you have more than just the document prefix
+     * available for type detection.
+     *
+     * @since Apache Tika 0.9
+     * @param prefix first few bytes of the document
+     * @return detected media type
+     */
+    public String detect(byte[] prefix) {
+        try {
+            return detect(TikaInputStream.get(prefix)); // wrap the bytes, reuse the stream-only path
+        } catch (IOException e) { // this signature declares no IOException, so rethrow unchecked
+            throw new IllegalStateException("Unexpected IOException", e);
+        }
+    }
+
+    /**
      * Detects the media type of the given file. The type detection is
      * based on the document content and a potential known file extension.
      * <p>
@@ -203,10 +270,8 @@ public class Tika {
      * @return detected media type
      */
     public String detect(String name) {
-        Metadata metadata = new Metadata();
-        metadata.set(Metadata.RESOURCE_NAME_KEY, name);
         try {
-            return detect(null, metadata);
+            return detect((InputStream) null, name);
         } catch (IOException e) {
             throw new IllegalStateException("Unexpected IOException", e);
         }

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=1042335&r1=1042334&r2=1042335&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java Sun Dec  5 12:38:21 2010
@@ -33,12 +33,11 @@ import java.util.SortedSet;
 import java.util.TreeSet;
 
 import javax.xml.namespace.QName;
-import javax.xml.parsers.ParserConfigurationException;
 
+import org.apache.tika.Tika;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.detect.XmlRootExtractor;
 import org.apache.tika.metadata.Metadata;
-import org.xml.sax.SAXException;
 
 /**
  * This class is a MimeType repository. It gathers a set of MimeTypes and
@@ -157,6 +156,7 @@ public final class MimeTypes implements 
     /**
      * Find the Mime Content Type of a file.
      *
+     * @deprecated Use the {@link Tika#detect(File)} method
      * @param file
      *            to analyze.
      * @return the Mime Content Type of the specified file, or <code>null</code>
@@ -169,6 +169,7 @@ public final class MimeTypes implements 
     /**
      * Find the Mime Content Type of a document from its URL.
      *
+     * @deprecated Use the {@link Tika#detect(URL)} method
      * @param url
      *            of the document to analyze.
      * @return the Mime Content Type of the specified document URL, or
@@ -182,6 +183,7 @@ public final class MimeTypes implements 
      * Find the Mime Content Type of a document from its name.
      * Returns application/octet-stream if no better match is found.
      *
+     * @deprecated Use the {@link Tika#detect(String)} method
      * @param name of the document to analyze.
      * @return the Mime Content Type of the specified document name
      */
@@ -206,6 +208,7 @@ public final class MimeTypes implements 
      * The given byte array is expected to be at least {@link #getMinLength()}
      * long, or shorter only if the document stream itself is shorter.
      *
+     * @deprecated Use the {@link Tika#detect(byte[])} method
      * @param data first few bytes of a document stream
      * @return matching MIME type
      */
@@ -267,7 +270,7 @@ public final class MimeTypes implements 
      * Returns the MIME type that best matches the first few bytes of the
      * given document stream.
      *
-     * @see #getMimeType(byte[])
+     * @deprecated Use the {@link Tika#detect(InputStream)} method
      * @param stream document stream
      * @return matching MIME type, or <code>null</code> if no match is found
      * @throws IOException if the stream can be read
@@ -311,6 +314,9 @@ public final class MimeTypes implements 
         return shorter;
     }
 
+    /**
+     * @deprecated Use the {@link Tika#detect(InputStream, Metadata)} method
+     */
     public String getType(String typeName, String url, byte[] data) {
         try {
             Metadata metadata = new Metadata();
@@ -333,6 +339,7 @@ public final class MimeTypes implements 
      * from the header, guesses the MIME type from the URL extension
      * (e.g. "pdf).
      *
+     * @deprecated Use the {@link Tika#detect(URL)} method
      * @param url URL of the document
      * @return type of the document
      * @throws IOException if the document can not be accessed
@@ -357,6 +364,8 @@ public final class MimeTypes implements 
      * based on the file name</li>
      * </ol>
      *
+     *
+     * @deprecated Use the {@link Tika#detect(byte[], String)} method
      * @param name
      *            of the document to analyze.
      * @param data
@@ -367,21 +376,25 @@ public final class MimeTypes implements 
      */
     public MimeType getMimeType(String name, byte[] data) {
         // First, try to get the mime-type from the content
-        MimeType mimeType = getMimeType(data);
+        MimeType dataType = getMimeType(data);
 
-        // If no mime-type found, then try to get the mime-type from
-        // the document name
-        if (mimeType == null) {
-            mimeType = getMimeType(name);
-        }
+        // Then, regardless of the content result, look up the type by document name
+        MimeType nameType = getMimeType(name);
 
-        return mimeType;
+        // The name-based type wins only if it specializes the content-based type
+        if (registry.isSpecializationOf(
+                nameType.getType(), dataType.getType())) {
+            return nameType;
+        } else {
+            return dataType;
+        }
     }
 
     /**
      * Returns the MIME type that best matches the given document name and
      * the first few bytes of the given document stream.
      *
+     * @deprecated Use the {@link Tika#detect(InputStream,String)} method
      * @see #getMimeType(String, byte[])
      * @param name document name
      * @param stream document stream