You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/12/05 13:38:21 UTC
svn commit: r1042335 - in /tika/trunk/tika-core/src/main/java/org/apache/tika: Tika.java mime/MimeTypes.java
Author: jukka
Date: Sun Dec 5 12:38:21 2010
New Revision: 1042335
URL: http://svn.apache.org/viewvc?rev=1042335&view=rev
Log:
TIKA-566: Better convenience methods for type detection
Add a few new Tika.detect() convenience methods.
Deprecate the MimeTypes.getMimeType() methods in favor of the Tika facade.
Fix the detection logic in getMimeType(String, byte[])
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java?rev=1042335&r1=1042334&r2=1042335&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java Sun Dec 5 12:38:21 2010
@@ -138,6 +138,30 @@ public class Tika {
/**
* Detects the media type of the given document. The type detection is
+ * based on the content of the given document stream and the name of the
+ * document.
+ * <p>
+ * If the document stream supports the
+ * {@link InputStream#markSupported() mark feature}, then the stream is
+ * marked and reset to the original position before this method returns.
+ * Only a limited number of bytes are read from the stream.
+ * <p>
+ * The given document stream is <em>not</em> closed by this method.
+ *
+ * @since Apache Tika 0.9
+ * @param stream the document stream
+ * @param name document name
+ * @return detected media type
+ * @throws IOException if the stream can not be read
+ */
+ public String detect(InputStream stream, String name) throws IOException {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, name);
+ return detect(stream, metadata);
+ }
+
+ /**
+ * Detects the media type of the given document. The type detection is
* based on the content of the given document stream.
* <p>
* If the document stream supports the
@@ -156,6 +180,49 @@ public class Tika {
}
/**
+ * Detects the media type of the given document. The type detection is
+ * based on the first few bytes of a document and the document name.
+ * <p>
+ * For best results at least a few kilobytes of the document data
+ * are needed. See also the other detect() methods for better
+ * alternatives when you have more than just the document prefix
+ * available for type detection.
+ *
+ * @since Apache Tika 0.9
+ * @param prefix first few bytes of the document
+ * @param name document name
+ * @return detected media type
+ */
+ public String detect(byte[] prefix, String name) {
+ try {
+ return detect(TikaInputStream.get(prefix), name);
+ } catch (IOException e) {
+ throw new IllegalStateException("Unexpected IOException", e);
+ }
+ }
+
+ /**
+ * Detects the media type of the given document. The type detection is
+ * based on the first few bytes of a document.
+ * <p>
+ * For best results at least a few kilobytes of the document data
+ * are needed. See also the other detect() methods for better
+ * alternatives when you have more than just the document prefix
+ * available for type detection.
+ *
+ * @since Apache Tika 0.9
+ * @param prefix first few bytes of the document
+ * @return detected media type
+ */
+ public String detect(byte[] prefix) {
+ try {
+ return detect(TikaInputStream.get(prefix));
+ } catch (IOException e) {
+ throw new IllegalStateException("Unexpected IOException", e);
+ }
+ }
+
+ /**
* Detects the media type of the given file. The type detection is
* based on the document content and a potential known file extension.
* <p>
@@ -203,10 +270,8 @@ public class Tika {
* @return detected media type
*/
public String detect(String name) {
- Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, name);
try {
- return detect(null, metadata);
+ return detect((InputStream) null, name);
} catch (IOException e) {
throw new IllegalStateException("Unexpected IOException", e);
}
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java?rev=1042335&r1=1042334&r2=1042335&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java Sun Dec 5 12:38:21 2010
@@ -33,12 +33,11 @@ import java.util.SortedSet;
import java.util.TreeSet;
import javax.xml.namespace.QName;
-import javax.xml.parsers.ParserConfigurationException;
+import org.apache.tika.Tika;
import org.apache.tika.detect.Detector;
import org.apache.tika.detect.XmlRootExtractor;
import org.apache.tika.metadata.Metadata;
-import org.xml.sax.SAXException;
/**
* This class is a MimeType repository. It gathers a set of MimeTypes and
@@ -157,6 +156,7 @@ public final class MimeTypes implements
/**
* Find the Mime Content Type of a file.
*
+ * @deprecated Use the {@link Tika#detect(File)} method
* @param file
* to analyze.
* @return the Mime Content Type of the specified file, or <code>null</code>
@@ -169,6 +169,7 @@ public final class MimeTypes implements
/**
* Find the Mime Content Type of a document from its URL.
*
+ * @deprecated Use the {@link Tika#detect(URL)} method
* @param url
* of the document to analyze.
* @return the Mime Content Type of the specified document URL, or
@@ -182,6 +183,7 @@ public final class MimeTypes implements
* Find the Mime Content Type of a document from its name.
* Returns application/octet-stream if no better match is found.
*
+ * @deprecated Use the {@link Tika#detect(String)} method
* @param name of the document to analyze.
* @return the Mime Content Type of the specified document name
*/
@@ -206,6 +208,7 @@ public final class MimeTypes implements
* The given byte array is expected to be at least {@link #getMinLength()}
* long, or shorter only if the document stream itself is shorter.
*
+ * @deprecated Use the {@link Tika#detect(byte[])} method
* @param data first few bytes of a document stream
* @return matching MIME type
*/
@@ -267,7 +270,7 @@ public final class MimeTypes implements
* Returns the MIME type that best matches the first few bytes of the
* given document stream.
*
- * @see #getMimeType(byte[])
+ * @deprecated Use the {@link Tika#detect(InputStream)} method
* @param stream document stream
* @return matching MIME type, or <code>null</code> if no match is found
* @throws IOException if the stream can be read
@@ -311,6 +314,9 @@ public final class MimeTypes implements
return shorter;
}
+ /**
+ * @deprecated Use the {@link Tika#detect(InputStream, Metadata)} method
+ */
public String getType(String typeName, String url, byte[] data) {
try {
Metadata metadata = new Metadata();
@@ -333,6 +339,7 @@ public final class MimeTypes implements
* from the header, guesses the MIME type from the URL extension
* (e.g. "pdf).
*
+ * @deprecated Use the {@link Tika#detect(URL)} method
* @param url URL of the document
* @return type of the document
* @throws IOException if the document can not be accessed
@@ -357,6 +364,8 @@ public final class MimeTypes implements
* based on the file name</li>
* </ol>
*
+ *
+ * @deprecated Use the {@link Tika#detect(byte[], String)} method
* @param name
* of the document to analyze.
* @param data
@@ -367,21 +376,25 @@ public final class MimeTypes implements
*/
public MimeType getMimeType(String name, byte[] data) {
// First, try to get the mime-type from the content
- MimeType mimeType = getMimeType(data);
+ MimeType dataType = getMimeType(data);
- // If no mime-type found, then try to get the mime-type from
- // the document name
- if (mimeType == null) {
- mimeType = getMimeType(name);
- }
+ // Then, try to get the mime-type from the document name
+ MimeType nameType = getMimeType(name);
- return mimeType;
+ // Use the more specific of the two types
+ if (registry.isSpecializationOf(
+ nameType.getType(), dataType.getType())) {
+ return nameType;
+ } else {
+ return dataType;
+ }
}
/**
* Returns the MIME type that best matches the given document name and
* the first few bytes of the given document stream.
*
+ * @deprecated Use the {@link Tika#detect(InputStream,String)} method
* @see #getMimeType(String, byte[])
* @param name document name
* @param stream document stream