You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/09/27 20:35:24 UTC
svn commit: r819367 -
/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
Author: jukka
Date: Sun Sep 27 18:35:24 2009
New Revision: 819367
URL: http://svn.apache.org/viewvc?rev=819367&view=rev
Log:
TIKA-269: Ease of use -facade for Tika
Add a set of detect() methods.
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java?rev=819367&r1=819366&r2=819367&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java Sun Sep 27 18:35:24 2009
@@ -16,6 +16,7 @@
*/
package org.apache.tika;
+import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
@@ -27,6 +28,7 @@
import java.util.Map;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
@@ -39,22 +41,31 @@
/**
* Facade class for accessing Tika functionality. This class hides much of
* the underlying complexity of the lower level Tika classes and provides
- * simple methods for many common parsing operations.
+ * simple methods for many common parsing and type detection operations.
*
* @since Apache Tika 0.5
+ * @see Parser
+ * @see Detector
*/
public class Tika {
/**
+ * The detector instance used by this facade.
+ */
+ private final Detector detector;
+
+ /**
* The parser instance used by this facade.
*/
private final Parser parser;
/**
* Creates a Tika facade using the given configuration.
- * @param config
+ *
+ * @param config Tika configuration
*/
public Tika(TikaConfig config) {
+ this.detector = config.getMimeRepository(); // the configured MIME repository serves as the detector
this.parser = new AutoDetectParser(config);
}
@@ -66,6 +77,119 @@
}
/**
+ * Detects the media type of the given document. The type detection is
+ * based on the content of the given document stream and any given
+ * document metadata. The document stream can be <code>null</code>,
+ * in which case only the given document metadata is used for type
+ * detection.
+ * <p>
+ * If the document stream supports the
+ * {@link InputStream#markSupported() mark feature}, then the stream is
+ * marked and reset to the original position before this method returns.
+ * Only a limited number of bytes are read from the stream.
+ * <p>
+ * The given document stream is <em>not</em> closed by this method.
+ * <p>
+ * Unlike in the {@link #parse(InputStream, Metadata)} method, the
+ * given document metadata is <em>not</em> modified by this method.
+ *
+ * @param stream the document stream, or <code>null</code>
+ * @param metadata document metadata
+ * @return detected media type
+ * @throws IOException if the stream can not be read
+ */
+ public String detect(InputStream stream, Metadata metadata)
+ throws IOException {
+ if (stream == null || stream.markSupported()) { // null stream: metadata-only detection
+ return detector.detect(stream, metadata).toString();
+ } else {
+ return detector.detect( // buffer so that the detector can mark/reset the stream
+ new BufferedInputStream(stream), metadata).toString();
+ }
+ }
+
+ /**
+ * Detects the media type of the given document. The type detection is
+ * based on the content of the given document stream.
+ * <p>
+ * If the document stream supports the
+ * {@link InputStream#markSupported() mark feature}, then the stream is
+ * marked and reset to the original position before this method returns.
+ * Only a limited number of bytes are read from the stream.
+ * <p>
+ * The given document stream is <em>not</em> closed by this method.
+ *
+ * @param stream the document stream
+ * @return detected media type
+ * @throws IOException if the stream can not be read
+ */
+ public String detect(InputStream stream) throws IOException {
+ return detect(stream, new Metadata()); // empty metadata: no extra hints are supplied
+ }
+
+ /**
+ * Detects the media type of the given file. The type detection is
+ * based on the document content and a potential known file extension.
+ * <p>
+ * Use the {@link #detect(String)} method when you want to detect the
+ * type of the document without actually accessing the file.
+ *
+ * @param file the file
+ * @return detected media type
+ * @throws FileNotFoundException if the file does not exist
+ * @throws IOException if the file can not be read
+ */
+ public String detect(File file) throws FileNotFoundException, IOException {
+ InputStream stream = new FileInputStream(file); // opened here, so it is closed here as well
+ try {
+ return detect(stream, getFileMetadata(file)); // getFileMetadata is defined elsewhere in this class
+ } finally {
+ stream.close(); // detect(InputStream, Metadata) does not close the stream itself
+ }
+ }
+
+ /**
+ * Detects the media type of the resource at the given URL. The type
+ * detection is based on the document content and a potential known
+ * file extension included in the URL.
+ * <p>
+ * Use the {@link #detect(String)} method when you want to detect the
+ * type of the document without actually accessing the URL.
+ *
+ * @param url the URL of the resource
+ * @return detected media type
+ * @throws IOException if the resource can not be read
+ */
+ public String detect(URL url) throws IOException {
+ InputStream stream = url.openStream(); // opened here, so it is closed here as well
+ try {
+ return detect(stream, getUrlMetadata(url)); // getUrlMetadata is defined elsewhere in this class
+ } finally {
+ stream.close(); // detect(InputStream, Metadata) does not close the stream itself
+ }
+ }
+
+ /**
+ * Detects the media type of a document with the given file name.
+ * The type detection is based on known file name extensions.
+ * <p>
+ * The given name can also be a URL or a full file path. In such cases
+ * only the file name part of the string is used for type detection.
+ *
+ * @param name the file name of the document
+ * @return detected media type
+ */
+ public String detect(String name) {
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, name); // the name is the only detection hint passed on
+ try {
+ return detect(null, metadata); // NOTE(review): relies on detect(InputStream, Metadata) accepting a null stream
+ } catch (IOException e) {
+ throw new IllegalStateException("Unexpected IOException", e); // rethrown unchecked; this method declares no IOException
+ }
+ }
+
+ /**
* Parses the given document and returns the extracted text content.
* Input metadata like a file name or a content type hint can be passed
* in the given metadata instance. Metadata information extracted from