You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/09/06 22:18:10 UTC

svn commit: r993143 - /tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java

Author: jukka
Date: Mon Sep  6 20:18:10 2010
New Revision: 993143

URL: http://svn.apache.org/viewvc?rev=993143&view=rev
Log:
TIKA-153: Allow passing of files or memory buffers to parsers

Improved TikaInputStream javadocs. Minor fixes and improvements.

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java?rev=993143&r1=993142&r2=993143&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java Mon Sep  6 20:18:10 2010
@@ -20,6 +20,7 @@ import java.io.BufferedInputStream;
 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
@@ -32,6 +33,24 @@ import java.net.URLConnection;
 import org.apache.tika.metadata.Metadata;
 
 /**
+ * Input stream with extended capabilities. The purpose of this class is
+ * to allow files and other resources and information to be associated with
+ * the {@link InputStream} instance passed through the
+ * {@link org.apache.tika.parser.Parser} interface and other similar APIs.
+ * <p>
+ * TikaInputStream instances can be created using the various static
+ * <code>get()</code> factory methods. Most of these methods take an optional
+ * {@link Metadata} argument that is then filled with the available input
+ * metadata from the given resource. The created TikaInputStream instance
+ * keeps track of the original resource used to create it, while behaving
+ * otherwise just like a normal, buffered {@link InputStream}.
+ * A TikaInputStream instance is also guaranteed to support the
+ * {@link #mark(int)} feature.
+ * <p>
+ * Code that wants to access the underlying file or other resources
+ * associated with a TikaInputStream should first use the
+ * {@link #get(InputStream)} factory method to cast or wrap a given
+ * {@link InputStream} into a TikaInputStream instance.
  *
  * @since Apache Tika 0.8
  */
@@ -67,17 +86,55 @@ public class TikaInputStream extends Pro
         }
     }
 
-    public static TikaInputStream get(byte[] data) throws IOException {
+    /**
+     * Creates a TikaInputStream from the given array of bytes.
+     *
+     * @param data input data
+     * @return a TikaInputStream instance
+     * @throws NullPointerException if the given data is <code>null</code>
+     */
+    public static TikaInputStream get(byte[] data) {
+        return get(data, new Metadata());
+    }
+
+    /**
+     * Creates a TikaInputStream from the given array of bytes. The length of
+     * the array is stored as input metadata in the given metadata instance.
+     *
+     * @param data input data
+     * @param metadata metadata instance
+     * @return a TikaInputStream instance
+     * @throws NullPointerException if data or metadata is <code>null</code>
+     */
+    public static TikaInputStream get(byte[] data, Metadata metadata) {
+        metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(data.length));
+
         return new TikaInputStream(
                 new ByteArrayInputStream(data), null, data.length);
     }
 
-    public static TikaInputStream get(File file) throws IOException {
+    /**
+     * Creates a TikaInputStream from the given file.
+     *
+     * @param file input file
+     * @return a TikaInputStream instance
+     * @throws FileNotFoundException if the file does not exist
+     */
+    public static TikaInputStream get(File file) throws FileNotFoundException {
         return get(file, new Metadata());
     }
 
+    /**
+     * Creates a TikaInputStream from the given file. The file name and
+     * length are stored as input metadata in the given metadata instance.
+     *
+     * @param file input file
+     * @param metadata metadata instance
+     * @return a TikaInputStream instance
+     * @throws FileNotFoundException if the file does not exist
+     */
     public static TikaInputStream get(File file, Metadata metadata)
-            throws IOException {
+            throws FileNotFoundException {
         metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName());
         metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.length()));
 
@@ -87,15 +144,25 @@ public class TikaInputStream extends Pro
     }
 
     /**
-     * 
-     * @param uri
-     * @return
-     * @throws IOException
+     * Creates a TikaInputStream from the resource at the given URI.
+     *
+     * @param uri resource URI
+     * @return a TikaInputStream instance
+     * @throws IOException if the resource cannot be accessed
      */
     public static TikaInputStream get(URI uri) throws IOException {
         return get(uri, new Metadata());
     }
 
+    /**
+     * Creates a TikaInputStream from the resource at the given URI. The
+     * available input metadata is stored in the given metadata instance.
+     *
+     * @param uri resource URI
+     * @param metadata metadata instance
+     * @return a TikaInputStream instance
+     * @throws IOException if the resource cannot be accessed
+     */
     public static TikaInputStream get(URI uri, Metadata metadata)
             throws IOException {
         // Special handling for file:// URIs
@@ -109,10 +176,26 @@ public class TikaInputStream extends Pro
         return get(uri.toURL(), metadata);
     }
 
+    /**
+     * Creates a TikaInputStream from the resource at the given URL.
+     *
+     * @param url resource URL
+     * @return a TikaInputStream instance
+     * @throws IOException if the resource cannot be accessed
+     */
     public static TikaInputStream get(URL url) throws IOException {
         return get(url, new Metadata());
     }
 
+    /**
+     * Creates a TikaInputStream from the resource at the given URL. The
+     * available input metadata is stored in the given metadata instance.
+     *
+     * @param url resource URL
+     * @param metadata metadata instance
+     * @return a TikaInputStream instance
+     * @throws IOException if the resource cannot be accessed
+     */
     public static TikaInputStream get(URL url, Metadata metadata)
             throws IOException {
         // Special handling for file:// URLs
@@ -142,7 +225,7 @@ public class TikaInputStream extends Pro
 
         String encoding = connection.getContentEncoding();
         if (encoding != null) {
-            metadata.set(Metadata.CONTENT_TYPE, encoding);
+            metadata.set(Metadata.CONTENT_ENCODING, encoding);
         }
 
         int length = connection.getContentLength();
@@ -170,6 +253,9 @@ public class TikaInputStream extends Pro
      */
     private boolean temporary;
 
+    /**
+     * Total length of the stream, or -1 if unknown.
+     */
     private long length;
 
     /**
@@ -190,10 +276,12 @@ public class TikaInputStream extends Pro
     private Object openContainer;
 
     /**
-     * 
+     * Creates a TikaInputStream instance. This private constructor is used
+     * by the static factory methods based on the available information.
+     *
      * @param stream <em>buffered</em> stream (must support the mark feature)
-     * @param file
-     * @param length
+     * @param file the file that contains the stream, or <code>null</code>
+     * @param length total length of the stream, or -1 if unknown
      */
     private TikaInputStream(InputStream stream, File file, long length) {
         super(stream);