You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/04/14 19:56:37 UTC

svn commit: r934061 - in /lucene/tika/trunk/tika-core/src/main/java/org/apache/tika: io/ProxyInputStream.java io/TikaInputStream.java metadata/MetadataHelper.java

Author: jukka
Date: Wed Apr 14 17:56:37 2010
New Revision: 934061

URL: http://svn.apache.org/viewvc?rev=934061&view=rev
Log:
TIKA-153: Allow passing of files or memory buffers to parsers

First take on the proposed TikaInputStream class. Work in progress...

Added:
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java   (with props)
Modified:
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/ProxyInputStream.java
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/ProxyInputStream.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/ProxyInputStream.java?rev=934061&r1=934060&r2=934061&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/ProxyInputStream.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/ProxyInputStream.java Wed Apr 14 17:56:37 2010
@@ -28,6 +28,9 @@ import java.io.InputStream;
  * It is an alternative base class to FilterInputStream
  * to increase reusability, because FilterInputStream changes the
  * methods being called, such as read(byte[]) to read(byte[], int, int).
+ * <p>
+ * See the protected methods for ways in which a subclass can easily decorate
+ * a stream with custom pre-, post- or error processing functionality.
  *
  * @author Stephen Colebourne
  * @version $Id$
@@ -52,7 +55,10 @@ public abstract class ProxyInputStream e
     @Override
     public int read() throws IOException {
         try {
-            return in.read();
+            beforeRead(1);
+            int b = in.read();
+            afterRead(b != -1 ? 1 : -1);
+            return b;
         } catch (IOException e) {
             handleIOException(e);
             return -1;
@@ -68,7 +74,10 @@ public abstract class ProxyInputStream e
     @Override
     public int read(byte[] bts) throws IOException {
         try {
-            return in.read(bts);
+            beforeRead(bts.length);
+            int n = in.read(bts);
+            afterRead(n);
+            return n;
         } catch (IOException e) {
             handleIOException(e);
             return -1;
@@ -86,7 +95,10 @@ public abstract class ProxyInputStream e
     @Override
     public int read(byte[] bts, int off, int len) throws IOException {
         try {
-            return in.read(bts, off, len);
+            beforeRead(len);
+            int n = in.read(bts, off, len);
+            afterRead(n);
+            return n;
         } catch (IOException e) {
             handleIOException(e);
             return -1;
@@ -96,7 +108,7 @@ public abstract class ProxyInputStream e
     /**
      * Invokes the delegate's <code>skip(long)</code> method.
      * @param ln the number of bytes to skip
-     * @return the number of bytes to skipped or -1 if the end of stream
+     * @return the actual number of bytes skipped
      * @throws IOException if an I/O error occurs
      */
     @Override
@@ -168,6 +180,46 @@ public abstract class ProxyInputStream e
         return in.markSupported();
     }
 
+    /**
+     * Hook invoked by the read methods before the call is proxied. The
+     * argument is the number of bytes the caller asked for: 1 for
+     * {@link #read()}, the buffer length for {@link #read(byte[])} and
+     * <code>len</code> for {@link #read(byte[], int, int)}.
+     * <p>
+     * Subclasses can override this method to add common pre-processing
+     * without overriding every read method; the default does nothing.
+     * An IOException thrown here is caught by the calling read method and
+     * passed to {@link #handleIOException(IOException)}.
+     * <p>
+     * This method is <em>not</em> called from {@link #skip(long)} or
+     * {@link #reset()}; override those explicitly to pre-process them.
+     *
+     * @since Commons IO 2.0
+     * @param n number of bytes that the caller asked to be read
+     * @throws IOException if the pre-processing fails
+     */
+    protected void beforeRead(int n) throws IOException {
+    }
+
+    /**
+     * Hook invoked by the read methods after the proxied call has returned
+     * without throwing. The argument is the number of bytes read, or -1 if
+     * the end of stream was reached; {@link #read()} passes 1 or -1.
+     * <p>
+     * Subclasses can override this method to add common post-processing
+     * without overriding every read method; the default does nothing.
+     * An IOException thrown here is caught by the calling read method and
+     * passed to {@link #handleIOException(IOException)}.
+     * <p>
+     * This method is <em>not</em> called from {@link #skip(long)} or
+     * {@link #reset()}; override those explicitly to post-process them.
+     *
+     * @since Commons IO 2.0
+     * @param n number of bytes read, or -1 if the end of stream was reached
+     * @throws IOException if the post-processing fails
+     */
+    protected void afterRead(int n) throws IOException {
+    }
 
     /**
      * Handle any IOExceptions thrown.
@@ -176,6 +228,7 @@ public abstract class ProxyInputStream e
      * handling. The default behaviour is to re-throw the exception.
      * @param e The IOException thrown
      * @throws IOException if an I/O error occurs
+     * @since Commons IO 2.0
      */
     protected void handleIOException(IOException e) throws IOException {
         throw e;

Added: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java?rev=934061&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java (added)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java Wed Apr 14 17:56:37 2010
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.io;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+public class TikaInputStream extends ProxyInputStream {
+
+    public static TikaInputStream get(InputStream stream) {
+        if (stream instanceof TikaInputStream) {
+            return (TikaInputStream) stream;
+        } else {
+            return new TikaInputStream(stream);
+        }
+    }
+
+    private File file;
+
+    private boolean temporary;
+
+    private long position = 0;
+
+    public TikaInputStream(InputStream stream) {
+        super(stream);
+        this.file = null;
+        this.temporary = true;
+    }
+
+    public TikaInputStream(File file) {
+        super(null);
+        this.file = file;
+        this.temporary = false;
+    }
+
+    public File getFile() throws IOException {
+        if (file == null) {
+            if (in == null) {
+                throw new IOException("Stream has already been read");
+            } else if (position > 0) {
+                throw new IOException("Stream is already being read");
+            } else {
+                file = File.createTempFile("apache-tika-", ".tmp");
+                OutputStream out = new FileOutputStream(file);
+                try {
+                    IOUtils.copy(in, out);
+                } finally {
+                    out.close();
+                }
+                in.close();
+                in = null;
+            }
+        }
+        return file;
+    }
+
+    @Override
+    public void close() throws IOException {
+        if (in != null) {
+            in.close();
+            in = null;
+        }
+        if (file != null) {
+            if (temporary) {
+                file.delete();
+            }
+            file = null;
+        }
+    }
+
+    @Override
+    protected void beforeRead(int n) throws IOException {
+        if (in == null) {
+            if (file != null) {
+                in = new FileInputStream(file);
+            } else {
+                throw new IOException("End of the stream reached");
+            }
+        }
+    }
+
+    @Override
+    protected void afterRead(int n) throws IOException {
+        if (n != -1) {
+            position += n;
+        } else {
+            close();
+        }
+    }
+
+}

Propchange: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java?rev=934061&r1=934060&r2=934061&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java Wed Apr 14 17:56:37 2010
@@ -21,6 +21,8 @@ import java.io.InputStream;
 import java.net.URL;
 import java.net.URLConnection;
 
+import org.apache.tika.io.TikaInputStream;
+
 /**
  * Collection of static helper methods for handling metadata.
  *
@@ -68,7 +70,7 @@ public class MetadataHelper {
             metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
         }
 
-        return connection.getInputStream();
+        return new TikaInputStream(connection.getInputStream());
     }
 
 }