You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/04/14 19:56:37 UTC
svn commit: r934061 - in
/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika:
io/ProxyInputStream.java io/TikaInputStream.java metadata/MetadataHelper.java
Author: jukka
Date: Wed Apr 14 17:56:37 2010
New Revision: 934061
URL: http://svn.apache.org/viewvc?rev=934061&view=rev
Log:
TIKA-153: Allow passing of files or memory buffers to parsers
First take on the proposed TikaInputStream class. Work in progress...
Added:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java (with props)
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/ProxyInputStream.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/ProxyInputStream.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/ProxyInputStream.java?rev=934061&r1=934060&r2=934061&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/ProxyInputStream.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/ProxyInputStream.java Wed Apr 14 17:56:37 2010
@@ -28,6 +28,9 @@ import java.io.InputStream;
* It is an alternative base class to FilterInputStream
* to increase reusability, because FilterInputStream changes the
* methods being called, such as read(byte[]) to read(byte[], int, int).
+ * <p>
+ * See the protected methods for ways in which a subclass can easily decorate
+ * a stream with custom pre-, post- or error processing functionality.
*
* @author Stephen Colebourne
* @version $Id$
@@ -52,7 +55,10 @@ public abstract class ProxyInputStream e
@Override
public int read() throws IOException {
try {
- return in.read();
+ beforeRead(1);
+ int b = in.read();
+ afterRead(b != -1 ? 1 : -1);
+ return b;
} catch (IOException e) {
handleIOException(e);
return -1;
@@ -68,7 +74,10 @@ public abstract class ProxyInputStream e
@Override
public int read(byte[] bts) throws IOException {
try {
- return in.read(bts);
+ beforeRead(bts.length);
+ int n = in.read(bts);
+ afterRead(n);
+ return n;
} catch (IOException e) {
handleIOException(e);
return -1;
@@ -86,7 +95,10 @@ public abstract class ProxyInputStream e
@Override
public int read(byte[] bts, int off, int len) throws IOException {
try {
- return in.read(bts, off, len);
+ beforeRead(len);
+ int n = in.read(bts, off, len);
+ afterRead(n);
+ return n;
} catch (IOException e) {
handleIOException(e);
return -1;
@@ -96,7 +108,7 @@ public abstract class ProxyInputStream e
/**
* Invokes the delegate's <code>skip(long)</code> method.
* @param ln the number of bytes to skip
- * @return the number of bytes to skipped or -1 if the end of stream
+ * @return the actual number of bytes skipped
* @throws IOException if an I/O error occurs
*/
@Override
@@ -168,6 +180,46 @@ public abstract class ProxyInputStream e
return in.markSupported();
}
+ /**
+ * Invoked by the read methods before the call is proxied. The number
+ * of bytes that the caller wanted to read (1 for the {@link #read()}
+ * method, buffer length for {@link #read(byte[])}, etc.) is given as
+ * an argument.
+ * <p>
+ * Subclasses can override this method to add common pre-processing
+ * functionality without having to override all the read methods.
+ * The default implementation does nothing.
+ * <p>
+ * Note this method is <em>not</em> called from {@link #skip(long)} or
+ * {@link #reset()}. You need to explicitly override those methods if
+ * you want to add pre-processing steps also to them.
+ * <p>
+ * The read methods call this hook inside their <code>try</code> block,
+ * so an exception thrown here is passed to
+ * {@link #handleIOException(IOException)} and the proxied read is not
+ * attempted.
+ *
+ * @since Commons IO 2.0
+ * @param n number of bytes that the caller asked to be read
+ * @throws IOException if the pre-processing fails
+ */
+ protected void beforeRead(int n) throws IOException {
+ }
+
+ /**
+ * Invoked by the read methods after the proxied call has returned
+ * successfully. The number of bytes returned to the caller (or -1 if
+ * the end of stream was reached) is given as an argument.
+ * <p>
+ * Subclasses can override this method to add common post-processing
+ * functionality without having to override all the read methods.
+ * The default implementation does nothing.
+ * <p>
+ * Note this method is <em>not</em> called from {@link #skip(long)} or
+ * {@link #reset()}. You need to explicitly override those methods if
+ * you want to add post-processing steps also to them.
+ * <p>
+ * This hook is not invoked when the proxied call throws. An exception
+ * thrown by this method itself is passed to
+ * {@link #handleIOException(IOException)}, like one thrown by the read.
+ *
+ * @since Commons IO 2.0
+ * @param n number of bytes read, or -1 if the end of stream was reached
+ * @throws IOException if the post-processing fails
+ */
+ protected void afterRead(int n) throws IOException {
+ }
/**
* Handle any IOExceptions thrown.
@@ -176,6 +228,7 @@ public abstract class ProxyInputStream e
* handling. The default behaviour is to re-throw the exception.
* @param e The IOException thrown
* @throws IOException if an I/O error occurs
+ * @since Commons IO 2.0
*/
protected void handleIOException(IOException e) throws IOException {
throw e;
Added: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java?rev=934061&view=auto
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java (added)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java Wed Apr 14 17:56:37 2010
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.io;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+public class TikaInputStream extends ProxyInputStream {
+
+ public static TikaInputStream get(InputStream stream) {
+ if (stream instanceof TikaInputStream) {
+ return (TikaInputStream) stream;
+ } else {
+ return new TikaInputStream(stream);
+ }
+ }
+
+ private File file;
+
+ private boolean temporary;
+
+ private long position = 0;
+
+ public TikaInputStream(InputStream stream) {
+ super(stream);
+ this.file = null;
+ this.temporary = true;
+ }
+
+ public TikaInputStream(File file) {
+ super(null);
+ this.file = file;
+ this.temporary = false;
+ }
+
+ public File getFile() throws IOException {
+ if (file == null) {
+ if (in == null) {
+ throw new IOException("Stream has already been read");
+ } else if (position > 0) {
+ throw new IOException("Stream is already being read");
+ } else {
+ file = File.createTempFile("apache-tika-", ".tmp");
+ OutputStream out = new FileOutputStream(file);
+ try {
+ IOUtils.copy(in, out);
+ } finally {
+ out.close();
+ }
+ in.close();
+ in = null;
+ }
+ }
+ return file;
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (in != null) {
+ in.close();
+ in = null;
+ }
+ if (file != null) {
+ if (temporary) {
+ file.delete();
+ }
+ file = null;
+ }
+ }
+
+ @Override
+ protected void beforeRead(int n) throws IOException {
+ if (in == null) {
+ if (file != null) {
+ in = new FileInputStream(file);
+ } else {
+ throw new IOException("End of the stream reached");
+ }
+ }
+ }
+
+ @Override
+ protected void afterRead(int n) throws IOException {
+ if (n != -1) {
+ position += n;
+ } else {
+ close();
+ }
+ }
+
+}
Propchange: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java?rev=934061&r1=934060&r2=934061&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java Wed Apr 14 17:56:37 2010
@@ -21,6 +21,8 @@ import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
+import org.apache.tika.io.TikaInputStream;
+
/**
* Collection of static helper methods for handling metadata.
*
@@ -68,7 +70,7 @@ public class MetadataHelper {
metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
}
- return connection.getInputStream();
+ return new TikaInputStream(connection.getInputStream());
}
}