You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/04/28 11:12:23 UTC
svn commit: r938830 - in /lucene/tika/trunk/tika-core/src:
main/java/org/apache/tika/Tika.java
main/java/org/apache/tika/io/TikaInputStream.java
main/java/org/apache/tika/metadata/MetadataHelper.java
test/java/org/apache/tika/io/TikaInputStreamTest.java
Author: jukka
Date: Wed Apr 28 09:12:22 2010
New Revision: 938830
URL: http://svn.apache.org/viewvc?rev=938830&view=rev
Log:
TIKA-153: Allow passing of files or memory buffers to parsers
Improved URL and URI handling, first take at metadata support (deprecating MetadataHelper)
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java?rev=938830&r1=938829&r2=938830&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java Wed Apr 28 09:12:22 2010
@@ -26,8 +26,8 @@ import java.net.URL;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.MetadataHelper;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -161,7 +161,7 @@ public class Tika {
*/
public String detect(URL url) throws IOException {
Metadata metadata = new Metadata();
- InputStream stream = MetadataHelper.getInputStream(url, metadata);
+ InputStream stream = TikaInputStream.get(url, metadata);
try {
return detect(stream, metadata);
} finally {
@@ -238,7 +238,7 @@ public class Tika {
*/
public Reader parse(URL url) throws IOException {
Metadata metadata = new Metadata();
- InputStream stream = MetadataHelper.getInputStream(url, metadata);
+ InputStream stream = TikaInputStream.get(url, metadata);
return parse(stream, metadata);
}
@@ -329,7 +329,7 @@ public class Tika {
*/
public String parseToString(URL url) throws IOException, TikaException {
Metadata metadata = new Metadata();
- InputStream stream = MetadataHelper.getInputStream(url, metadata);
+ InputStream stream = TikaInputStream.get(url, metadata);
return parseToString(stream, metadata);
}
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java?rev=938830&r1=938829&r2=938830&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java Wed Apr 28 09:12:22 2010
@@ -23,7 +23,12 @@ import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
+import java.net.URI;
+import java.net.URISyntaxException;
import java.net.URL;
+import java.net.URLConnection;
+
+import org.apache.tika.metadata.Metadata;
/**
*
@@ -43,8 +48,80 @@ public class TikaInputStream extends Pro
if (stream instanceof TikaInputStream) {
return (TikaInputStream) stream;
} else {
- return new TikaInputStream(stream);
+ return new TikaInputStream(stream, null, -1);
+ }
+ }
+
+ public static TikaInputStream get(byte[] data) throws IOException {
+ return new TikaInputStream(
+ new ByteArrayInputStream(data), null, data.length);
+ }
+
+ public static TikaInputStream get(File file) throws IOException {
+ return new TikaInputStream(
+ new FileInputStream(file), file, file.length());
+ }
+
+ /**
+  * Creates a TikaInputStream for the resource at the given URI.
+  * <p>
+  * A <code>file:</code> URI that names an existing regular file is opened
+  * through {@link #get(File)}, so the returned stream knows the file and
+  * its length. Every other URI is converted to a URL and opened through
+  * {@link #get(URL)}.
+  *
+  * @param uri location of the resource to open
+  * @return stream over the resource
+  * @throws IOException if the resource can not be opened
+  */
+ public static TikaInputStream get(URI uri) throws IOException {
+     // Special handling for file:// URIs
+     if ("file".equalsIgnoreCase(uri.getScheme())) {
+         try {
+             File file = new File(uri);
+             if (file.isFile()) {
+                 return get(file);
+             }
+         } catch (IllegalArgumentException e) {
+             // new File(URI) rejects file URIs that are opaque or carry an
+             // authority, query or fragment (for example file://host/path);
+             // fall through and let the URL handling below deal with them
+         }
+     }
+
+     return get(uri.toURL());
+ }
+
+ /**
+  * Creates a TikaInputStream for the resource at the given URL. Any
+  * metadata gathered while opening the resource is discarded; use
+  * {@link #get(URL, Metadata)} to keep it.
+  *
+  * @param url location of the resource to open
+  * @return stream over the resource
+  * @throws IOException if the resource can not be opened
+  */
+ public static TikaInputStream get(URL url) throws IOException {
+     return get(url, new Metadata());
+ }
+
+ /**
+  * Creates a TikaInputStream for the resource at the given URL and records
+  * what is known about the resource in the given metadata object.
+  * <p>
+  * A <code>file:</code> URL that names an existing regular file is opened
+  * through {@link #get(File)}; in that case the resource name and content
+  * length are taken from the file. For every other URL a connection is
+  * opened and the resource name (last segment of the URL path), content
+  * type, content encoding and content length are recorded whenever the
+  * connection reports them.
+  *
+  * @param url location of the resource to open
+  * @param metadata receives the properties discovered for the resource
+  * @return stream over the resource
+  * @throws IOException if the resource can not be opened
+  */
+ public static TikaInputStream get(URL url, Metadata metadata)
+         throws IOException {
+     // Special handling for file:// URLs
+     if ("file".equalsIgnoreCase(url.getProtocol())) {
+         try {
+             File file = new File(url.toURI());
+             if (file.isFile()) {
+                 // Keep the metadata populated on the shortcut path too,
+                 // so callers still get a resource name hint
+                 metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName());
+                 metadata.set(
+                         Metadata.CONTENT_LENGTH,
+                         Long.toString(file.length()));
+                 return get(file);
+             }
+         } catch (URISyntaxException e) {
+             // fall through
+         } catch (IllegalArgumentException e) {
+             // new File(URI) rejects URIs that are opaque or carry an
+             // authority, query or fragment; fall through to the
+             // generic connection handling
+         }
+     }
+
+     URLConnection connection = url.openConnection();
+
+     String path = url.getPath();
+     int slash = path.lastIndexOf('/');
+     if (slash + 1 < path.length()) { // works even with -1!
+         metadata.set(Metadata.RESOURCE_NAME_KEY, path.substring(slash + 1));
+     }
+
+     String type = connection.getContentType();
+     if (type != null) {
+         metadata.set(Metadata.CONTENT_TYPE, type);
+     }
+
+     String encoding = connection.getContentEncoding();
+     if (encoding != null) {
+         // Must not reuse CONTENT_TYPE here: that would overwrite the
+         // media type recorded just above
+         metadata.set(Metadata.CONTENT_ENCODING, encoding);
+     }
+
+     int length = connection.getContentLength();
+     if (length >= 0) {
+         metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
      }
+
+     return new TikaInputStream(connection.getInputStream(), null, length);
  }
/**
@@ -76,22 +153,6 @@ public class TikaInputStream extends Pro
this.length = length;
}
- public TikaInputStream(InputStream stream) {
- this(stream, null, -1);
- }
-
- public TikaInputStream(byte[] data) {
- this(new ByteArrayInputStream(data), null, data.length);
- }
-
- public TikaInputStream(File file) throws IOException {
- this(new FileInputStream(file), file, file.length());
- }
-
- public TikaInputStream(URL url) throws IOException {
- this(url.openStream(), null, -1);
- }
-
public File getFile() throws IOException {
if (file == null) {
if (in == null) {
@@ -113,6 +174,16 @@ public class TikaInputStream extends Pro
return file;
}
+ /**
+ * Returns the length (in bytes) of this stream. Note that if the length
+ * was not available when this stream was instantiated, then this method
+ * will use the {@link #getFile()} method to buffer the entire stream to
+ * a temporary file in order to calculate the stream length. This case
+ * will only work if the stream has not yet been consumed.
+ *
+ * @return stream length
+ * @throws IOException if the length can not be determined
+ */
public long getLength() throws IOException {
if (length == -1) {
length = getFile().length();
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java?rev=938830&r1=938829&r2=938830&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java Wed Apr 28 09:12:22 2010
@@ -19,13 +19,13 @@ package org.apache.tika.metadata;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
-import java.net.URLConnection;
import org.apache.tika.io.TikaInputStream;
/**
* Collection of static helper methods for handling metadata.
*
+ * @deprecated Use {@link TikaInputStream} instead
* @since Apache Tika 0.7
*/
public class MetadataHelper {
@@ -47,30 +47,7 @@ public class MetadataHelper {
*/
public static InputStream getInputStream(URL url, Metadata metadata)
throws IOException {
- URLConnection connection = url.openConnection();
-
- String path = url.getPath();
- int slash = path.lastIndexOf('/');
- if (slash + 1 < path.length()) { // works even with -1!
- metadata.set(Metadata.RESOURCE_NAME_KEY, path.substring(slash + 1));
- }
-
- String type = connection.getContentType();
- if (type != null) {
- metadata.set(Metadata.CONTENT_TYPE, type);
- }
-
- String encoding = connection.getContentEncoding();
- if (encoding != null) {
- metadata.set(Metadata.CONTENT_TYPE, encoding);
- }
-
- int length = connection.getContentLength();
- if (length >= 0) {
- metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
- }
-
- return new TikaInputStream(connection.getInputStream());
+ return TikaInputStream.get(url, metadata);
}
}
Modified: lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java?rev=938830&r1=938829&r2=938830&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java (original)
+++ lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java Wed Apr 28 09:12:22 2010
@@ -30,7 +30,7 @@ public class TikaInputStreamTest extends
public void testFileBased() throws IOException {
File file = createTempFile("Hello, World!");
- InputStream stream = new TikaInputStream(file);
+ InputStream stream = TikaInputStream.get(file);
assertEquals(
"The file returned by the getFile() method should"
@@ -54,7 +54,7 @@ public class TikaInputStreamTest extends
public void testStreamBased() throws IOException {
InputStream input =
new ByteArrayInputStream("Hello, World!".getBytes("UTF-8"));
- InputStream stream = new TikaInputStream(input);
+ InputStream stream = TikaInputStream.get(input);
File file = TikaInputStream.get(stream).getFile();
assertTrue(file != null && file.isFile());