You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2010/04/28 11:12:23 UTC

svn commit: r938830 - in /lucene/tika/trunk/tika-core/src: main/java/org/apache/tika/Tika.java main/java/org/apache/tika/io/TikaInputStream.java main/java/org/apache/tika/metadata/MetadataHelper.java test/java/org/apache/tika/io/TikaInputStreamTest.java

Author: jukka
Date: Wed Apr 28 09:12:22 2010
New Revision: 938830

URL: http://svn.apache.org/viewvc?rev=938830&view=rev
Log:
TIKA-153: Allow passing of files or memory buffers to parsers

Improved URL and URI handling, first take at metadata support (deprecating MetadataHelper)

Modified:
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
    lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java?rev=938830&r1=938829&r2=938830&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java Wed Apr 28 09:12:22 2010
@@ -26,8 +26,8 @@ import java.net.URL;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.MetadataHelper;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
@@ -161,7 +161,7 @@ public class Tika {
      */
     public String detect(URL url) throws IOException {
         Metadata metadata = new Metadata();
-        InputStream stream = MetadataHelper.getInputStream(url, metadata);
+        InputStream stream = TikaInputStream.get(url, metadata);
         try {
             return detect(stream, metadata);
         } finally {
@@ -238,7 +238,7 @@ public class Tika {
      */
     public Reader parse(URL url) throws IOException {
         Metadata metadata = new Metadata();
-        InputStream stream = MetadataHelper.getInputStream(url, metadata);
+        InputStream stream = TikaInputStream.get(url, metadata);
         return parse(stream, metadata);
     }
 
@@ -329,7 +329,7 @@ public class Tika {
      */
     public String parseToString(URL url) throws IOException, TikaException {
         Metadata metadata = new Metadata();
-        InputStream stream = MetadataHelper.getInputStream(url, metadata);
+        InputStream stream = TikaInputStream.get(url, metadata);
         return parseToString(stream, metadata);
     }
 

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java?rev=938830&r1=938829&r2=938830&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java Wed Apr 28 09:12:22 2010
@@ -23,7 +23,12 @@ import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
+import java.net.URI;
+import java.net.URISyntaxException;
 import java.net.URL;
+import java.net.URLConnection;
+
+import org.apache.tika.metadata.Metadata;
 
 /**
  *
@@ -43,8 +48,80 @@ public class TikaInputStream extends Pro
         if (stream instanceof TikaInputStream) {
             return (TikaInputStream) stream;
         } else {
-            return new TikaInputStream(stream);
+            return new TikaInputStream(stream, null, -1);
+        }
+    }
+
+    public static TikaInputStream get(byte[] data) throws IOException {
+        InputStream buffer = new ByteArrayInputStream(data); // no backing file
+        return new TikaInputStream(buffer, null, data.length);
+    }
+
+    public static TikaInputStream get(File file) throws IOException {
+        InputStream contents = new FileInputStream(file); // opened first
+        return new TikaInputStream(contents, file, file.length());
+    }
+
+    /**
+     * Opens the resource at the given URI, reading regular files directly.
+     * @param uri location of the resource to open
+     * @return stream for reading the identified resource
+     * @throws IOException if the file or URL stream can not be opened
+     */
+    public static TikaInputStream get(URI uri) throws IOException {
+        // Special handling for file:// URIs
+        if ("file".equalsIgnoreCase(uri.getScheme())) {
+            File file = new File(uri);
+            if (file.isFile()) {
+                return get(file);
+            }
+        }
+        // Everything else is delegated to the URL based variant
+        return get(uri.toURL());
+    }
+
+    public static TikaInputStream get(URL url) throws IOException {
+        return get(url, new Metadata()); // collected metadata is discarded
+    }
+
+    /** Opens a stream for the URL; non-file URLs also populate the metadata. */
+    public static TikaInputStream get(URL url, Metadata metadata)
+            throws IOException {
+        // Special handling for file:// URLs
+        if ("file".equalsIgnoreCase(url.getProtocol())) {
+            try {
+                File file = new File(url.toURI());
+                if (file.isFile()) {
+                    return get(file); // metadata is left untouched here
+                }
+            } catch (URISyntaxException e) {
+                // fall through
+            }
+        }
+
+        URLConnection connection = url.openConnection();
+        // Use the last path segment (if any) as the resource name
+        String path = url.getPath();
+        int slash = path.lastIndexOf('/');
+        if (slash + 1 < path.length()) { // works even with -1!
+            metadata.set(Metadata.RESOURCE_NAME_KEY, path.substring(slash + 1));
+        }
+
+        String type = connection.getContentType();
+        if (type != null) {
+            metadata.set(Metadata.CONTENT_TYPE, type);
+        }
+        // The encoding has its own key; it must not overwrite the content type
+        String encoding = connection.getContentEncoding();
+        if (encoding != null) {
+            metadata.set(Metadata.CONTENT_ENCODING, encoding);
+        }
+
+        int length = connection.getContentLength();
+        if (length >= 0) {
+            metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
         }
+        return new TikaInputStream(connection.getInputStream(), null, length);
     }
 
     /**
@@ -76,22 +153,6 @@ public class TikaInputStream extends Pro
         this.length = length;
     }
 
-    public TikaInputStream(InputStream stream) {
-        this(stream, null, -1);
-    }
-
-    public TikaInputStream(byte[] data) {
-        this(new ByteArrayInputStream(data), null, data.length);
-    }
-
-    public TikaInputStream(File file) throws IOException {
-        this(new FileInputStream(file), file, file.length());
-    }
-
-    public TikaInputStream(URL url) throws IOException {
-        this(url.openStream(), null, -1);
-    }
-
     public File getFile() throws IOException {
         if (file == null) {
             if (in == null) {
@@ -113,6 +174,16 @@ public class TikaInputStream extends Pro
         return file;
     }
 
+    /**
+     * Returns the length (in bytes) of this stream. Note that if the length
+     * was not available when this stream was instantiated, then this method
+     * will use the {@link #getFile()} method to buffer the entire stream to
+     * a temporary file in order to calculate the stream length. This case
+     * will only work if the stream has not yet been consumed.
+     *
+     * @return stream length
+     * @throws IOException if the length can not be determined
+     */
     public long getLength() throws IOException {
         if (length == -1) {
             length = getFile().length();

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java?rev=938830&r1=938829&r2=938830&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java Wed Apr 28 09:12:22 2010
@@ -19,13 +19,13 @@ package org.apache.tika.metadata;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
-import java.net.URLConnection;
 
 import org.apache.tika.io.TikaInputStream;
 
 /**
  * Collection of static helper methods for handling metadata.
  *
+ * @deprecated Use {@link TikaInputStream} instead
  * @since Apache Tika 0.7
  */
 public class MetadataHelper {
@@ -47,30 +47,7 @@ public class MetadataHelper {
      */
     public static InputStream getInputStream(URL url, Metadata metadata)
             throws IOException {
-        URLConnection connection = url.openConnection();
-
-        String path = url.getPath();
-        int slash = path.lastIndexOf('/');
-        if (slash + 1 < path.length()) { // works even with -1!
-            metadata.set(Metadata.RESOURCE_NAME_KEY, path.substring(slash + 1));
-        }
-
-        String type = connection.getContentType();
-        if (type != null) {
-            metadata.set(Metadata.CONTENT_TYPE, type);
-        }
-
-        String encoding = connection.getContentEncoding();
-        if (encoding != null) {
-            metadata.set(Metadata.CONTENT_TYPE, encoding);
-        }
-
-        int length = connection.getContentLength();
-        if (length >= 0) {
-            metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
-        }
-
-        return new TikaInputStream(connection.getInputStream());
+        return TikaInputStream.get(url, metadata);
     }
 
 }

Modified: lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java?rev=938830&r1=938829&r2=938830&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java (original)
+++ lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java Wed Apr 28 09:12:22 2010
@@ -30,7 +30,7 @@ public class TikaInputStreamTest extends
 
     public void testFileBased() throws IOException {
         File file = createTempFile("Hello, World!");
-        InputStream stream = new TikaInputStream(file);
+        InputStream stream = TikaInputStream.get(file);
 
         assertEquals(
                 "The file returned by the getFile() method should"
@@ -54,7 +54,7 @@ public class TikaInputStreamTest extends
     public void testStreamBased() throws IOException {
         InputStream input =
             new ByteArrayInputStream("Hello, World!".getBytes("UTF-8"));
-        InputStream stream = new TikaInputStream(input);
+        InputStream stream = TikaInputStream.get(input);
 
         File file = TikaInputStream.get(stream).getFile();
         assertTrue(file != null && file.isFile());