You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2018/03/21 08:40:31 UTC

[tika] 02/02: Simplify stream resetting logic by using new ParserUtil methods for it

This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 54477aa3904944ca964e1f70188ac69b9f994042
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Wed Mar 21 08:40:23 2018 +0000

    Simplify stream resetting logic by using new ParserUtil methods for it
---
 .../tika/parser/multiple/AbstractMultipleParser.java   | 18 ++++--------------
 .../main/java/org/apache/tika/utils/ParserUtils.java   |  4 ++--
 2 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 8f896b2..9fcf998 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -30,7 +30,6 @@ import java.util.Set;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MediaTypeRegistry;
@@ -215,20 +214,11 @@ public abstract class AbstractMultipleParser extends AbstractParser {
         // Start tracking resources, so we can clean up when done
         TemporaryResources tmp = new TemporaryResources();
         try {
-            // Force the stream to be a Tika one
-            // Force the stream to be file-backed, so we can re-read safely
-            //  later if required for parser 2+
-            // TODO Should we support RereadableInputStream as well?
-            // TODO Can we put this re-read logic in a utils method?
-            TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
-            taggedStream.getPath();
+            // Ensure we'll be able to re-read safely, buffering to disk if needed,
+            //  to permit Parsers 2+ to be able to read the same data
+            InputStream taggedStream = ParserUtils.ensureStreamReReadable(stream, tmp);
             
             for (Parser p : parsers) {
-                // Indicate we may need to re-read the stream later
-                // TODO Support an InputStreamFactory as an alternative to
-                //  Files, see TIKA-2585
-                taggedStream.mark(-1);
-                
                 // Get a new handler for this parser, if we can
                 // If not, the user will get text from every parser
                 //  mushed together onto the one solitary handler...
@@ -276,7 +266,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
                 
                 // Prepare for the next parser, if present
                 lastMetadata = cloneMetadata(metadata);
-                taggedStream.reset();
+                taggedStream = ParserUtils.streamResetForReRead(taggedStream, tmp);
             }
         } finally {
             tmp.dispose();
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
index 663b689..02958c2 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -96,7 +96,7 @@ public class ParserUtils {
      * Streams that are automatically OK include {@link TikaInputStream}s
      *  created from Files or InputStreamFactories, and {@link RereadableInputStream}.
      */
-    public InputStream ensureStreamReReadable(InputStream stream, TemporaryResources tmp) throws IOException {
+    public static InputStream ensureStreamReReadable(InputStream stream, TemporaryResources tmp) throws IOException {
         // If it's re-readable, we're done
         if (stream instanceof RereadableInputStream) return stream;
 
@@ -120,7 +120,7 @@ public class ParserUtils {
      *  {@link #ensureStreamReReadable(InputStream, TemporaryResources)})
      * so that it can be re-read again.
      */
-    public InputStream streamResetForReRead(InputStream stream, TemporaryResources tmp) throws IOException {
+    public static InputStream streamResetForReRead(InputStream stream, TemporaryResources tmp) throws IOException {
         // If re-readable, rewind to start
         if (stream instanceof RereadableInputStream) {
             ((RereadableInputStream)stream).rewind();

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.