You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2018/03/21 08:40:30 UTC

[tika] 01/02: ParserUtils methods for handling the reset/re-read of the stream

This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 5bd8b281c932d39aac4b6d7babe0732f853ed2ae
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Wed Mar 21 08:37:17 2018 +0000

    ParserUtils methods for handling the reset/re-read of the stream
---
 .../java/org/apache/tika/utils/ParserUtils.java    | 55 ++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
index 4005217..663b689 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -16,6 +16,11 @@
  */
 package org.apache.tika.utils;
 
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -84,4 +89,54 @@ public class ParserUtils {
         metadata.add(EMBEDDED_EXCEPTION, trace);
         metadata.add(EMBEDDED_PARSER, getParserClassname(parser));
     }
+
+    /**
+     * Makes sure the supplied stream can be read more than once, buffering
+     *  it to a temporary file when that is required.
+     * A {@link RereadableInputStream} is handed back untouched; anything
+     *  else is returned as a {@link TikaInputStream}, which is either
+     *  InputStreamFactory based, or file based with a mark set for a later reset.
+     *
+     * @param stream the stream that will need to be read again later
+     * @param tmp passed to {@link TikaInputStream#get} when the stream has to be wrapped
+     * @return the stream to read from, which may differ from the one passed in
+     * @throws IOException if wrapping the stream or obtaining its file fails
+     */
+    public InputStream ensureStreamReReadable(InputStream stream, TemporaryResources tmp) throws IOException {
+        if (stream instanceof RereadableInputStream) {
+            // Already supports rewinding, nothing to prepare
+            return stream;
+        }
+
+        // Reuse an existing TikaInputStream, otherwise wrap the raw stream
+        TikaInputStream tikaStream = TikaInputStream.cast(stream);
+        if (tikaStream == null) {
+            tikaStream = TikaInputStream.get(stream, tmp);
+        }
+
+        if (tikaStream.getInputStreamFactory() == null) {
+            // No factory to ask for a fresh stream, so make it file based
+            tikaStream.getFile();
+            // Remember this position for future re-reads
+            // NOTE(review): a read limit of -1 presumably relies on the file
+            //  backed stream allowing reset after any amount of data - confirm
+            tikaStream.mark(-1);
+        }
+        return tikaStream;
+    }
+    /**
+     * Rewinds a stream previously prepared by
+     *  {@link #ensureStreamReReadable(InputStream, TemporaryResources)}
+     *  so that it can be re-read again.
+     * Callers must continue with the returned stream: for factory based
+     *  {@link TikaInputStream}s it is a newly created stream, not the one passed in.
+     *
+     * @param stream a {@link RereadableInputStream} or {@link TikaInputStream}
+     * @param tmp passed to {@link TikaInputStream#get} when a fresh stream is
+     *  created from an InputStreamFactory
+     * @return the stream to use for the next read
+     * @throws IOException if rewinding, resetting or re-creating the stream fails
+     * @throws IllegalArgumentException if the stream is null or is neither
+     *  of the supported types
+     */
+    public InputStream streamResetForReRead(InputStream stream, TemporaryResources tmp) throws IOException {
+        // If re-readable, rewind to start
+        if (stream instanceof RereadableInputStream) {
+            ((RereadableInputStream)stream).rewind();
+            return stream;
+        }
+
+        // Fail with a clear message rather than a bare ClassCastException
+        if (!(stream instanceof TikaInputStream)) {
+            throw new IllegalArgumentException(
+                    "Stream must first be passed through ensureStreamReReadable, got " +
+                    (stream == null ? "null" : stream.getClass().getName()));
+        }
+
+        // File or Factory based?
+        TikaInputStream tstream = (TikaInputStream)stream;
+        if (tstream.getInputStreamFactory() != null) {
+            // Just get a fresh one each time from the factory
+            return TikaInputStream.get(tstream.getInputStreamFactory(), tmp);
+        }
+
+        // File based, go back to the mark and set it again for the next re-read
+        tstream.reset();
+        tstream.mark(-1);
+        return tstream;
+    }
 }

-- 
To stop receiving notification emails like this one, please contact
nick@apache.org.