You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2018/03/21 08:40:30 UTC
[tika] 01/02: ParserUtils methods for handling the reset/re-read of
the stream
This is an automated email from the ASF dual-hosted git repository.
nick pushed a commit to branch multiple-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 5bd8b281c932d39aac4b6d7babe0732f853ed2ae
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Wed Mar 21 08:37:17 2018 +0000
ParserUtils methods for handling the reset/re-read of the stream
---
.../java/org/apache/tika/utils/ParserUtils.java | 55 ++++++++++++++++++++++
1 file changed, 55 insertions(+)
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
index 4005217..663b689 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -16,6 +16,11 @@
*/
package org.apache.tika.utils;
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -84,4 +89,54 @@ public class ParserUtils {
metadata.add(EMBEDDED_EXCEPTION, trace);
metadata.add(EMBEDDED_PARSER, getParserClassname(parser));
}
+
+    /**
+     * Makes sure the supplied stream can be read more than once, buffering
+     * it to a temporary file when that is required.
+     * A {@link RereadableInputStream} is handed back untouched; anything else
+     * is turned into a {@link TikaInputStream}, which is returned directly when
+     * it is factory based, or made file based and marked for a later reset.
+     */
+    public InputStream ensureStreamReReadable(InputStream stream, TemporaryResources tmp) throws IOException {
+        // Already re-readable: nothing to prepare
+        if (stream instanceof RereadableInputStream) {
+            return stream;
+        }
+
+        // Reuse the stream if it already is a TikaInputStream, else obtain one for it
+        TikaInputStream existing = TikaInputStream.cast(stream);
+        TikaInputStream tis = (existing != null) ? existing : TikaInputStream.get(stream, tmp);
+
+        // Factory based streams are fine as they are; all others must be file based
+        if (tis.getInputStreamFactory() == null) {
+            tis.getFile();
+            // Prepare for future re-reads
+            tis.mark(-1);
+        }
+        return tis;
+    }
+    /**
+     * Rewinds a stream prepared by
+     * {@link #ensureStreamReReadable(InputStream, TemporaryResources)} and returns
+     * the stream to read next (possibly a new instance); other streams are rejected.
+     */
+    public InputStream streamResetForReRead(InputStream stream, TemporaryResources tmp) throws IOException {
+        if (stream instanceof RereadableInputStream) {
+            ((RereadableInputStream)stream).rewind(); // back to the start
+            return stream;
+        }
+        if (!(stream instanceof TikaInputStream)) {
+            // Fail with a clear message instead of a bare ClassCastException / NPE
+            throw new IllegalArgumentException("Stream not prepared by ensureStreamReReadable: " + stream);
+        }
+        TikaInputStream tstream = (TikaInputStream)stream;
+        if (tstream.getInputStreamFactory() != null) {
+            // Just get a fresh one each time from the factory
+            return TikaInputStream.get(tstream.getInputStreamFactory(), tmp);
+        }
+        // File based, reset stream to beginning of File
+        tstream.reset();
+        tstream.mark(-1);
+        return tstream;
+    }
}
--
To stop receiving notification emails like this one, please contact
nick@apache.org.