You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/11/01 22:38:43 UTC
[tika] branch main updated: TIKA-3903 -- try to maintain file's extension in temporary files to help with detection. (#769)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 533ea871f TIKA-3903 -- try to maintain file's extension in temporary files to help with detection. (#769)
533ea871f is described below
commit 533ea871f7384c35576365f6dcd0d7f9640f8fdd
Author: Tim Allison <ta...@apache.org>
AuthorDate: Tue Nov 1 18:38:37 2022 -0400
TIKA-3903 -- try to maintain file's extension in temporary files to help with detection. (#769)
---
.../apache/tika/detect/FileCommandDetector.java | 2 +-
.../tika/extractor/ParserContainerExtractor.java | 2 +-
.../ParsingEmbeddedDocumentExtractor.java | 3 +-
.../java/org/apache/tika/io/FilenameUtils.java | 21 ++++++++-
.../org/apache/tika/io/TemporaryResources.java | 39 +++++++++++++----
.../java/org/apache/tika/io/TikaInputStream.java | 50 +++++++++++++++++-----
.../org/apache/tika/parser/AutoDetectParser.java | 2 +-
.../org/apache/tika/parser/CompositeParser.java | 2 +-
.../org/apache/tika/parser/DigestingParser.java | 2 +-
.../java/org/apache/tika/parser/NetworkParser.java | 2 +-
.../apache/tika/parser/RecursiveParserWrapper.java | 2 +-
.../tika/parser/digest/CompositeDigester.java | 2 +-
.../tika/parser/digest/InputStreamDigester.java | 2 +-
.../tika/parser/external/ExternalParser.java | 2 +-
.../tika/parser/external2/ExternalParser.java | 2 +-
.../parser/multiple/AbstractMultipleParser.java | 2 +-
.../java/org/apache/tika/utils/ParserUtils.java | 7 +--
.../src/test/java/org/apache/tika/TikaTest.java | 5 ++-
.../org/apache/tika/parser/gdal/GDALParser.java | 2 +-
.../geoinfo/GeographicInformationParser.java | 2 +-
.../apache/tika/parser/isatab/ISArchiveParser.java | 2 +-
.../apache/tika/parser/netcdf/NetCDFParser.java | 2 +-
.../tika/parser/pot/PooledTimeSeriesParser.java | 2 +-
.../apache/tika/parser/journal/JournalParser.java | 2 +-
.../java/org/apache/tika/parser/mp4/MP4Parser.java | 2 +-
.../java/org/apache/tika/parser/mat/MatParser.java | 2 +-
.../org/apache/tika/parser/html/HtmlParser.java | 2 +-
.../tika/parser/image/AbstractImageParser.java | 2 +-
.../org/apache/tika/parser/image/JpegParser.java | 2 +-
.../org/apache/tika/parser/image/TiffParser.java | 2 +-
.../org/apache/tika/parser/image/WebPParser.java | 2 +-
.../org/apache/tika/parser/epub/EpubParser.java | 6 ++-
.../apache/tika/parser/ocr/TesseractOCRParser.java | 2 +-
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 1 +
.../tika/renderer/pdf/mutool/MuPDFRenderer.java | 2 +-
.../org/apache/tika/parser/pkg/PackageParser.java | 4 +-
.../java/org/apache/tika/parser/pkg/RarParser.java | 2 +-
.../apache/tika/parser/strings/StringsParser.java | 2 +-
.../tika/pipes/fetcher/http/HttpFetcher.java | 5 ++-
.../tika/pipes/fetcher/http/HttpFetcherTest.java | 2 +-
.../apache/tika/pipes/fetcher/s3/S3Fetcher.java | 3 +-
41 files changed, 142 insertions(+), 62 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
index e3e7b10f9..42349faec 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
@@ -108,7 +108,7 @@ public class FileCommandDetector implements Detector {
input.mark(maxBytes);
try (TemporaryResources tmp = new TemporaryResources()) {
- Path tmpFile = tmp.createTempFile();
+ Path tmpFile = tmp.createTempFile(metadata);
Files.copy(new BoundedInputStream(maxBytes, input), tmpFile, REPLACE_EXISTING);
return detectOnPath(tmpFile, metadata);
} finally {
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java
index 50d29f8d3..b2e9cd169 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java
@@ -111,7 +111,7 @@ public class ParserContainerExtractor implements ContainerExtractor {
throws IOException, SAXException, TikaException {
TemporaryResources tmp = new TemporaryResources();
try {
- TikaInputStream tis = TikaInputStream.get(stream, tmp);
+ TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
// Figure out what we have to process
String filename = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
index bdfd028f5..8391624a3 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
@@ -98,7 +98,8 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
// Use the delegate parser to parse this entry
try (TemporaryResources tmp = new TemporaryResources()) {
- final TikaInputStream newStream = TikaInputStream.get(new CloseShieldInputStream(stream), tmp);
+ final TikaInputStream newStream =
+ TikaInputStream.get(new CloseShieldInputStream(stream), tmp, metadata);
if (stream instanceof TikaInputStream) {
final Object container = ((TikaInputStream) stream).getOpenContainer();
if (container != null) {
diff --git a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
index 42a9b4bbe..17bc9e920 100644
--- a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
@@ -19,6 +19,8 @@ package org.apache.tika.io;
import java.util.HashSet;
import java.util.Locale;
+import org.apache.tika.utils.StringUtils;
+
public class FilenameUtils {
@@ -93,7 +95,7 @@ public class FilenameUtils {
public static String getName(final String path) {
if (path == null || path.length() == 0) {
- return "";
+ return StringUtils.EMPTY;
}
int unix = path.lastIndexOf("/");
int windows = path.lastIndexOf("\\");
@@ -102,8 +104,23 @@ public class FilenameUtils {
int colon = path.lastIndexOf(":");
String cand = path.substring(Math.max(colon, Math.max(unix, windows)) + 1);
if (cand.equals("..") || cand.equals(".")) {
- return "";
+ return StringUtils.EMPTY;
}
return cand;
}
+
+ /**
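+ * Returns the suffix of the file name in the given path, including the period, e.g. ".pdf".
+ * @param path path or file name to inspect; a suffix longer than five characters (period included) is ignored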
+ * @return the suffix or an empty string if one could not be found
+ */
+ public static String getSuffixFromPath(String path) {
+ String n = getName(path);
+ int i = n.lastIndexOf(".");
+ //arbitrarily sets max extension length
+ if (i > -1 && n.length() - i < 6) {
+ return n.substring(i);
+ }
+ return StringUtils.EMPTY;
+ }
}
diff --git a/tika-core/src/main/java/org/apache/tika/io/TemporaryResources.java b/tika-core/src/main/java/org/apache/tika/io/TemporaryResources.java
index c32308361..c1565ab86 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TemporaryResources.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TemporaryResources.java
@@ -27,6 +27,9 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.utils.StringUtils;
/**
* Utility class for tracking and ultimately closing or otherwise disposing
@@ -52,7 +55,7 @@ public class TemporaryResources implements Closeable {
/**
* Sets the directory to be used for the temporary files created by
- * the {@link #createTempFile()} method.
+ * the {@link #createTempFile(String)} method.
*
* @param tempFileDir temporary file directory,
* or <code>null</code> for the system default
@@ -63,7 +66,7 @@ public class TemporaryResources implements Closeable {
/**
* Sets the directory to be used for the temporary files created by
- * the {@link #createTempFile()} method.
+ * the {@link #createTempFile(String)} method.
*
* @param tempFileDir temporary file directory,
* or <code>null</code> for the system default
@@ -76,13 +79,15 @@ public class TemporaryResources implements Closeable {
/**
* Creates a temporary file that will automatically be deleted when
* the {@link #close()} method is called, returning its path.
- *
+ * @param suffix -- the suffix of the file if known, starting with "." as in ".pdf"
* @return Path to created temporary file that will be deleted after closing
* @throws IOException
*/
- public Path createTempFile() throws IOException {
- final Path path = tempFileDir == null ? Files.createTempFile("apache-tika-", ".tmp") :
- Files.createTempFile(tempFileDir, "apache-tika-", ".tmp");
+ public Path createTempFile(String suffix) throws IOException {
+ String actualSuffix = StringUtils.isBlank(suffix) ? ".tmp" : suffix;
+
+ final Path path = tempFileDir == null ? Files.createTempFile("apache-tika-", actualSuffix) :
+ Files.createTempFile(tempFileDir, "apache-tika-", actualSuffix);
addResource(() -> {
try {
Files.delete(path);
@@ -95,16 +100,34 @@ public class TemporaryResources implements Closeable {
return path;
}
+ public Path createTempFile() throws IOException {
+ return createTempFile(StringUtils.EMPTY);
+ }
+
+ /**
+ * Creates a temporary file that will automatically be deleted when
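+ * the {@link #close()} method is called, returning its path. The suffix is taken from the
+ * metadata's resource name when it yields one; otherwise ".tmp" is used.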
+ * @return Path to created temporary file that will be deleted after closing
+ * @throws IOException
+ */
+ public Path createTempFile(Metadata metadata) throws IOException {
+ String resourceName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+ if (StringUtils.isBlank(resourceName)) {
+ return createTempFile(StringUtils.EMPTY);
+ }
+ return createTempFile(FilenameUtils.getSuffixFromPath(resourceName));
+ }
/**
* Creates and returns a temporary file that will automatically be
* deleted when the {@link #close()} method is called.
*
* @return Created temporary file that'll be deleted after closing
* @throws IOException
- * @see #createTempFile()
+ * @see #createTempFile(String)
*/
public File createTemporaryFile() throws IOException {
- return createTempFile().toFile();
+ return createTempFile(StringUtils.EMPTY).toFile();
}
/**
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index aef9021b3..76db4a2a3 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -119,6 +119,10 @@ public class TikaInputStream extends TaggedInputStream {
private int consecutiveEOFs = 0;
private byte[] skipBuffer;
+ //suffix of the file if known. This is used to create temp files
+ //with the right suffixes. This should include the initial . as in ".doc"
+ private String suffix = null;
+
/**
* Creates a TikaInputStream instance. This private constructor is used
* by the static factory methods based on the available information.
@@ -131,6 +135,7 @@ public class TikaInputStream extends TaggedInputStream {
this.path = path;
this.tmp = new TemporaryResources();
this.length = Files.size(path);
+ this.suffix = FilenameUtils.getSuffixFromPath(path.getFileName().toString());
}
private TikaInputStream(Path path, TemporaryResources tmp, long length) throws IOException {
@@ -138,6 +143,7 @@ public class TikaInputStream extends TaggedInputStream {
this.path = path;
this.tmp = tmp;
this.length = length;
+ this.suffix = FilenameUtils.getSuffixFromPath(path.getFileName().toString());
}
/**
@@ -154,6 +160,8 @@ public class TikaInputStream extends TaggedInputStream {
this.path = file.toPath();
this.tmp = new TemporaryResources();
this.length = file.length();
+ this.suffix = FilenameUtils.getSuffixFromPath(path.getFileName().toString());
+
}
/**
@@ -168,11 +176,12 @@ public class TikaInputStream extends TaggedInputStream {
* @param tmp tracker for temporary resources associated with this stream
* @param length total length of the stream, or -1 if unknown
*/
- private TikaInputStream(InputStream stream, TemporaryResources tmp, long length) {
+ private TikaInputStream(InputStream stream, TemporaryResources tmp, long length, String suffix) {
super(stream);
this.path = null;
this.tmp = tmp;
this.length = length;
+ this.suffix = suffix;
}
/**
@@ -215,7 +224,7 @@ public class TikaInputStream extends TaggedInputStream {
* @return a TikaInputStream instance
* @since Apache Tika 0.10
*/
- public static TikaInputStream get(InputStream stream, TemporaryResources tmp) {
+ public static TikaInputStream get(InputStream stream, TemporaryResources tmp, Metadata metadata) {
if (stream == null) {
throw new NullPointerException("The Stream must not be null");
}
@@ -227,17 +236,28 @@ public class TikaInputStream extends TaggedInputStream {
if (!(stream.markSupported())) {
stream = new BufferedInputStream(stream);
}
- return new TikaInputStream(stream, tmp, -1);
+ return new TikaInputStream(stream, tmp, -1, getExtension(metadata));
}
}
+ /**
+ * @deprecated use {@link TikaInputStream#get(InputStream, TemporaryResources, Metadata)}
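+ * @param stream normal input stream
+ * @param tmp tracker for temporary resources associated with the new stream
+ * @return a TikaInputStream instance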
+ */
+ @Deprecated
+ public static TikaInputStream get(InputStream stream, TemporaryResources tmp) {
+ return get(stream, tmp, null);
+ }
+
/**
* Casts or wraps the given stream to a TikaInputStream instance.
* This method can be used to access the functionality of this class
* even when given just a normal input stream instance.
* <p>
* Use this method instead of the
- * {@link #get(InputStream, TemporaryResources)} alternative when you
+ * {@link #get(InputStream, TemporaryResources, Metadata)} alternative when you
* <em>do</em> explicitly close the returned stream. The recommended
* access pattern is:
* <pre>
@@ -254,7 +274,7 @@ public class TikaInputStream extends TaggedInputStream {
* @return a TikaInputStream instance
*/
public static TikaInputStream get(InputStream stream) {
- return get(stream, new TemporaryResources());
+ return get(stream, new TemporaryResources(), null);
}
/**
@@ -301,7 +321,7 @@ public class TikaInputStream extends TaggedInputStream {
public static TikaInputStream get(byte[] data, Metadata metadata) {
metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(data.length));
return new TikaInputStream(new UnsynchronizedByteArrayInputStream(data), new TemporaryResources(),
- data.length);
+ data.length, getExtension(metadata));
}
/**
@@ -414,7 +434,7 @@ public class TikaInputStream extends TaggedInputStream {
*/
public static TikaInputStream get(InputStreamFactory factory, TemporaryResources tmp)
throws IOException {
- TikaInputStream stream = get(factory.getInputStream(), tmp);
+ TikaInputStream stream = get(factory.getInputStream(), tmp, null);
stream.streamFactory = factory;
return stream;
}
@@ -451,6 +471,7 @@ public class TikaInputStream extends TaggedInputStream {
* @throws SQLException if BLOB data can not be accessed
*/
public static TikaInputStream get(Blob blob, Metadata metadata) throws SQLException {
+
long length = -1;
try {
length = blob.length();
@@ -465,8 +486,17 @@ public class TikaInputStream extends TaggedInputStream {
return get(blob.getBytes(1, (int) length), metadata);
} else {
return new TikaInputStream(new BufferedInputStream(blob.getBinaryStream()),
- new TemporaryResources(), length);
+ new TemporaryResources(), length,
+ getExtension(metadata));
+ }
+ }
+
+ private static String getExtension(Metadata metadata) {
+ if (metadata == null) {
+ return StringUtils.EMPTY;
}
+ String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+ return FilenameUtils.getSuffixFromPath(name);
}
/**
@@ -570,7 +600,7 @@ public class TikaInputStream extends TaggedInputStream {
}
return new TikaInputStream(new BufferedInputStream(connection.getInputStream()),
- new TemporaryResources(), length);
+ new TemporaryResources(), length, getExtension(metadata));
}
/**
@@ -678,7 +708,7 @@ public class TikaInputStream extends TaggedInputStream {
if (position > 0) {
throw new IOException("Stream is already being read");
} else {
- Path tmpFile = tmp.createTempFile();
+ Path tmpFile = tmp.createTempFile(suffix);
if (maxBytes > -1) {
try (InputStream lookAhead = new LookaheadInputStream(this, maxBytes)) {
Files.copy(lookAhead, tmpFile, REPLACE_EXISTING);
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index 0a6a5e6dc..4d870d771 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -141,7 +141,7 @@ public class AutoDetectParser extends CompositeParser {
}
TemporaryResources tmp = new TemporaryResources();
try {
- TikaInputStream tis = TikaInputStream.get(stream, tmp);
+ TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
//figure out if we should spool to disk
maybeSpool(tis, autoDetectParserConfig, metadata);
diff --git a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
index 73cffa72a..6c151e21c 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
@@ -287,7 +287,7 @@ public class CompositeParser extends AbstractParser {
context.set(ParseRecord.class, parserRecord);
}
try {
- TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
+ TikaInputStream taggedStream = TikaInputStream.get(stream, tmp, metadata);
TaggedContentHandler taggedHandler =
handler != null ? new TaggedContentHandler(handler) : null;
String parserClassname = ParserUtils.getParserClassname(parser);
diff --git a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
index a36785fdf..30736091a 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
@@ -47,7 +47,7 @@ public class DigestingParser extends ParserDecorator {
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
TemporaryResources tmp = new TemporaryResources();
- TikaInputStream tis = TikaInputStream.get(stream, tmp);
+ TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
try {
if (digester != null) {
digester.digest(tis, metadata, context);
diff --git a/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java b/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java
index a1f1e63e0..3582a70f7 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java
@@ -67,7 +67,7 @@ public class NetworkParser extends AbstractParser {
ParseContext context) throws IOException, SAXException, TikaException {
TemporaryResources tmp = new TemporaryResources();
try {
- TikaInputStream tis = TikaInputStream.get(stream, tmp);
+ TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
parse(tis, handler, metadata, context);
} finally {
tmp.dispose();
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index d660cc175..e9e9457bb 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -155,7 +155,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
}
}
try {
- TikaInputStream tis = TikaInputStream.get(stream, tmp);
+ TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
RecursivelySecureContentHandler secureContentHandler =
new RecursivelySecureContentHandler(localHandler, tis, writeLimit,
throwOnWriteLimitReached, context);
diff --git a/tika-core/src/main/java/org/apache/tika/parser/digest/CompositeDigester.java b/tika-core/src/main/java/org/apache/tika/parser/digest/CompositeDigester.java
index 595f46f4a..ee4dfe233 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/digest/CompositeDigester.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/digest/CompositeDigester.java
@@ -39,7 +39,7 @@ public class CompositeDigester implements DigestingParser.Digester {
@Override
public void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException {
TemporaryResources tmp = new TemporaryResources();
- TikaInputStream tis = TikaInputStream.get(is, tmp);
+ TikaInputStream tis = TikaInputStream.get(is, tmp, m);
try {
for (DigestingParser.Digester digester : digesters) {
digester.digest(tis, m, parseContext);
diff --git a/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
index 76e3d9adf..22f89d82a 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
@@ -152,7 +152,7 @@ public class InputStreamDigester implements DigestingParser.Digester {
} else {
TemporaryResources tmp = new TemporaryResources();
try {
- TikaInputStream tmpTikaInputStream = TikaInputStream.get(is, tmp);
+ TikaInputStream tmpTikaInputStream = TikaInputStream.get(is, tmp, metadata);
digestFile(tmpTikaInputStream.getFile(), metadata);
} finally {
try {
diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
index 52c7cdcab..6b607991d 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
@@ -271,7 +271,7 @@ public class ExternalParser extends AbstractParser {
TemporaryResources tmp = new TemporaryResources();
try {
- parse(TikaInputStream.get(stream, tmp), xhtml, metadata, tmp);
+ parse(TikaInputStream.get(stream, tmp, metadata), xhtml, metadata, tmp);
} finally {
tmp.dispose();
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java b/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java
index ec53d29dd..3bb1825be 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java
@@ -104,7 +104,7 @@ public class ExternalParser extends AbstractParser implements Initializable {
//this may remain null, depending on whether the external parser writes to a file
Path outFile = null;
try (TemporaryResources tmp = new TemporaryResources()) {
- TikaInputStream tis = TikaInputStream.get(stream, tmp);
+ TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
Path p = tis.getPath();
List<String> thisCommandLine = new ArrayList<>();
Matcher inputMatcher = INPUT_TOKEN_MATCHER.matcher("");
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index e851d581d..d05305f9f 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -270,7 +270,7 @@ public abstract class AbstractMultipleParser extends AbstractParser {
try {
// Ensure we'll be able to re-read safely, buffering to disk if so,
// to permit Parsers 2+ to be able to read the same data
- InputStream taggedStream = ParserUtils.ensureStreamReReadable(stream, tmp);
+ InputStream taggedStream = ParserUtils.ensureStreamReReadable(stream, tmp, originalMetadata);
for (Parser p : parsers) {
// Get a new handler for this parser, if we can
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
index 0ad53abd4..837f76295 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -113,7 +113,8 @@ public class ParserUtils {
* Streams that are automatically OK include {@link TikaInputStream}s
* created from Files or InputStreamFactories, and {@link RereadableInputStream}.
*/
- public static InputStream ensureStreamReReadable(InputStream stream, TemporaryResources tmp)
+ public static InputStream ensureStreamReReadable(InputStream stream, TemporaryResources tmp,
+ Metadata metadata)
throws IOException {
// If it's re-readable, we're done
if (stream instanceof RereadableInputStream) {
@@ -123,7 +124,7 @@ public class ParserUtils {
// Make sure it's a TikaInputStream
TikaInputStream tstream = TikaInputStream.cast(stream);
if (tstream == null) {
- tstream = TikaInputStream.get(stream, tmp);
+ tstream = TikaInputStream.get(stream, tmp, metadata);
}
// If it's factory based, it's ok
@@ -140,7 +141,7 @@ public class ParserUtils {
/**
* Resets the given {@link TikaInputStream} (checked by
- * {@link #ensureStreamReReadable(InputStream, TemporaryResources)})
+ * {@link #ensureStreamReReadable(InputStream, TemporaryResources, Metadata)})
* so that it can be re-read again.
*/
public static InputStream streamResetForReRead(InputStream stream, TemporaryResources tmp)
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 7c616b0fa..fa112ca4c 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -44,6 +44,7 @@ import org.xml.sax.ContentHandler;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.extractor.EmbeddedResourceHandler;
+import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -433,8 +434,10 @@ public abstract class TikaTest {
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parserToWrap);
RecursiveParserWrapperHandler handler =
new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(handlerType, -1));
+ Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, FilenameUtils.getName(filePath));
try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
- wrapper.parse(is, handler, new Metadata(), context);
+ wrapper.parse(is, handler, metadata, context);
}
return handler.getMetadataList();
}
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java
index 4fc9c2d66..468087c32 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java
@@ -182,7 +182,7 @@ public class GDALParser extends AbstractParser {
// first set up and run GDAL
// process the command
TemporaryResources tmp = new TemporaryResources();
- TikaInputStream tis = TikaInputStream.get(stream, tmp);
+ TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
String runCommand = processCommand(tis);
String output = execCommand(new String[]{runCommand});
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geoinfo/GeographicInformationParser.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geoinfo/GeographicInformationParser.java
index 532a33613..e5732eb75 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geoinfo/GeographicInformationParser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/geoinfo/GeographicInformationParser.java
@@ -97,7 +97,7 @@ public class GeographicInformationParser extends AbstractParser {
TemporaryResources tmp =
TikaInputStream.isTikaInputStream(inputStream) ? null : new TemporaryResources();
try {
- TikaInputStream tikaInputStream = TikaInputStream.get(inputStream, tmp);
+ TikaInputStream tikaInputStream = TikaInputStream.get(inputStream, tmp, metadata);
File file = tikaInputStream.getFile();
dataStore = DataStores.open(file);
defaultMetadata = new DefaultMetadata(dataStore.getMetadata());
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
index 01e4187da..7fd6f39c0 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
@@ -78,7 +78,7 @@ public class ISArchiveParser implements Parser {
TemporaryResources tmp =
TikaInputStream.isTikaInputStream(stream) ? null : new TemporaryResources();
- TikaInputStream tis = TikaInputStream.get(stream, tmp);
+ TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
try {
if (this.location == null) {
this.location = tis.getFile().getParent() + File.separator;
diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java
index 9dbb613af..7f6f5a77f 100644
--- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java
+++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java
@@ -83,7 +83,7 @@ public class NetCDFParser extends AbstractParser {
TemporaryResources tmp =
TikaInputStream.isTikaInputStream(stream) ? null : new TemporaryResources();
- TikaInputStream tis = TikaInputStream.get(stream, tmp);
+ TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
try (NetcdfFile ncFile = NetcdfFile.open(tis.getFile().getAbsolutePath())) {
metadata.set("File-Type-Description", ncFile.getFileTypeDescription());
// first parse out the set of global attributes
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-advancedmedia-module/src/main/java/org/apache/tika/parser/pot/PooledTimeSeriesParser.java b/tika-parsers/tika-parsers-ml/tika-parser-advancedmedia-module/src/main/java/org/apache/tika/parser/pot/PooledTimeSeriesParser.java
index d86959368..cfe37dbbb 100644
--- a/tika-parsers/tika-parsers-ml/tika-parser-advancedmedia-module/src/main/java/org/apache/tika/parser/pot/PooledTimeSeriesParser.java
+++ b/tika-parsers/tika-parsers-ml/tika-parser-advancedmedia-module/src/main/java/org/apache/tika/parser/pot/PooledTimeSeriesParser.java
@@ -125,7 +125,7 @@ public class PooledTimeSeriesParser extends AbstractParser {
TemporaryResources tmp = new TemporaryResources();
try {
- TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
+ TikaInputStream tikaStream = TikaInputStream.get(stream, tmp, metadata);
File input = tikaStream.getFile();
String cmdOutput = computePoT(input);
try (InputStream ofStream = new FileInputStream(
diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/JournalParser.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/JournalParser.java
index a81f17dfa..cc01e4969 100644
--- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/JournalParser.java
+++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/journal/JournalParser.java
@@ -53,7 +53,7 @@ public class JournalParser extends AbstractParser {
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
- TikaInputStream tis = TikaInputStream.get(stream, new TemporaryResources());
+ TikaInputStream tis = TikaInputStream.get(stream, new TemporaryResources(), metadata);
File tmpFile = tis.getFile();
GrobidRESTParser grobidParser = new GrobidRESTParser();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
index d49941f15..67f74d218 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/mp4/MP4Parser.java
@@ -96,7 +96,7 @@ public class MP4Parser extends AbstractParser {
ParseContext context) throws IOException, SAXException, TikaException {
TemporaryResources tmp = new TemporaryResources();
- TikaInputStream tstream = TikaInputStream.get(stream, tmp);
+ TikaInputStream tstream = TikaInputStream.get(stream, tmp, metadata);
try (InputStream is = Files.newInputStream(tstream.getPath())) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/mat/MatParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/mat/MatParser.java
index fef6959f6..dff494787 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/mat/MatParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/mat/MatParser.java
@@ -70,7 +70,7 @@ public class MatParser extends AbstractParser {
TikaInputStream.isTikaInputStream(stream) ? null : new TemporaryResources();
try {
// Use TIS so we can spool a temp file for parsing.
- TikaInputStream tis = TikaInputStream.get(stream, tmp);
+ TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
//Extract information from header file
MatFileReader mfr = new MatFileReader(tis.getFile()); //input .mat file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
index 8a25e9047..d72f4865e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlParser.java
@@ -91,7 +91,7 @@ public class HtmlParser extends AbstractEncodingDetectorParser {
try {
if (!TikaInputStream.isTikaInputStream(stream)) {
tmp = new TemporaryResources();
- stream = TikaInputStream.get(stream, tmp);
+ stream = TikaInputStream.get(stream, tmp, metadata);
}
//AutoDetectReader can throw exceptions during
//initialization. If we just created a
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/AbstractImageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/AbstractImageParser.java
index d58fca01b..c07e05e49 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/AbstractImageParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/AbstractImageParser.java
@@ -84,7 +84,7 @@ public abstract class AbstractImageParser extends AbstractParser {
}
TemporaryResources tmpResources = new TemporaryResources();
- TikaInputStream tis = TikaInputStream.get(stream, tmpResources);
+ TikaInputStream tis = TikaInputStream.get(stream, tmpResources, metadata);
Exception metadataException = null;
try {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/JpegParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/JpegParser.java
index 3b4e7ee8a..c64cd98af 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/JpegParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/JpegParser.java
@@ -52,7 +52,7 @@ public class JpegParser extends AbstractImageParser {
throws IOException, SAXException, TikaException {
TemporaryResources tmp = new TemporaryResources();
try {
- TikaInputStream tis = TikaInputStream.get(stream, tmp);
+ TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
new ImageMetadataExtractor(metadata).parseJpeg(tis.getFile());
new JempboxExtractor(metadata).parse(tis);
} finally {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/TiffParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/TiffParser.java
index 2e3205cfa..89e507cc1 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/TiffParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/TiffParser.java
@@ -52,7 +52,7 @@ public class TiffParser extends AbstractImageParser {
throws IOException, SAXException, TikaException {
TemporaryResources tmp = new TemporaryResources();
try {
- TikaInputStream tis = TikaInputStream.get(stream, tmp);
+ TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
new ImageMetadataExtractor(metadata).parseTiff(tis.getFile());
new JempboxExtractor(metadata).parse(tis);
} finally {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/WebPParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/WebPParser.java
index 810f5f463..98f7422f4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/WebPParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/WebPParser.java
@@ -52,7 +52,7 @@ public class WebPParser extends AbstractParser {
ParseContext context) throws IOException, SAXException, TikaException {
TemporaryResources tmp = new TemporaryResources();
try {
- TikaInputStream tis = TikaInputStream.get(stream, tmp);
+ TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
new ImageMetadataExtractor(metadata).parseWebP(tis.getFile());
} finally {
tmp.dispose();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index a20191da9..816030e37 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -49,6 +49,7 @@ import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -175,7 +176,7 @@ public class EpubParser extends AbstractParser {
}
} else {
temporaryResources = new TemporaryResources();
- tis = TikaInputStream.get(new CloseShieldInputStream(stream), temporaryResources);
+ tis = TikaInputStream.get(new CloseShieldInputStream(stream), temporaryResources, metadata);
}
ZipFile zipFile = null;
try {
@@ -201,7 +202,8 @@ public class EpubParser extends AbstractParser {
Metadata metadata, ParseContext context)
throws IOException, TikaException, SAXException {
try (TemporaryResources resources = new TemporaryResources()) {
- Path salvaged = resources.createTempFile();
+ Path salvaged =
+ resources.createTempFile(FilenameUtils.getSuffixFromPath(brokenZip.getFileName().toString()));
ZipSalvager.salvageCopy(brokenZip.toFile(), salvaged.toFile());
boolean success = false;
try (ZipFile zipFile = new ZipFile(salvaged.toFile())) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index f0b21420a..a79e05b1d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -247,7 +247,7 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements
try (TemporaryResources tmp = new TemporaryResources()) {
- TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
+ TikaInputStream tikaStream = TikaInputStream.get(stream, tmp, metadata);
//trigger the spooling to a tmp file if the stream wasn't
//already a TikaInputStream that contained a file
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index c9cfc94c7..4f2b2c864 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -651,6 +651,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
BufferedImage image =
renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType());
+ //TODO -- get suffix based on OcrImageType
tmpFile = tmpResources.createTempFile();
try (OutputStream os = Files.newOutputStream(tmpFile)) {
//TODO: get output format from TesseractConfig
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java
index d108ba54b..c3d587ef7 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java
@@ -61,7 +61,7 @@ public class MuPDFRenderer implements Renderer {
RenderRequest... requests) throws IOException, TikaException {
TemporaryResources tmp = new TemporaryResources();
PageBasedRenderResults results = new PageBasedRenderResults(tmp);
- Path path = TikaInputStream.get(is, tmp).getPath();
+ Path path = TikaInputStream.get(is, tmp, metadata).getPath();
for (RenderRequest request : requests) {
renderRequest(path, metadata, parseContext, request, results, tmp);
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index 7fedfd3ba..9a416340d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -267,7 +267,7 @@ public class PackageParser extends AbstractEncodingDetectorParser {
if (sne.getFormat().equals(ArchiveStreamFactory.SEVEN_Z)) {
// Rework as a file, and wrap
stream.reset();
- TikaInputStream tstream = TikaInputStream.get(stream, tmp);
+ TikaInputStream tstream = TikaInputStream.get(stream, tmp, metadata);
// Seven Zip supports passwords, was one given?
String password = null;
@@ -445,7 +445,7 @@ public class PackageParser extends AbstractEncodingDetectorParser {
// InputStream, which ArchiveInputStream isn't, so wrap
TemporaryResources tmp = new TemporaryResources();
try {
- TikaInputStream tis = TikaInputStream.get(archive, tmp);
+ TikaInputStream tis = TikaInputStream.get(archive, tmp, entrydata);
extractor.parseEmbedded(tis, xhtml, entrydata, true);
} finally {
tmp.dispose();
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java
index 770f2bc6c..ac9fbe21c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/RarParser.java
@@ -70,7 +70,7 @@ public class RarParser extends AbstractParser {
}
Archive rar = null;
try (TemporaryResources tmp = new TemporaryResources()) {
- TikaInputStream tis = TikaInputStream.get(stream, tmp);
+ TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
rar = new Archive(tis.getFile());
if (rar.isEncrypted()) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java
index cb8f045ff..3be4f32dc 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java
@@ -99,7 +99,7 @@ public class StringsParser extends AbstractParser implements Initializable {
StringsConfig stringsConfig = context.get(StringsConfig.class, defaultStringsConfig);
try (TemporaryResources tmp = new TemporaryResources()) {
- TikaInputStream tis = TikaInputStream.get(stream, tmp);
+ TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
File input = tis.getFile();
// Metadata
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java
index 6d555e3aa..35e6f3e82 100644
--- a/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java
+++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/main/java/org/apache/tika/pipes/fetcher/http/HttpFetcher.java
@@ -66,6 +66,7 @@ import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.pipes.fetcher.AbstractFetcher;
import org.apache.tika.pipes.fetcher.RangeFetcher;
import org.apache.tika.utils.StringUtils;
@@ -228,7 +229,7 @@ public class HttpFetcher extends AbstractFetcher implements Initializable, Range
private InputStream spool(InputStream content, Metadata metadata) throws IOException {
long start = System.currentTimeMillis();
TemporaryResources tmp = new TemporaryResources();
- Path tmpFile = tmp.createTempFile();
+ Path tmpFile = tmp.createTempFile(metadata);
if (maxSpoolSize < 0) {
Files.copy(content, tmpFile, StandardCopyOption.REPLACE_EXISTING);
} else {
@@ -285,6 +286,7 @@ public class HttpFetcher extends AbstractFetcher implements Initializable, Range
if (uri != null) {
URL u = uri.toURL();
metadata.set(HTTP_TARGET_URL, u.toString());
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, u.getFile());
}
} catch (MalformedURLException e) {
//swallow
@@ -302,7 +304,6 @@ public class HttpFetcher extends AbstractFetcher implements Initializable, Range
url);
}
}
-
}
private String responseToString(HttpResponse response) {
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java
index 3926710de..e26e6cfcb 100644
--- a/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java
+++ b/tika-pipes/tika-fetchers/tika-fetcher-http/src/test/java/org/apache/tika/pipes/fetcher/http/HttpFetcherTest.java
@@ -123,7 +123,7 @@ public class HttpFetcherTest extends TikaTest {
HttpFetcher httpFetcher =
(HttpFetcher) getFetcherManager("tika-config-http.xml").getFetcher("http");
try (TemporaryResources tmp = new TemporaryResources()) {
- Path tmpPath = tmp.createTempFile();
+ Path tmpPath = tmp.createTempFile(metadata);
try (InputStream is = httpFetcher.fetch(url, start, end, metadata)) {
Files.copy(new GZIPInputStream(is), tmpPath, StandardCopyOption.REPLACE_EXISTING);
}
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3Fetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3Fetcher.java
index 95760e145..12774fda0 100644
--- a/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3Fetcher.java
+++ b/tika-pipes/tika-fetchers/tika-fetcher-s3/src/main/java/org/apache/tika/pipes/fetcher/s3/S3Fetcher.java
@@ -48,6 +48,7 @@ import org.apache.tika.config.Param;
import org.apache.tika.exception.FileTooLongException;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -169,7 +170,7 @@ public class S3Fetcher extends AbstractFetcher implements Initializable, RangeFe
} else {
start = System.currentTimeMillis();
tmp = new TemporaryResources();
- Path tmpPath = tmp.createTempFile();
+ Path tmpPath = tmp.createTempFile(FilenameUtils.getSuffixFromPath(fetchKey));
Files.copy(s3Object.getObjectContent(), tmpPath, StandardCopyOption.REPLACE_EXISTING);
TikaInputStream tis = TikaInputStream.get(tmpPath, metadata, tmp);
LOGGER.debug("took {} ms to fetch metadata and copy to local tmp file",