You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/05/08 19:15:29 UTC
[tika] branch main updated: TIKA-4032 -- look for embedded file name in eml in content-type param also.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new bca173bd7 TIKA-4032 -- look for embedded file name in eml in content-type param also.
new 06435d152 Merge remote-tracking branch 'origin/main'
bca173bd7 is described below
commit bca173bd72aa40d5be0fd4164200779e9708885e
Author: tballison <ta...@apache.org>
AuthorDate: Mon May 8 15:15:13 2023 -0400
TIKA-4032 -- look for embedded file name in eml in content-type param also.
---
.../org/apache/tika/parser/mail/MailContentHandler.java | 15 ++++++++++++++-
1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 29ee3025b..fc1482fe5 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -66,6 +66,7 @@ import org.apache.tika.parser.txt.TXTParser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
/**
* Bridge between mime4j's content handler and the generic Sax content handler
@@ -177,8 +178,20 @@ class MailContentHandler implements ContentHandler {
//do anything with "size"?
}
+ //the embedded file name can be in the content disposition field
+ //or a parameter on the content type field as in:
+ // Content-Type: application/pdf; name=blah.pdf
+ //Or it can be in both
+ //not sure we need this defensive null check?
+ if (body.getContentTypeParameters() != null) {
+ String contentTypeName = body.getContentTypeParameters().get("name");
+ if (!StringUtils.isBlank(contentTypeName)) {
+ submd.set(TikaCoreProperties.RESOURCE_NAME_KEY, contentTypeName);
+ }
+ }
String contentDispositionFileName = body.getContentDispositionFilename();
- if (contentDispositionFileName != null) {
+ if (!StringUtils.isBlank(contentDispositionFileName)) {
+ //prefer the content disposition file name over the "name" param in the content-type
submd.set(TikaCoreProperties.RESOURCE_NAME_KEY, contentDispositionFileName);
}
submd.set(Metadata.CONTENT_DISPOSITION, contentDisposition.toString());