You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/05/08 19:15:29 UTC

[tika] branch main updated: TIKA-4032 -- look for embedded file name in eml in content-type param also.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new bca173bd7 TIKA-4032 -- look for embedded file name in eml in content-type param also.
     new 06435d152 Merge remote-tracking branch 'origin/main'
bca173bd7 is described below

commit bca173bd72aa40d5be0fd4164200779e9708885e
Author: tballison <ta...@apache.org>
AuthorDate: Mon May 8 15:15:13 2023 -0400

    TIKA-4032 -- look for embedded file name in eml in content-type param also.
---
 .../org/apache/tika/parser/mail/MailContentHandler.java   | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
index 29ee3025b..fc1482fe5 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/MailContentHandler.java
@@ -66,6 +66,7 @@ import org.apache.tika.parser.txt.TXTParser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
 
 /**
  * Bridge between mime4j's content handler and the generic Sax content handler
@@ -177,8 +178,20 @@ class MailContentHandler implements ContentHandler {
                 //do anything with "size"?
             }
 
+            //the embedded file name can be in the content disposition field
+            //or a parameter on the content type field as in:
+            // Content-Type: application/pdf; name=blah.pdf
+            //Or it can be in both
+            //not sure we need this defensive null check?
+            if (body.getContentTypeParameters() != null) {
+                String contentTypeName = body.getContentTypeParameters().get("name");
+                if (!StringUtils.isBlank(contentTypeName)) {
+                    submd.set(TikaCoreProperties.RESOURCE_NAME_KEY, contentTypeName);
+                }
+            }
             String contentDispositionFileName = body.getContentDispositionFilename();
-            if (contentDispositionFileName != null) {
+            if (!StringUtils.isBlank(contentDispositionFileName)) {
+                //prefer the content disposition file name over the "name" param in the content-type
                 submd.set(TikaCoreProperties.RESOURCE_NAME_KEY, contentDispositionFileName);
             }
             submd.set(Metadata.CONTENT_DISPOSITION, contentDisposition.toString());