You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/01/30 16:20:15 UTC

[tika] branch main updated: TIKA-3962 - set rfc822 parser to no recurse

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new bff14f395 TIKA-3962 - set rfc822 parser to no recurse
bff14f395 is described below

commit bff14f39513d7624c04f0e8f0173099ac4d14699
Author: tballison <ta...@apache.org>
AuthorDate: Mon Jan 30 11:20:06 2023 -0500

    TIKA-3962 - set rfc822 parser to no recurse
---
 .../org/apache/tika/parser/mail/RFC822Parser.java  |  1 +
 .../apache/tika/parser/mail/RFC822ParserTest.java  | 14 +++++++++--
 .../resources/test-documents/testGroupWiseEml.eml  | 28 +++++++++++-----------
 3 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
index 335a63063..3717b13b5 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/main/java/org/apache/tika/parser/mail/RFC822Parser.java
@@ -92,6 +92,7 @@ public class RFC822Parser extends AbstractParser {
                 config.isStrictParsing(), extractAllAlternatives);
         parser.setContentHandler(mch);
         parser.setContentDecoding(true);
+        parser.setNoRecurse();
         xhtml.startDocument();
         TikaInputStream tstream = TikaInputStream.get(stream);
         try {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
index 943e7c5db..f558a7ffe 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/java/org/apache/tika/parser/mail/RFC822ParserTest.java
@@ -558,10 +558,20 @@ public class RFC822ParserTest extends TikaTest {
     @Test
     public void testGroupwise() throws Exception {
         List<Metadata> metadataList = getRecursiveMetadata("testGroupWiseEml.eml");
-        assertEquals(2, metadataList.size());
-        assertContains("ssssss", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
+        assertEquals(3, metadataList.size());
+        assertContains("test<", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT));
+        assertContains("test2", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
         assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString(),
                 metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+        assertEquals("/test.eml",
+                metadataList.get(1).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+
+        assertContains("ssssss", metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT));
+        assertEquals(TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString(),
+                metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+        assertEquals("/Neues Textdokument.txt",
+                metadataList.get(2).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
+
     }
 
     @Test
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/testGroupWiseEml.eml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/testGroupWiseEml.eml
index a6d2398e1..845527164 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/testGroupWiseEml.eml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-mail-module/src/test/resources/test-documents/testGroupWiseEml.eml
@@ -35,20 +35,20 @@ Content-Type: message/rfc822
 Content-Transfer-Encoding: base64
 Content-Disposition: attachment; filename="test.eml"
 
-TWltZS1WZXJzaW9uOiAxLjANClgtTWFpbGVyOiBHcm91cFdpc2UgMjAxMg0KU3ViamVjdDogdGVz
-dA0KRGF0ZTogVGh1LCAyNyBKdW4gMjAxMyAxMzoyNzoxMiArMDIwMA0KTWVzc2FnZS1JRDogPDUx
-Q0MzREIwMDIwMDAwMDAwMDAwMDAwM0AkJCQ+DQpGcm9tOiAiTm92ZWxsIEdyb3VwV2lzZSIgPCQk
-JC4kJCQuJCQkPg0KQ29udGVudC1UeXBlOiBtdWx0aXBhcnQvYWx0ZXJuYXRpdmU7IGJvdW5kYXJ5
-PSJfX19fTFBITVhMWk1YT01STEZLU0VKQ1dfX19fIg0KDQoNCi0tX19fX0xQSE1YTFpNWE9NUkxG
-S1NFSkNXX19fXw0KQ29udGVudC1UeXBlOiB0ZXh0L3BsYWluOyBjaGFyc2V0PXV0Zi04DQpDb250
-ZW50LVRyYW5zZmVyLUVuY29kaW5nOiBiYXNlNjQNCkNvbnRlbnQtRGlzcG9zaXRpb246IGlubGlu
-ZQ0KDQpkR1Z6ZEE9PQ0KLS1fX19fTFBITVhMWk1YT01STEZLU0VKQ1dfX19fDQpDb250ZW50LVR5
-cGU6IHRleHQvaHRtbDsgY2hhcnNldD11dGYtOA0KQ29udGVudC1UcmFuc2Zlci1FbmNvZGluZzog
-cXVvdGVkLXByaW50YWJsZQ0KDQo8SFRNTD48SEVBRD4NCjxNRVRBIGNvbnRlbnQ9M0QidGV4dC9o
-dG1sOyBjaGFyc2V0PTNEdXRmLTgiIGh0dHAtZXF1aXY9M0RDb250ZW50LVR5cGU+DQo8TUVUQSBu
-YW1lPTNER0VORVJBVE9SIGNvbnRlbnQ9M0QiTVNIVE1MIDguMDAuNzYwMS4xNzY5OSI+PC9IRUFE
-Pg0KPEJPRFkgc3R5bGU9M0QiTUFSR0lOOiA0cHggNHB4IDFweDsgRk9OVDogMTBwdCBTZWdvZSBV
-SSI+dGVzdDwvQk9EWT48L0hUTUw+DQotLV9fX19MUEhNWExaTVhPTVJMRktTRUpDV19fX18tLQ0K
+TWltZS1WZXJzaW9uOiAxLjAKWC1NYWlsZXI6IEdyb3VwV2lzZSAyMDEyClN1YmplY3Q6IHRlc3Qy
+CkRhdGU6IFRodSwgMjcgSnVuIDIwMTMgMTM6Mjc6MTIgKzAyMDAKTWVzc2FnZS1JRDogPDUxQ0Mz
+REIwMDIwMDAwMDAwMDAwMDAwM0AkJCQ+CkZyb206ICJOb3ZlbGwgR3JvdXBXaXNlIiA8JCQkLiQk
+JC4kJCQ+CkNvbnRlbnQtVHlwZTogbXVsdGlwYXJ0L2FsdGVybmF0aXZlOyBib3VuZGFyeT0iX19f
+X0xQSE1YTFpNWE9NUkxGS1NFSkNXX19fXyIKCgotLV9fX19MUEhNWExaTVhPTVJMRktTRUpDV19f
+X18KQ29udGVudC1UeXBlOiB0ZXh0L3BsYWluOyBjaGFyc2V0PXV0Zi04CkNvbnRlbnQtVHJhbnNm
+ZXItRW5jb2Rpbmc6IGJhc2U2NApDb250ZW50LURpc3Bvc2l0aW9uOiBpbmxpbmUKCmRHVnpkREk9
+Ci0tX19fX0xQSE1YTFpNWE9NUkxGS1NFSkNXX19fXwpDb250ZW50LVR5cGU6IHRleHQvaHRtbDsg
+Y2hhcnNldD11dGYtOApDb250ZW50LVRyYW5zZmVyLUVuY29kaW5nOiBxdW90ZWQtcHJpbnRhYmxl
+Cgo8SFRNTD48SEVBRD4KPE1FVEEgY29udGVudD0zRCJ0ZXh0L2h0bWw7IGNoYXJzZXQ9M0R1dGYt
+OCIgaHR0cC1lcXVpdj0zRENvbnRlbnQtVHlwZT4KPE1FVEEgbmFtZT0zREdFTkVSQVRPUiBjb250
+ZW50PTNEIk1TSFRNTCA4LjAwLjc2MDEuMTc2OTkiPjwvSEVBRD4KPEJPRFkgc3R5bGU9M0QiTUFS
+R0lOOiA0cHggNHB4IDFweDsgRk9OVDogMTBwdCBTZWdvZSBVSSI+dGVzdDI8L0JPRFk+PC9IVE1M
+PgotLV9fX19MUEhNWExaTVhPTVJMRktTRUpDV19fX18tLQo=
 --____LPHMXLZMXOMRLFKSEJCW____
 Content-Type: text/plain; charset=us-ascii
 Content-Transfer-Encoding: quoted-printable