You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/09/08 15:23:08 UTC

[tika] branch TIKA-4124b created (now 03d454e0a)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4124b
in repository https://gitbox.apache.org/repos/asf/tika.git


      at 03d454e0a TIKA-4124 -- add test documents and turn on unit tests for altchunk in docx

This branch includes the following new commits:

     new 03d454e0a TIKA-4124 -- add test documents and turn on unit tests for altchunk in docx

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: TIKA-4124 -- add test documents and turn on unit tests for altchunk in docx

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4124b
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 03d454e0a686aa6277936722d6c04a43d64aba39
Author: tballison <ta...@apache.org>
AuthorDate: Fri Sep 8 11:23:01 2023 -0400

    TIKA-4124 -- add test documents and turn on unit tests for altchunk in docx
---
 .../ooxml/OOXMLContainerExtractionTest.java         |  11 -----------
 .../resources/test-documents/testAltChunkHTML.docx  | Bin 0 -> 2631 bytes
 .../resources/test-documents/testAltChunkMHT.docx   | Bin 0 -> 3070 bytes
 .../parser/microsoft/ooxml/OOXMLParserTest.java     |  20 ++++++++++++++++++++
 4 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
index 83641751f..dfe86f204 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
@@ -23,7 +23,6 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
 import java.util.List;
 
 import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.Tika;
@@ -322,14 +321,4 @@ public class OOXMLContainerExtractionTest extends AbstractPOIContainerExtraction
             assertTrue(found, "didn't find chart in " + suffix);
         }
     }
-
-    @Test
-    @Disabled("until we can add test file to repo")
-    public void testAltFileChunk() throws Exception {
-        //not included test file from: https://github.com/jgm/pandoc/files/1290782/Sample_DOCX_using_MHT_container.docx
-        //Tika is not correctly identifying rfc822, but rather, treating it as html. :(
-        List<Metadata> metadataList = getRecursiveMetadata("testAltChunkMHT.docx");
-        assertEquals(2, metadataList.size());
-        assertContains("Example of a table", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
-    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAltChunkHTML.docx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAltChunkHTML.docx
new file mode 100644
index 000000000..0a37c1dff
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAltChunkHTML.docx differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAltChunkMHT.docx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAltChunkMHT.docx
new file mode 100644
index 000000000..f58134ecf
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAltChunkMHT.docx differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 65f14f169..36038a8ca 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -122,4 +122,24 @@ public class OOXMLParserTest extends TikaTest {
         //TIKA_2446
         getRecursiveMetadata("testZIP_corrupted_oom.zip");
     }
+
+    @Test
+    public void testAltFileMHTChunk() throws Exception {
+        //test file with permission from:
+        // https://github.com/jgm/pandoc/files/1290782/Sample_DOCX_using_MHT_container.docx
+        List<Metadata> metadataList = getRecursiveMetadata("testAltChunkMHT.docx");
+        assertEquals(3, metadataList.size());
+        assertContains("Example of a table",
+                metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT));
+    }
+
+    @Test
+    public void testAltFileHTMLChunk() throws Exception {
+        //test file with permission from:
+        // https://github.com/jgm/pandoc/files/1290782/Sample_DOCX_using_HTML_container.docx
+        List<Metadata> metadataList = getRecursiveMetadata("testAltChunkHTML.docx");
+        assertEquals(2, metadataList.size());
+        assertContains("Example of a table",
+                metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
+    }
 }