You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/09/08 15:23:09 UTC

[tika] 01/01: TIKA-4124 -- add test documents and turn on unit tests for altchunk in docx

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4124b
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 03d454e0a686aa6277936722d6c04a43d64aba39
Author: tballison <ta...@apache.org>
AuthorDate: Fri Sep 8 11:23:01 2023 -0400

    TIKA-4124 -- add test documents and turn on unit tests for altchunk in docx
---
 .../ooxml/OOXMLContainerExtractionTest.java         |  11 -----------
 .../resources/test-documents/testAltChunkHTML.docx  | Bin 0 -> 2631 bytes
 .../resources/test-documents/testAltChunkMHT.docx   | Bin 0 -> 3070 bytes
 .../parser/microsoft/ooxml/OOXMLParserTest.java     |  20 ++++++++++++++++++++
 4 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
index 83641751f..dfe86f204 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
@@ -23,7 +23,6 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
 import java.util.List;
 
 import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.Tika;
@@ -322,14 +321,4 @@ public class OOXMLContainerExtractionTest extends AbstractPOIContainerExtraction
             assertTrue(found, "didn't find chart in " + suffix);
         }
     }
-
-    @Test
-    @Disabled("until we can add test file to repo")
-    public void testAltFileChunk() throws Exception {
-        //not included test file from: https://github.com/jgm/pandoc/files/1290782/Sample_DOCX_using_MHT_container.docx
-        //Tika is not correctly identifying rfc822, but rather, treating it as html. :(
-        List<Metadata> metadataList = getRecursiveMetadata("testAltChunkMHT.docx");
-        assertEquals(2, metadataList.size());
-        assertContains("Example of a table", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
-    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAltChunkHTML.docx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAltChunkHTML.docx
new file mode 100644
index 000000000..0a37c1dff
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAltChunkHTML.docx differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAltChunkMHT.docx b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAltChunkMHT.docx
new file mode 100644
index 000000000..f58134ecf
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testAltChunkMHT.docx differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 65f14f169..36038a8ca 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -122,4 +122,24 @@ public class OOXMLParserTest extends TikaTest {
         //TIKA_2446
         getRecursiveMetadata("testZIP_corrupted_oom.zip");
     }
+
+    @Test
+    public void testAltFileMHTChunk() throws Exception {
+        //test file with permission from:
+        // https://github.com/jgm/pandoc/files/1290782/Sample_DOCX_using_MHT_container.docx
+        List<Metadata> metadataList = getRecursiveMetadata("testAltChunkMHT.docx");
+        assertEquals(3, metadataList.size());
+        assertContains("Example of a table",
+                metadataList.get(2).get(TikaCoreProperties.TIKA_CONTENT));
+    }
+
+    @Test
+    public void testAltFileHTMLChunk() throws Exception {
+        //test file with permission from:
+        // https://github.com/jgm/pandoc/files/1290782/Sample_DOCX_using_HTML_container.docx
+        List<Metadata> metadataList = getRecursiveMetadata("testAltChunkHTML.docx");
+        assertEquals(2, metadataList.size());
+        assertContains("Example of a table",
+                metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
+    }
 }