You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/09/05 16:02:27 UTC

[tika] branch TIKA-4124 created (now 5ac98ad16)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4124
in repository https://gitbox.apache.org/repos/asf/tika.git


      at 5ac98ad16 TIKA-4124 -- extract alternate format chunk from ooxml

This branch includes the following new commits:

     new 5ac98ad16 TIKA-4124 -- extract alternate format chunk from ooxml

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  Revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: TIKA-4124 -- extract alternate format chunk from ooxml

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4124
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 5ac98ad1652e9828085e8a0a30fd6d520e94ff9f
Author: tballison <ta...@apache.org>
AuthorDate: Tue Sep 5 12:02:18 2023 -0400

    TIKA-4124 -- extract alternate format chunk from ooxml
---
 .../java/org/apache/tika/metadata/TikaCoreProperties.java    |  3 ++-
 .../tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java  | 12 ++++++++++++
 .../parser/microsoft/ooxml/OOXMLContainerExtractionTest.java | 11 +++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index bbf3cd61a..a75eb8acf 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -365,6 +365,7 @@ public interface TikaCoreProperties {
         FONT,//embedded font files
         THUMBNAIL, //TODO: set this in parsers that handle thumbnails
         RENDERING, //if a file has been rendered
-        VERSION //an earlier version of a file
+        VERSION, //an earlier version of a file
+        ALTERNATE_FORMAT_CHUNK //OOXML inline alternate format chunk
     }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 55d3893e6..1475b7838 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -89,6 +89,10 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
             "http://schemas.openxmlformats.org/officeDocument/2006/relationships/video";
     static final String RELATION_DIAGRAM_DATA =
             "http://schemas.openxmlformats.org/officeDocument/2006/relationships/diagramData";
+
+    static final String RELATION_ALTERNATE_FORMAT_CHUNK =
+            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/aFChunk";
+
     protected static final String[] EMBEDDED_RELATIONSHIPS =
             new String[]{RELATION_AUDIO, PackageRelationshipTypes.IMAGE_PART,
                     POIXMLDocument.PACK_OBJECT_REL_TYPE, PackageRelationshipTypes.CORE_DOCUMENT,
@@ -301,6 +305,14 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
             if (targetURI != null) {
                 handledTarget.add(targetURI.toString());
             }
+        } else if (RELATION_ALTERNATE_FORMAT_CHUNK.equals(type)) {
+            //TODO check for targetMode=INTERNAL?
+            handleEmbeddedFile(target, xhtml, sourceDesc + rel.getId(),
+                    embeddedPartMetadata,
+                    TikaCoreProperties.EmbeddedResourceType.ALTERNATE_FORMAT_CHUNK);
+            if (targetURI != null) {
+                handledTarget.add(targetURI.toString());
+            }
         }
     }
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
index dfe86f204..83641751f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
@@ -23,6 +23,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
 import java.util.List;
 
 import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.Tika;
@@ -321,4 +322,14 @@ public class OOXMLContainerExtractionTest extends AbstractPOIContainerExtraction
             assertTrue(found, "didn't find chart in " + suffix);
         }
     }
+
+    @Test
+    @Disabled("until we can add test file to repo")
+    public void testAltFileChunk() throws Exception {
+        //test file (not included in the repo) is from: https://github.com/jgm/pandoc/files/1290782/Sample_DOCX_using_MHT_container.docx
+        //Tika does not correctly identify the chunk as rfc822; it treats it as html instead. :(
+        List<Metadata> metadataList = getRecursiveMetadata("testAltChunkMHT.docx");
+        assertEquals(2, metadataList.size());
+        assertContains("Example of a table", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
+    }
 }