You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/09/05 20:29:06 UTC

[tika] branch main updated: TIKA-4124 -- extract alternate format chunk from ooxml (#1317)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new f6290858b TIKA-4124 -- extract alternate format chunk from ooxml (#1317)
f6290858b is described below

commit f6290858bae72ed1c561ce75812c577e6b736a32
Author: Tim Allison <ta...@apache.org>
AuthorDate: Tue Sep 5 16:29:00 2023 -0400

    TIKA-4124 -- extract alternate format chunk from ooxml (#1317)
---
 .../java/org/apache/tika/metadata/TikaCoreProperties.java    |  3 ++-
 .../tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java  | 12 ++++++++++++
 .../parser/microsoft/ooxml/OOXMLContainerExtractionTest.java | 11 +++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index bbf3cd61a..a75eb8acf 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -365,6 +365,7 @@ public interface TikaCoreProperties {
         FONT,//embedded font files
         THUMBNAIL, //TODO: set this in parsers that handle thumbnails
         RENDERING, //if a file has been rendered
-        VERSION //an earlier version of a file
+        VERSION, //an earlier version of a file
+        ALTERNATE_FORMAT_CHUNK //OOXML inline alternate format chunk
     }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 55d3893e6..1475b7838 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -89,6 +89,10 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
             "http://schemas.openxmlformats.org/officeDocument/2006/relationships/video";
     static final String RELATION_DIAGRAM_DATA =
             "http://schemas.openxmlformats.org/officeDocument/2006/relationships/diagramData";
+
+    static final String RELATION_ALTERNATE_FORMAT_CHUNK =
+            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/aFChunk";
+
     protected static final String[] EMBEDDED_RELATIONSHIPS =
             new String[]{RELATION_AUDIO, PackageRelationshipTypes.IMAGE_PART,
                     POIXMLDocument.PACK_OBJECT_REL_TYPE, PackageRelationshipTypes.CORE_DOCUMENT,
@@ -301,6 +305,14 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
             if (targetURI != null) {
                 handledTarget.add(targetURI.toString());
             }
+        } else if (RELATION_ALTERNATE_FORMAT_CHUNK.equals(type)) {
+            //TODO check for targetMode=INTERNAL?
+            handleEmbeddedFile(target, xhtml, sourceDesc + rel.getId(),
+                    embeddedPartMetadata,
+                    TikaCoreProperties.EmbeddedResourceType.ALTERNATE_FORMAT_CHUNK);
+            if (targetURI != null) {
+                handledTarget.add(targetURI.toString());
+            }
         }
     }
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
index dfe86f204..83641751f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
@@ -23,6 +23,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
 import java.util.List;
 
 import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 import org.apache.tika.Tika;
@@ -321,4 +322,14 @@ public class OOXMLContainerExtractionTest extends AbstractPOIContainerExtraction
             assertTrue(found, "didn't find chart in " + suffix);
         }
     }
+
+    @Test
+    @Disabled("until we can add test file to repo")
+    public void testAltFileChunk() throws Exception {
+        //test file is not included in the repo; it is available from: https://github.com/jgm/pandoc/files/1290782/Sample_DOCX_using_MHT_container.docx
+        //Tika does not currently identify the content as rfc822; it treats it as html instead. :(
+        List<Metadata> metadataList = getRecursiveMetadata("testAltChunkMHT.docx");
+        assertEquals(2, metadataList.size());
+        assertContains("Example of a table", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
+    }
 }