You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/09/05 16:02:28 UTC
[tika] 01/01: TIKA-4124 -- extract alternate format chunk from ooxml
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4124
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 5ac98ad1652e9828085e8a0a30fd6d520e94ff9f
Author: tballison <ta...@apache.org>
AuthorDate: Tue Sep 5 12:02:18 2023 -0400
TIKA-4124 -- extract alternate format chunk from ooxml
---
.../java/org/apache/tika/metadata/TikaCoreProperties.java | 3 ++-
.../tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java | 12 ++++++++++++
.../parser/microsoft/ooxml/OOXMLContainerExtractionTest.java | 11 +++++++++++
3 files changed, 25 insertions(+), 1 deletion(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index bbf3cd61a..a75eb8acf 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -365,6 +365,7 @@ public interface TikaCoreProperties {
FONT,//embedded font files
THUMBNAIL, //TODO: set this in parsers that handle thumbnails
RENDERING, //if a file has been rendered
- VERSION //an earlier version of a file
+ VERSION, //an earlier version of a file
+ ALTERNATE_FORMAT_CHUNK //OOXML inline alternate format chunk
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 55d3893e6..1475b7838 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -89,6 +89,10 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/video";
static final String RELATION_DIAGRAM_DATA =
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/diagramData";
+
+ static final String RELATION_ALTERNATE_FORMAT_CHUNK =
+ "http://schemas.openxmlformats.org/officeDocument/2006/relationships/aFChunk";
+
protected static final String[] EMBEDDED_RELATIONSHIPS =
new String[]{RELATION_AUDIO, PackageRelationshipTypes.IMAGE_PART,
POIXMLDocument.PACK_OBJECT_REL_TYPE, PackageRelationshipTypes.CORE_DOCUMENT,
@@ -301,6 +305,14 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
if (targetURI != null) {
handledTarget.add(targetURI.toString());
}
+ } else if (RELATION_ALTERNATE_FORMAT_CHUNK.equals(type)) {
+ //TODO: should we check that the relationship's targetMode is INTERNAL?
+ handleEmbeddedFile(target, xhtml, sourceDesc + rel.getId(),
+ embeddedPartMetadata,
+ TikaCoreProperties.EmbeddedResourceType.ALTERNATE_FORMAT_CHUNK);
+ if (targetURI != null) {
+ handledTarget.add(targetURI.toString());
+ }
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
index dfe86f204..83641751f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
@@ -23,6 +23,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.List;
import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.apache.tika.Tika;
@@ -321,4 +322,14 @@ public class OOXMLContainerExtractionTest extends AbstractPOIContainerExtraction
assertTrue(found, "didn't find chart in " + suffix);
}
}
+
+ @Test
+ @Disabled("until we can add test file to repo")
+ public void testAltFileChunk() throws Exception {
+ //the test file is not included in the repo; it is available from: https://github.com/jgm/pandoc/files/1290782/Sample_DOCX_using_MHT_container.docx
+ //Tika is not correctly identifying the content as rfc822; instead, it is treating it as html. :(
+ List<Metadata> metadataList = getRecursiveMetadata("testAltChunkMHT.docx");
+ assertEquals(2, metadataList.size());
+ assertContains("Example of a table", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT));
+ }
}