You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/11/27 18:02:52 UTC

[tika] branch master updated: TIKA-2510 -- Extract media files from ooxml

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new d4fd659  TIKA-2510 -- Extract media files from ooxml
d4fd659 is described below

commit d4fd659ac5c3070104a85df4a535afe570b08a0e
Author: tballison <ta...@mitre.org>
AuthorDate: Mon Nov 27 13:02:34 2017 -0500

    TIKA-2510 -- Extract media files from ooxml
---
 CHANGES.txt                                             |   2 ++
 .../parser/microsoft/ooxml/AbstractOOXMLExtractor.java  |   7 ++++++-
 .../tika/parser/microsoft/ooxml/OOXMLParserTest.java    |  10 ++++++++++
 .../tika/parser/microsoft/ooxml/SXSLFExtractorTest.java |  11 +++++++++++
 .../resources/test-documents/testPPT_embeddedMP3.pptx   | Bin 0 -> 84434 bytes
 5 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index a1af97e..8c74689 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.17 - ???
 
+  * Extract media files from ooxml (TIKA-2510).
+
   * Standardize the way the Image and Video captioning 
     dockers and extraction work (TIKA-2400, GitHub-208)
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 4e1bfd6..327422e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -78,6 +78,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
 
 
     static final String RELATION_AUDIO = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/audio";
+    static final String RELATION_MEDIA = "http://schemas.microsoft.com/office/2007/relationships/media";
+    static final String RELATION_VIDEO = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/video";
     static final String RELATION_DIAGRAM_DATA = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/diagramData";
 
     private static final String TYPE_OLE_OBJECT =
@@ -247,7 +249,10 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
             if (POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type)
                     && TYPE_OLE_OBJECT.equals(target.getContentType())) {
                 handleEmbeddedOLE(target, handler, sourceDesc + rel.getId(), parentMetadata);
-            } else if (RELATION_AUDIO.equals(type)
+            } else if (
+                    RELATION_MEDIA.equals(type)
+                    || RELATION_VIDEO.equals(type)
+                    || RELATION_AUDIO.equals(type)
                     || PackageRelationshipTypes.IMAGE_PART.equals(type)
                     || POIXMLDocument.PACK_OBJECT_REL_TYPE.equals(type)
                     || POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type)) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 7dc455f..e45ee2a 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -1773,6 +1773,16 @@ public class OOXMLParserTest extends TikaTest {
         assertNotContained("\u3068", xml);
     }
 
+    @Test
+    public void testEmbeddedMedia() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testPPT_embeddedMP3.pptx");
+        assertEquals(4, metadataList.size());
+        assertEquals("application/vnd.openxmlformats-officedocument.presentationml.presentation",
+                metadataList.get(0).get(Metadata.CONTENT_TYPE));
+        assertEquals("audio/mpeg", metadataList.get(1).get(Metadata.CONTENT_TYPE));
+        assertEquals("image/png", metadataList.get(2).get(Metadata.CONTENT_TYPE));
+        assertEquals("image/jpeg", metadataList.get(3).get(Metadata.CONTENT_TYPE));
+    }
 }
 
 
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
index cad4913..8385263 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXSLFExtractorTest.java
@@ -580,4 +580,15 @@ public class SXSLFExtractorTest extends TikaTest {
         assertNotContained("chartSpace", xml);
     }
 
+    @Test
+    public void testEmbeddedMedia() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testPPT_embeddedMP3.pptx", parseContext);
+        assertEquals(4, metadataList.size());
+        assertEquals("application/vnd.openxmlformats-officedocument.presentationml.presentation",
+                metadataList.get(0).get(Metadata.CONTENT_TYPE));
+        assertEquals("audio/mpeg", metadataList.get(1).get(Metadata.CONTENT_TYPE));
+        assertEquals("image/png", metadataList.get(2).get(Metadata.CONTENT_TYPE));
+        assertEquals("image/jpeg", metadataList.get(3).get(Metadata.CONTENT_TYPE));
+
+    }
 }
diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_embeddedMP3.pptx b/tika-parsers/src/test/resources/test-documents/testPPT_embeddedMP3.pptx
new file mode 100644
index 0000000..4a16900
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_embeddedMP3.pptx differ

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.