You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 12:13:55 UTC

tika git commit: TIKA-2024 extract original path name from OLE1.0 embedded objects

Repository: tika
Updated Branches:
  refs/heads/2.x c7a6bcac4 -> 4678d6733


TIKA-2024 extract original path name from OLE1.0 embedded objects


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/4678d673
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/4678d673
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/4678d673

Branch: refs/heads/2.x
Commit: 4678d67336fc3a6474904de0589d2f65afdb4cc0
Parents: c7a6bca
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 29 08:13:49 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 29 08:13:49 2016 -0400

----------------------------------------------------------------------
 .../parser/microsoft/AbstractPOIFSExtractor.java  |  13 ++++++++++++-
 .../microsoft/ooxml/AbstractOOXMLExtractor.java   |  11 +++++++++--
 .../tika/parser/microsoft/WordParserTest.java     |  17 +++++++++++++----
 .../parser/microsoft/ooxml/OOXMLParserTest.java   |  17 +++++++++++++----
 .../test-documents/test_recursive_embedded.doc    | Bin 0 -> 31744 bytes
 5 files changed, 47 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/4678d673/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 739af69..a71be5b 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -21,6 +21,7 @@ import java.io.IOException;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.poi.hpsf.ClassID;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.DocumentEntry;
@@ -28,7 +29,6 @@ import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.Ole10Native;
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
-import org.apache.poi.hpsf.ClassID;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.detect.DetectorProxy;
@@ -37,6 +37,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MimeType;
 import org.apache.tika.mime.MimeTypeException;
@@ -195,6 +196,12 @@ abstract class AbstractPOIFSExtractor {
                     if (ole.getLabel() != null) {
                         metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '/' + ole.getLabel());
                     }
+                    if (ole.getCommand() != null) {
+                        metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
+                    }
+                    if (ole.getFileName() != null) {
+                        metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
+                    }
                     byte[] data = ole.getDataBuffer();
                     embedded = TikaInputStream.get(data);
                 } catch (Ole10NativeException ex) {
@@ -204,6 +211,10 @@ abstract class AbstractPOIFSExtractor {
                 }
             } else if (type == POIFSDocumentType.COMP_OBJ) {
                 try {
+                    //TODO: figure out whether equivalents of OLE 1.0's
+                    //getCommand() and getFileName() exist for OLE 2.0, so that we can
+                    //populate TikaCoreProperties.ORIGINAL_RESOURCE_NAME
+
                     // Grab the contents and process
                     DocumentEntry contentsEntry;
                     try {

http://git-wip-us.apache.org/repos/asf/tika/blob/4678d673/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index cd1919d..67468b0 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -231,8 +231,9 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
                     && root.hasEntry("\u0001Ole")
                     && root.hasEntry("\u0001CompObj")) {
                 // TIKA-704: OLE 2.0 embedded non-Office document?
-                //TODO: original file paths can be stored underneath root
-                //figure out how to extract that info for: TikaCoreProperties.ORIGINAL_RESOURCE_NAME
+                //TODO: figure out whether equivalents of OLE 1.0's
+                //getCommand() and getFileName() exist for OLE 2.0, so that we can
+                //populate TikaCoreProperties.ORIGINAL_RESOURCE_NAME
                 stream = TikaInputStream.get(
                         fs.createDocumentInputStream("CONTENTS"));
                 if (embeddedExtractor.shouldParseEmbedded(metadata)) {
@@ -247,6 +248,12 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
                 if (ole.getLabel() != null) {
                     metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel());
                 }
+                if (ole.getCommand() != null) {
+                    metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
+                }
+                if (ole.getFileName() != null) {
+                    metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
+                }
                 byte[] data = ole.getDataBuffer();
                 if (data != null) {
                     stream = TikaInputStream.get(data);

http://git-wip-us.apache.org/repos/asf/tika/blob/4678d673/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index bfec2ad..10e48fd 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -16,15 +16,15 @@
  */
 package org.apache.tika.parser.microsoft;
 
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
 import java.io.InputStream;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Locale;
 
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
 import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
 import org.apache.tika.TikaTest;
@@ -502,5 +502,14 @@ public class WordParserTest extends TikaTest {
         assertContains("C:\\Lab Documents\\Lab Manuals\\Physics 275-6\\276-s00\\07-Force-on-a-current-S00.doc", values);
         assertContains("Hard Drive:Course Folders:276:276-s00:07-Force-on-a-current-S00", values);
     }
+
+    @Test
+    public void testOrigSourcePath() throws Exception {
+        Metadata embed1_zip_metadata = getRecursiveJson("test_recursive_embedded.doc").get(11);
+        assertContains("C:\\Users\\tallison\\AppData\\Local\\Temp\\embed1.zip",
+                Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
+        assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder (2)\\embed1.zip",
+                Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
+    }
 }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/4678d673/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 5159ade..d5fba31 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -16,6 +16,10 @@
  */
 package org.apache.tika.parser.microsoft.ooxml;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.sax.SAXTransformerFactory;
 import javax.xml.transform.sax.TransformerHandler;
@@ -24,15 +28,12 @@ import java.io.ByteArrayOutputStream;
 import java.io.InputStream;
 import java.io.PrintStream;
 import java.io.StringWriter;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
 import org.apache.tika.TikaTest;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.io.TikaInputStream;
@@ -1227,6 +1228,14 @@ public class OOXMLParserTest extends TikaTest {
         assertEquals("application/pdf", pdfMetadata.get(Metadata.CONTENT_TYPE));
     }
 
+    @Test
+    public void testOrigSourcePath() throws Exception {
+        Metadata embed1_zip_metadata = getRecursiveJson("test_recursive_embedded.docx").get(11);
+        assertContains("C:\\Users\\tallison\\AppData\\Local\\Temp\\embed1.zip",
+                Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
+        assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder (2)\\embed1.zip",
+                Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
+    }
 }
 
 

http://git-wip-us.apache.org/repos/asf/tika/blob/4678d673/tika-test-resources/src/test/resources/test-documents/test_recursive_embedded.doc
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/test_recursive_embedded.doc b/tika-test-resources/src/test/resources/test-documents/test_recursive_embedded.doc
new file mode 100644
index 0000000..a894ba3
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/test_recursive_embedded.doc differ