You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 12:10:54 UTC

tika git commit: TIKA-2024 add location extraction for OLE1.0 embedded files

Repository: tika
Updated Branches:
  refs/heads/master 52f04bea6 -> a57d8364c


TIKA-2024 add location extraction for OLE1.0 embedded files


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/a57d8364
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/a57d8364
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/a57d8364

Branch: refs/heads/master
Commit: a57d8364c378778a14b205e0528e43a1f9fac417
Parents: 52f04be
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 29 08:10:45 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 29 08:10:45 2016 -0400

----------------------------------------------------------------------
 .../parser/microsoft/AbstractPOIFSExtractor.java   |  13 ++++++++++++-
 .../microsoft/ooxml/AbstractOOXMLExtractor.java    |  11 +++++++++--
 .../tika/parser/microsoft/WordParserTest.java      |   9 +++++++++
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |   9 +++++++++
 .../test-documents/test_recursive_embedded.doc     | Bin 0 -> 31744 bytes
 5 files changed, 39 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/a57d8364/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index a240526..41e925d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -21,6 +21,7 @@ import java.io.IOException;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.poi.hpsf.ClassID;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.DocumentEntry;
@@ -28,7 +29,6 @@ import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.Ole10Native;
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
-import org.apache.poi.hpsf.ClassID;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
@@ -36,6 +36,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MimeType;
 import org.apache.tika.mime.MimeTypeException;
@@ -194,6 +195,12 @@ abstract class AbstractPOIFSExtractor {
                     if (ole.getLabel() != null) {
                         metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '/' + ole.getLabel());
                     }
+                    if (ole.getCommand() != null) {
+                        metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
+                    }
+                    if (ole.getFileName() != null) {
+                        metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
+                    }
                     byte[] data = ole.getDataBuffer();
                     embedded = TikaInputStream.get(data);
                 } catch (Ole10NativeException ex) {
@@ -203,6 +210,10 @@ abstract class AbstractPOIFSExtractor {
                 }
             } else if (type == POIFSDocumentType.COMP_OBJ) {
                 try {
+                    //TODO: figure out whether equivalents of OLE 1.0's
+                    //getCommand() and getFileName() exist for OLE 2.0 to populate
+                    //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
+
                     // Grab the contents and process
                     DocumentEntry contentsEntry;
                     try {

http://git-wip-us.apache.org/repos/asf/tika/blob/a57d8364/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index cd1919d..67468b0 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -231,8 +231,9 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
                     && root.hasEntry("\u0001Ole")
                     && root.hasEntry("\u0001CompObj")) {
                 // TIKA-704: OLE 2.0 embedded non-Office document?
-                //TODO: original file paths can be stored underneath root
-                //figure out how to extract that info for: TikaCoreProperties.ORIGINAL_RESOURCE_NAME
+                //TODO: figure out whether equivalents of OLE 1.0's
+                //getCommand() and getFileName() exist for OLE 2.0 to populate
+                //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
                 stream = TikaInputStream.get(
                         fs.createDocumentInputStream("CONTENTS"));
                 if (embeddedExtractor.shouldParseEmbedded(metadata)) {
@@ -247,6 +248,12 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
                 if (ole.getLabel() != null) {
                     metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel());
                 }
+                if (ole.getCommand() != null) {
+                    metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
+                }
+                if (ole.getFileName() != null) {
+                    metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
+                }
                 byte[] data = ole.getDataBuffer();
                 if (data != null) {
                     stream = TikaInputStream.get(data);

http://git-wip-us.apache.org/repos/asf/tika/blob/a57d8364/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index e909d27..10e48fd 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -502,5 +502,14 @@ public class WordParserTest extends TikaTest {
         assertContains("C:\\Lab Documents\\Lab Manuals\\Physics 275-6\\276-s00\\07-Force-on-a-current-S00.doc", values);
         assertContains("Hard Drive:Course Folders:276:276-s00:07-Force-on-a-current-S00", values);
     }
+
+    @Test
+    public void testOrigSourcePath() throws Exception {
+        Metadata embed1_zip_metadata = getRecursiveJson("test_recursive_embedded.doc").get(11);
+        assertContains("C:\\Users\\tallison\\AppData\\Local\\Temp\\embed1.zip",
+                Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
+        assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder (2)\\embed1.zip",
+                Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
+    }
 }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/a57d8364/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 98e6ce9..4cb1271 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -28,6 +28,7 @@ import java.io.ByteArrayOutputStream;
 import java.io.InputStream;
 import java.io.PrintStream;
 import java.io.StringWriter;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
@@ -1228,6 +1229,14 @@ public class OOXMLParserTest extends TikaTest {
         assertContains("Hello World", pdfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
     }
 
+    @Test
+    public void testOrigSourcePath() throws Exception {
+        Metadata embed1_zip_metadata = getRecursiveJson("test_recursive_embedded.docx").get(11);
+        assertContains("C:\\Users\\tallison\\AppData\\Local\\Temp\\embed1.zip",
+                Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
+        assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder (2)\\embed1.zip",
+                Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
+    }
 }
 
 

http://git-wip-us.apache.org/repos/asf/tika/blob/a57d8364/tika-parsers/src/test/resources/test-documents/test_recursive_embedded.doc
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/test_recursive_embedded.doc b/tika-parsers/src/test/resources/test-documents/test_recursive_embedded.doc
new file mode 100644
index 0000000..a894ba3
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/test_recursive_embedded.doc differ