You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 12:10:54 UTC
tika git commit: TIKA-2024 add location extraction for OLE1.0 embedded files
Repository: tika
Updated Branches:
refs/heads/master 52f04bea6 -> a57d8364c
TIKA-2024 add location extraction for OLE1.0 embedded files
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/a57d8364
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/a57d8364
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/a57d8364
Branch: refs/heads/master
Commit: a57d8364c378778a14b205e0528e43a1f9fac417
Parents: 52f04be
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 29 08:10:45 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 29 08:10:45 2016 -0400
----------------------------------------------------------------------
.../parser/microsoft/AbstractPOIFSExtractor.java | 13 ++++++++++++-
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 11 +++++++++--
.../tika/parser/microsoft/WordParserTest.java | 9 +++++++++
.../parser/microsoft/ooxml/OOXMLParserTest.java | 9 +++++++++
.../test-documents/test_recursive_embedded.doc | Bin 0 -> 31744 bytes
5 files changed, 39 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/a57d8364/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index a240526..41e925d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -21,6 +21,7 @@ import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.poi.hpsf.ClassID;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
@@ -28,7 +29,6 @@ import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
-import org.apache.poi.hpsf.ClassID;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
@@ -36,6 +36,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
@@ -194,6 +195,12 @@ abstract class AbstractPOIFSExtractor {
if (ole.getLabel() != null) {
metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '/' + ole.getLabel());
}
+ if (ole.getCommand() != null) {
+ metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
+ }
+ if (ole.getFileName() != null) {
+ metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
+ }
byte[] data = ole.getDataBuffer();
embedded = TikaInputStream.get(data);
} catch (Ole10NativeException ex) {
@@ -203,6 +210,10 @@ abstract class AbstractPOIFSExtractor {
}
} else if (type == POIFSDocumentType.COMP_OBJ) {
try {
+ //TODO: figure out if the equivalent of OLE 1.0's
+ //getCommand() and getFileName() exist for OLE 2.0 to populate
+ //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
+
// Grab the contents and process
DocumentEntry contentsEntry;
try {
http://git-wip-us.apache.org/repos/asf/tika/blob/a57d8364/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index cd1919d..67468b0 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -231,8 +231,9 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
&& root.hasEntry("\u0001Ole")
&& root.hasEntry("\u0001CompObj")) {
// TIKA-704: OLE 2.0 embedded non-Office document?
- //TODO: original file paths can be stored underneath root
- //figure out how to extract that info for: TikaCoreProperties.ORIGINAL_RESOURCE_NAME
+ //TODO: figure out if the equivalent of OLE 1.0's
+ //getCommand() and getFileName() exist for OLE 2.0 to populate
+ //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
stream = TikaInputStream.get(
fs.createDocumentInputStream("CONTENTS"));
if (embeddedExtractor.shouldParseEmbedded(metadata)) {
@@ -247,6 +248,12 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
if (ole.getLabel() != null) {
metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel());
}
+ if (ole.getCommand() != null) {
+ metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
+ }
+ if (ole.getFileName() != null) {
+ metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
+ }
byte[] data = ole.getDataBuffer();
if (data != null) {
stream = TikaInputStream.get(data);
http://git-wip-us.apache.org/repos/asf/tika/blob/a57d8364/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index e909d27..10e48fd 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -502,5 +502,14 @@ public class WordParserTest extends TikaTest {
assertContains("C:\\Lab Documents\\Lab Manuals\\Physics 275-6\\276-s00\\07-Force-on-a-current-S00.doc", values);
assertContains("Hard Drive:Course Folders:276:276-s00:07-Force-on-a-current-S00", values);
}
+
+ @Test
+ public void testOrigSourcePath() throws Exception {
+ Metadata embed1_zip_metadata = getRecursiveJson("test_recursive_embedded.doc").get(11);
+ assertContains("C:\\Users\\tallison\\AppData\\Local\\Temp\\embed1.zip",
+ Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
+ assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder (2)\\embed1.zip",
+ Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/a57d8364/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 98e6ce9..4cb1271 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -28,6 +28,7 @@ import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.StringWriter;
+import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
@@ -1228,6 +1229,14 @@ public class OOXMLParserTest extends TikaTest {
assertContains("Hello World", pdfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
}
+ @Test
+ public void testOrigSourcePath() throws Exception {
+ Metadata embed1_zip_metadata = getRecursiveJson("test_recursive_embedded.docx").get(11);
+ assertContains("C:\\Users\\tallison\\AppData\\Local\\Temp\\embed1.zip",
+ Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
+ assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder (2)\\embed1.zip",
+ Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/a57d8364/tika-parsers/src/test/resources/test-documents/test_recursive_embedded.doc
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/test_recursive_embedded.doc b/tika-parsers/src/test/resources/test-documents/test_recursive_embedded.doc
new file mode 100644
index 0000000..a894ba3
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/test_recursive_embedded.doc differ