You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 12:13:55 UTC
tika git commit: TIKA-2024 extract original path name from OLE1.0 embedded objects
Repository: tika
Updated Branches:
refs/heads/2.x c7a6bcac4 -> 4678d6733
TIKA-2024 extract original path name from OLE1.0 embedded objects
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/4678d673
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/4678d673
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/4678d673
Branch: refs/heads/2.x
Commit: 4678d67336fc3a6474904de0589d2f65afdb4cc0
Parents: c7a6bca
Author: tballison <ta...@mitre.org>
Authored: Wed Jun 29 08:13:49 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Wed Jun 29 08:13:49 2016 -0400
----------------------------------------------------------------------
.../parser/microsoft/AbstractPOIFSExtractor.java | 13 ++++++++++++-
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 11 +++++++++--
.../tika/parser/microsoft/WordParserTest.java | 17 +++++++++++++----
.../parser/microsoft/ooxml/OOXMLParserTest.java | 17 +++++++++++++----
.../test-documents/test_recursive_embedded.doc | Bin 0 -> 31744 bytes
5 files changed, 47 insertions(+), 11 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/4678d673/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 739af69..a71be5b 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -21,6 +21,7 @@ import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.poi.hpsf.ClassID;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
@@ -28,7 +29,6 @@ import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
-import org.apache.poi.hpsf.ClassID;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.detect.DetectorProxy;
@@ -37,6 +37,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
@@ -195,6 +196,12 @@ abstract class AbstractPOIFSExtractor {
if (ole.getLabel() != null) {
metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '/' + ole.getLabel());
}
+ if (ole.getCommand() != null) {
+ metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
+ }
+ if (ole.getFileName() != null) {
+ metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
+ }
byte[] data = ole.getDataBuffer();
embedded = TikaInputStream.get(data);
} catch (Ole10NativeException ex) {
@@ -204,6 +211,10 @@ abstract class AbstractPOIFSExtractor {
}
} else if (type == POIFSDocumentType.COMP_OBJ) {
try {
+ //TODO: figure out if the equivalent of OLE 1.0's
+ //getCommand() and getFileName() exist for OLE 2.0 to populate
+ //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
+
// Grab the contents and process
DocumentEntry contentsEntry;
try {
http://git-wip-us.apache.org/repos/asf/tika/blob/4678d673/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index cd1919d..67468b0 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -231,8 +231,9 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
&& root.hasEntry("\u0001Ole")
&& root.hasEntry("\u0001CompObj")) {
// TIKA-704: OLE 2.0 embedded non-Office document?
- //TODO: original file paths can be stored underneath root
- //figure out how to extract that info for: TikaCoreProperties.ORIGINAL_RESOURCE_NAME
+ //TODO: figure out if the equivalent of OLE 1.0's
+ //getCommand() and getFileName() exist for OLE 2.0 to populate
+ //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
stream = TikaInputStream.get(
fs.createDocumentInputStream("CONTENTS"));
if (embeddedExtractor.shouldParseEmbedded(metadata)) {
@@ -247,6 +248,12 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
if (ole.getLabel() != null) {
metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel());
}
+ if (ole.getCommand() != null) {
+ metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
+ }
+ if (ole.getFileName() != null) {
+ metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
+ }
byte[] data = ole.getDataBuffer();
if (data != null) {
stream = TikaInputStream.get(data);
http://git-wip-us.apache.org/repos/asf/tika/blob/4678d673/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index bfec2ad..10e48fd 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -16,15 +16,15 @@
*/
package org.apache.tika.parser.microsoft;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertTrue;
-
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.tika.TikaTest;
@@ -502,5 +502,14 @@ public class WordParserTest extends TikaTest {
assertContains("C:\\Lab Documents\\Lab Manuals\\Physics 275-6\\276-s00\\07-Force-on-a-current-S00.doc", values);
assertContains("Hard Drive:Course Folders:276:276-s00:07-Force-on-a-current-S00", values);
}
+
+ @Test
+ public void testOrigSourcePath() throws Exception {
+ Metadata embed1_zip_metadata = getRecursiveJson("test_recursive_embedded.doc").get(11);
+ assertContains("C:\\Users\\tallison\\AppData\\Local\\Temp\\embed1.zip",
+ Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
+ assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder (2)\\embed1.zip",
+ Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/4678d673/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 5159ade..d5fba31 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -16,6 +16,10 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
import javax.xml.transform.OutputKeys;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
@@ -24,15 +28,12 @@ import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.StringWriter;
+import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
-import static java.nio.charset.StandardCharsets.UTF_8;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
import org.apache.tika.TikaTest;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.io.TikaInputStream;
@@ -1227,6 +1228,14 @@ public class OOXMLParserTest extends TikaTest {
assertEquals("application/pdf", pdfMetadata.get(Metadata.CONTENT_TYPE));
}
+ @Test
+ public void testOrigSourcePath() throws Exception {
+ Metadata embed1_zip_metadata = getRecursiveJson("test_recursive_embedded.docx").get(11);
+ assertContains("C:\\Users\\tallison\\AppData\\Local\\Temp\\embed1.zip",
+ Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
+ assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder (2)\\embed1.zip",
+ Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/4678d673/tika-test-resources/src/test/resources/test-documents/test_recursive_embedded.doc
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/test_recursive_embedded.doc b/tika-test-resources/src/test/resources/test-documents/test_recursive_embedded.doc
new file mode 100644
index 0000000..a894ba3
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/test_recursive_embedded.doc differ