You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 00:54:51 UTC

[3/5] tika git commit: TIKA-2024 extract original file name/path where possible, take 1

TIKA-2024 extract original file name/path where possible, take 1


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/e62f2305
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/e62f2305
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/e62f2305

Branch: refs/heads/2.x
Commit: e62f2305783763aad0a2c587f96b162ae4be1c36
Parents: c84855f
Author: tballison <ta...@mitre.org>
Authored: Tue Jun 28 20:35:27 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jun 28 20:35:27 2016 -0400

----------------------------------------------------------------------
 .../tika/metadata/TikaCoreProperties.java       |   7 ++
 .../parser/apple/AppleSingleFileParser.java     |   4 +-
 .../parser/microsoft/JackcessExtractor.java     |   4 +-
 .../tika/parser/microsoft/OfficeParser.java     |   2 +-
 .../tika/parser/microsoft/WordExtractor.java    |  22 +++-
 .../microsoft/ooxml/AbstractOOXMLExtractor.java |   2 +
 .../microsoft/xml/AbstractXML2003Parser.java    |   6 +-
 .../tika/parser/microsoft/xml/WordMLParser.java |  53 +++++++-
 .../tika/parser/rtf/RTFObjDataParser.java       |   2 +
 .../parser/apple/AppleSingleFileParserTest.java |   3 +
 .../tika/parser/microsoft/WordParserTest.java   |  16 ++-
 .../parser/microsoft/xml/XML2003ParserTest.java |   7 +-
 .../apache/tika/parser/rtf/RTFParserTest.java   | 124 +++++++------------
 .../tika/parser/pdf/AbstractPDF2XHTML.java      |  22 ++--
 .../test-documents/testAppleSingleFile.pdf      | Bin 54926 -> 1893 bytes
 .../test-documents/testExcel_embeddedPDF.xls    | Bin 0 -> 38400 bytes
 .../test-documents/testExcel_embeddedPDF.xlsx   | Bin 0 -> 25602 bytes
 .../test-documents/testPPT_EmbeddedPDF.ppt      | Bin 0 -> 187392 bytes
 .../test-documents/testPPT_EmbeddedPDF.pptx     | Bin 0 -> 108637 bytes
 19 files changed, 170 insertions(+), 104 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 5052fbc..f4b97dd 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -76,6 +76,13 @@ public interface TikaCoreProperties {
             Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX+"warn");
 
     /**
+     * Some file formats can store information about their original
+     * file name/location or about their attachments' original file name/location.
+     */
+    public static final Property ORIGINAL_RESOURCE_NAME =
+            Property.internalTextBag(TIKA_META_PREFIX+"origResourceName");
+
+    /**
      * This is currently used to identify Content-Type that may be
      * included within a document, such as in html documents
      * (e.g. <meta http-equiv="content-type" content="text/html; charset=UTF-8">)

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
index 3f548ca..0f3c044 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
@@ -32,6 +32,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 import org.apache.tika.io.EndianUtils;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
@@ -133,8 +134,7 @@ public class AppleSingleFileParser extends AbstractParser {
                 IOUtils.readFully(stream, buffer);
                 bytesRead += f.length;
                 String originalFileName = new String(buffer, 0, buffer.length, StandardCharsets.US_ASCII);
-                //TODO: figure out correct metadata key
-                //embeddedMetadata.set(TikaCoreProperties.IDENTIFIER, originalFileName);
+                embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, originalFileName);
             } else if (f.entryId != DATA_FORK) {
                 IOUtils.skipFully(stream, f.length);
                 bytesRead += f.length;

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
index 345dd24..fb8a2c2 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
@@ -301,7 +301,9 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
                 break;
             case SIMPLE_PACKAGE:
                 OleBlob.SimplePackageContent spc = (OleBlob.SimplePackageContent) content;
-
+                //TODO: find test file that has this kind of attachment
+                //and see if getFilePath or getLocalFilePath is meaningful
+                //for TikaCoreProperties.ORIGINAL_RESOURCE_NAME
                 handleEmbeddedResource(
                         TikaInputStream.get(spc.getStream()),
                         spc.getFileName(),//filename

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index b8deb99..f5f9f3e 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -143,7 +143,7 @@ public class OfficeParser extends AbstractParser {
                 xhtml.element("p", publisherTextExtractor.getText());
                 break;
             case WORDDOCUMENT:
-                new WordExtractor(context).parse(root, xhtml);
+                new WordExtractor(context, metadata).parse(root, xhtml);
                 break;
             case POWERPOINT:
                 new HSLFExtractor(context).parse(root, xhtml);

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 4c950fa..8d36115 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -34,6 +34,8 @@ import org.apache.poi.hwpf.OldWordFileFormatException;
 import org.apache.poi.hwpf.extractor.Word6Extractor;
 import org.apache.poi.hwpf.model.FieldsDocumentPart;
 import org.apache.poi.hwpf.model.PicturesTable;
+import org.apache.poi.hwpf.model.SavedByEntry;
+import org.apache.poi.hwpf.model.SavedByTable;
 import org.apache.poi.hwpf.model.StyleDescription;
 import org.apache.poi.hwpf.usermodel.CharacterRun;
 import org.apache.poi.hwpf.usermodel.Field;
@@ -50,6 +52,8 @@ import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
@@ -79,8 +83,11 @@ public class WordExtractor extends AbstractPOIFSExtractor {
     private boolean curBold;
     private boolean curItalic;
 
-    public WordExtractor(ParseContext context) {
+    private final Metadata metadata;
+
+    public WordExtractor(ParseContext context, Metadata metadata) {
         super(context);
+        this.metadata = metadata;
     }
 
     private static int countParagraphs(Range... ranges) {
@@ -146,6 +153,9 @@ public class WordExtractor extends AbstractPOIFSExtractor {
             parseWord6(root, xhtml);
             return;
         }
+
+        extractSavedByMetadata(document);
+
         org.apache.poi.hwpf.extractor.WordExtractor wordExtractor =
                 new org.apache.poi.hwpf.extractor.WordExtractor(document);
         HeaderStories headerFooter = new HeaderStories(document);
@@ -212,6 +222,16 @@ public class WordExtractor extends AbstractPOIFSExtractor {
         }
     }
 
+    private void extractSavedByMetadata(HWPFDocument document) {
+        SavedByTable savedByTable = document.getSavedByTable();
+        if (savedByTable == null) {
+            return;
+        }
+        for (SavedByEntry sbe : savedByTable.getEntries()) {
+            metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, sbe.getSaveLocation());
+        }
+    }
+
     private void handleHeaderFooter(Range[] ranges, String type, HWPFDocument document,
                                     PicturesSource pictures, PicturesTable pictureTable, XHTMLContentHandler xhtml)
             throws SAXException, IOException, TikaException {

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 17e629f..84e9752 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -232,6 +232,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
                     && root.hasEntry("\u0001CompObj")
                     && root.hasEntry("\u0003ObjInfo")) {
                 // TIKA-704: OLE 2.0 embedded non-Office document?
+                //TODO: original file paths can be stored underneath root
+                //figure out how to extract that info for: TikaCoreProperties.ORIGINAL_RESOURCE_NAME
                 stream = TikaInputStream.get(
                         fs.createDocumentInputStream("CONTENTS"));
                 if (embeddedExtractor.shouldParseEmbedded(metadata)) {

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
index a12f25e..4630219 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
@@ -16,6 +16,9 @@
  */
 package org.apache.tika.parser.microsoft.xml;
 
+import java.io.IOException;
+import java.io.InputStream;
+
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
@@ -37,14 +40,13 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
-import java.io.IOException;
-import java.io.InputStream;
 
 public abstract class AbstractXML2003Parser extends AbstractParser {
 
     final static String MS_OFFICE_PROPERTIES_URN = "urn:schemas-microsoft-com:office:office";
     final static String MS_DOC_PROPERTIES_URN = "urn:schemas-microsoft-com:office:office";
     final static String MS_SPREADSHEET_URN = "urn:schemas-microsoft-com:office:spreadsheet";
+    final static String MS_VML_URN = "urn:schemas-microsoft-com:vml";
     final static String WORD_ML_URL = "http://schemas.microsoft.com/office/word/2003/wordml";
     final static Attributes EMPTY_ATTRS = new AttributesImpl();
 

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
index 28b33e4..67d13a9 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
@@ -31,6 +31,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.TeeContentHandler;
@@ -186,10 +187,12 @@ public class WordMLParser extends AbstractXML2003Parser {
     private class PictHandler extends DefaultHandler {
         final StringBuilder buffer = new StringBuilder();
         final ContentHandler handler;
+        byte[] rawBytes = null;
         EmbeddedDocumentExtractor embeddedDocumentExtractor;
         boolean inPict = false;
         boolean inBin = false;
         String pictName = null;
+        String pictSource = null;
         final Base64 base64 = new Base64();
 
         public PictHandler(ContentHandler handler, EmbeddedDocumentExtractor embeddedDocumentExtractor) {
@@ -210,6 +213,24 @@ public class WordMLParser extends AbstractXML2003Parser {
                         pictName = pictName.replaceFirst("wordml://", "");
                     }
                 }
+            } else if (MS_VML_URN.equals(uri)) {
+                if (localName.equals("imagedata")) {
+                    //src is an internal designator with an extension
+                    String src = attrs.getValue("", "src");
+                    //title appears to be the original file name
+                    String title = attrs.getValue(MS_OFFICE_PROPERTIES_URN, "title");
+                    if (title != null && ! title.equals("")) {
+                        if (src != null) {
+                            //take the extension from the src and append it to the title
+                            int i = src.lastIndexOf(".");
+                            if (i > -1 && i +1 < src.length()) {
+                                String ext = src.substring(i);
+                                title += ext;
+                            }
+                        }
+                        pictSource = title;
+                    }
+                }
             }
         }
 
@@ -227,6 +248,13 @@ public class WordMLParser extends AbstractXML2003Parser {
             if (!WORD_ML_URL.equals(uri)) {
                 return;
             }
+            //somewhat tricky...
+            //can't just dump bin_data at the end of the
+            //bin_data element because there may be metadata
+            //after it, if it is within a pict element
+            //<pict><binData></binData><imagedata/></pict>.
+            //However, if you aren't in a pict (say docOLEdata), then do dump binary
+            //data at the end of the bin data.
             if (PICT.equals(localName)) {
                 inPict = false;
                 AttributesImpl attrs = new AttributesImpl();
@@ -238,17 +266,29 @@ public class WordMLParser extends AbstractXML2003Parser {
                         IMG, IMG, attrs);
                 handler.endElement(
                         XHTMLContentHandler.XHTML, IMG, IMG);
+                handleEmbedded();
             } else if (BIN_DATA.equals(localName)) {
                 inBin = false;
-                byte[] bytes = base64.decode(buffer.toString());
-                if (bytes == null) {
-                    return;
+                rawBytes = base64.decode(buffer.toString());
+                //reset
+                buffer.setLength(0);
+
+                if (! inPict) {
+                    handleEmbedded();
                 }
-                try (TikaInputStream is = TikaInputStream.get(bytes)) {
+            }
+        }
+
+        private void handleEmbedded() throws SAXException {
+            if (rawBytes != null) {
+                try (TikaInputStream is = TikaInputStream.get(rawBytes)) {
                     Metadata metadata = new Metadata();
                     if (pictName != null) {
                         metadata.set(Metadata.RESOURCE_NAME_KEY, pictName);
                     }
+                    if (pictSource != null) {
+                        metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, pictSource);
+                    }
                     if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
                         embeddedDocumentExtractor.parseEmbedded(is,
                                 handler, metadata, false);
@@ -256,8 +296,11 @@ public class WordMLParser extends AbstractXML2003Parser {
                 } catch (IOException e) {
                     //log
                 }
-                buffer.setLength(0);
             }
+            //reset
+            pictName = null;
+            pictSource = null;
+            rawBytes = null;
         }
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
index 147d2e8..6426687 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
@@ -40,6 +40,7 @@ import org.apache.tika.io.EndianUtils;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.RTFMetadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
 
 /**
@@ -242,6 +243,7 @@ class RTFObjDataParser {
             fileNameToUse = displayName == null ? "" : displayName;
             pathToUse = ansiFilePath == null ? "" : ansiFilePath;
         }
+        metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileNameToUse);
         metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(fileNameToUse));
         metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pathToUse);
 

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java
index c80c94a..bd8156d 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java
@@ -25,6 +25,7 @@ import java.util.List;
 
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.junit.Test;
 
 public class AppleSingleFileParserTest extends TikaTest {
@@ -36,5 +37,7 @@ public class AppleSingleFileParserTest extends TikaTest {
         assertContains(AppleSingleFileParser.class.getName(),
                 Arrays.asList(list.get(0).getValues("X-Parsed-By")));
         assertEquals("application/pdf", list.get(1).get(Metadata.CONTENT_TYPE));
+        assertEquals("fltsyllabussortie2rev1_2.pdf", list.get(1).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index 138120e..9d9d372 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -16,13 +16,15 @@
  */
 package org.apache.tika.parser.microsoft;
 
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Locale;
+
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 
-import java.io.InputStream;
-import java.util.Locale;
-
 import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
 import org.apache.tika.TikaTest;
@@ -492,5 +494,13 @@ public class WordParserTest extends TikaTest {
         assertEquals("manager1", managers[0]);
         assertEquals("manager2", managers[1]);
     }
+
+    @Test
+    public void testOrigLocation() throws Exception {
+        Metadata metadata = getXML("testException2.doc").metadata;
+        List<String> values = Arrays.asList(metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+        assertContains("C:\\Lab Documents\\Lab Manuals\\Physics 275-6\\276-s00\\07-Force-on-a-current-S00.doc", values);
+        assertContains("Hard Drive:Course Folders:276:276-s00:07-Force-on-a-current-S00", values);
+    }
 }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
index 04530ce..510cd32 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
@@ -17,7 +17,6 @@
 package org.apache.tika.parser.microsoft.xml;
 
 import org.apache.tika.TikaTest;
-
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.OfficeOpenXMLCore;
@@ -26,11 +25,11 @@ import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.RecursiveParserWrapper;
 import org.junit.Test;
 
+import static org.junit.Assert.assertEquals;
+
 import java.util.Arrays;
 import java.util.List;
 
-import static org.junit.Assert.assertEquals;
-
 public class XML2003ParserTest extends TikaTest {
 
     @Test
@@ -80,6 +79,8 @@ public class XML2003ParserTest extends TikaTest {
         assertContains("R1 c1 R1 c2", txt);
         assertNotContained("footnoteFigure", txt);
         assertContains("footnote Figure", txt);
+
+        assertEquals("testJPEG_EXIF.jpg", list.get(7).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
         assertEquals("image/jpeg", list.get(7).get(Metadata.CONTENT_TYPE));
     }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
index dc75be5..d80842b 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
@@ -18,13 +18,13 @@ package org.apache.tika.parser.rtf;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 
 import java.io.InputStream;
-import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 
 import org.apache.commons.io.FilenameUtils;
@@ -381,83 +381,45 @@ public class RTFParserTest extends TikaTest {
     // TIKA-1010
     @Test
     public void testEmbeddedMonster() throws Exception {
-        Set<MediaType> skipTypes = new HashSet<MediaType>();
-        skipTypes.add(MediaType.parse("application/x-emf"));
-        skipTypes.add(MediaType.parse("application/x-msmetafile"));
-
-
-        List<String> trueNames = new ArrayList<String>();
-        trueNames.add("file_0.doc");
-        trueNames.add("Hw.txt");
-        trueNames.add("file_1.xlsx");
-        trueNames.add("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip");
-        trueNames.add("html-within-zip.zip");
-        trueNames.add("text.html");
-        trueNames.add("testHTML_utf8_\u666E\u6797\u65AF\u987F.html");
-        trueNames.add("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
-        trueNames.add("file_2.xls");
-        trueNames.add("testMSG_\u666E\u6797\u65AF\u987F.msg");
-        trueNames.add("file_3.pdf");
-        trueNames.add("file_4.ppt");
-        trueNames.add("file_5.pptx");
-        trueNames.add("thumbnail.jpeg");
-        trueNames.add("file_6.doc");
-        trueNames.add("file_7.doc");
-        trueNames.add("file_8.docx");
-        trueNames.add("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
-
-        List<String> trueTypes = new ArrayList<String>();
-        trueTypes.add("application/msword");
-        trueTypes.add("text/plain");
-        trueTypes.add("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
-        trueTypes.add("application/zip");
-        trueTypes.add("application/zip");
-        trueTypes.add("text/html");
-        trueTypes.add("text/html");
-        trueTypes.add("image/jpeg");
-        trueTypes.add("application/vnd.ms-excel");
-        trueTypes.add("application/vnd.ms-outlook");
-        trueTypes.add("application/pdf");
-        trueTypes.add("application/vnd.ms-powerpoint");
-        trueTypes.add("application/vnd.openxmlformats-officedocument.presentationml.presentation");
-        trueTypes.add("image/jpeg");
-        trueTypes.add("application/msword");
-        trueTypes.add("application/msword");
-        trueTypes.add("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
-        trueTypes.add("image/jpeg");
-
-        TrackingHandler tracker = new TrackingHandler(skipTypes);
-        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf"))) {
-            ContainerExtractor ex = new ParserContainerExtractor();
-            assertEquals(true, ex.isSupported(tis));
-            ex.extract(tis, ex, tracker);
-        }
 
-        assertEquals(trueNames.size(), tracker.filenames.size());
-        assertEquals(trueTypes.size(), tracker.mediaTypes.size());
-        for (int i = 0; i < tracker.filenames.size(); i++) {
-            String expectedName = trueNames.get(i);
-            if (expectedName == null) {
-                assertNull(tracker.filenames.get(i));
-            } else {
-                assertNotNull(tracker.filenames.get(i));
-                //necessary to getName() because MSOffice extractor includes
-                //directory: _1457338524/HW.txt
-                assertEquals("filename equals ",
-                        expectedName, FilenameUtils.getName(tracker.filenames.get(i)));
-            }
-            assertEquals(trueTypes.get(i), tracker.mediaTypes.get(i).toString());
-        }
-
-        tracker = new TrackingHandler();
-        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf"))) {
-            ContainerExtractor ex = new ParserContainerExtractor();
-            assertEquals(true, ex.isSupported(tis));
-            ex.extract(tis, ex, tracker);
+        Map<Integer, Pair> expected = new HashMap<>();
+        expected.put(2, new Pair("Hw.txt","text/plain; charset=ISO-8859-1"));
+        expected.put(3, new Pair("file_0.doc", "application/msword"));
+        expected.put(6, new Pair("file_1.xlsx",
+                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
+        expected.put(9, new Pair("text.html", "text/html; charset=windows-1252"));
+        expected.put(10, new Pair("html-within-zip.zip", "application/zip"));
+        expected.put(11, new Pair("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip", "application/zip"));
+        expected.put(14, new Pair("testHTML_utf8_\u666E\u6797\u65AF\u987F.html", "text/html; charset=UTF-8"));
+        expected.put(17, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
+        expected.put(20, new Pair("file_2.xls", "application/vnd.ms-excel"));
+        expected.put(23, new Pair("testMSG_\u666E\u6797\u65AF\u987F.msg", "application/vnd.ms-outlook"));
+        expected.put(26, new Pair("file_3.pdf", "application/pdf"));
+        expected.put(29, new Pair("file_4.ppt", "application/vnd.ms-powerpoint"));
+        expected.put(33, new Pair("file_5.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"));
+        expected.put(32, new Pair("thumbnail.jpeg", "image/jpeg"));
+        expected.put(36, new Pair("file_6.doc", "application/msword"));
+        expected.put(39, new Pair("file_7.doc", "application/msword"));
+        expected.put(42, new Pair("file_8.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
+        expected.put(45, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
+
+
+        List<Metadata> metadataList = getRecursiveJson("testRTFEmbeddedFiles.rtf");
+        assertEquals(48, metadataList.size());
+        for (Map.Entry<Integer, Pair> e : expected.entrySet()) {
+            Metadata metadata = metadataList.get(e.getKey());
+            Pair p = e.getValue();
+            assertNotNull(metadata.get(Metadata.RESOURCE_NAME_KEY));
+            //necessary to getName() because MSOffice extractor includes
+            //directory: _1457338524/HW.txt
+            assertEquals("filename equals ",
+                    p.fileName, FilenameUtils.getName(
+                            metadata.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)));
+
+            assertEquals(p.mimeType, metadata.get(Metadata.CONTENT_TYPE));
         }
-        assertEquals(47, tracker.filenames.size());
-        assertEquals("thumbnail_26.emf", tracker.filenames.get(45));
-        assertEquals("thumbnail_27.wmf", tracker.filenames.get(46));
+        assertEquals("C:\\Users\\tallison\\AppData\\Local\\Temp\\testJPEG_\u666e\u6797\u65af\u987f.jpg",
+                metadataList.get(45).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
     }
 
     //TIKA-1010 test regular (not "embedded") images/picts
@@ -537,4 +499,12 @@ public class RTFParserTest extends TikaTest {
         assertEquals(2, tracker.filenames.size());
     }
 
+    private static class Pair { // expected file name and MIME type for one entry of the recursive metadata list
+        final String fileName; // checked against the base name of the entry's embedded resource path
+        final String mimeType; // checked against the entry's Metadata.CONTENT_TYPE value
+        Pair(String fileName, String mimeType) {
+            this.fileName = fileName;
+            this.mimeType = mimeType;
+        }
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 9a73bde..832b06e 100644
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -33,6 +33,8 @@ import java.util.Locale;
 import java.util.Map;
 import java.util.TreeMap;
 
+import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR;
+
 import javax.xml.stream.XMLStreamException;
 import org.apache.commons.io.IOExceptionWithCause;
 import org.apache.commons.io.IOUtils;
@@ -176,7 +178,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         }
     }
 
-    private void extractMultiOSPDEmbeddedFiles(String defaultName,
+    private void extractMultiOSPDEmbeddedFiles(String displayName,
                                        PDComplexFileSpecification spec,
                                        EmbeddedDocumentExtractor extractor) throws IOException,
             SAXException, TikaException {
@@ -185,13 +187,14 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             return;
         }
         //current strategy is to pull all, not just first non-null
-        extractPDEmbeddedFile(defaultName, spec.getFile(), spec.getEmbeddedFile(), extractor);
-        extractPDEmbeddedFile(defaultName, spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
-        extractPDEmbeddedFile(defaultName, spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
-        extractPDEmbeddedFile(defaultName, spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
+        extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFile(), spec.getEmbeddedFile(), extractor);
+        extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
+        extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
+        extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
     }
 
-    private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file,
+    private void extractPDEmbeddedFile(String displayName, String unicodeFileName,
+                                       String fileName, PDEmbeddedFile file,
                                        EmbeddedDocumentExtractor extractor)
             throws SAXException, IOException, TikaException {
 
@@ -199,8 +202,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             //skip silently
             return;
         }
-
-        fileName = (fileName == null) ? defaultName : fileName;
+        
+        fileName = (fileName == null) ? displayName : fileName;
 
         // TODO: other metadata?
         Metadata metadata = new Metadata();
@@ -209,6 +212,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
         metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
+        metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
 
         if (extractor.shouldParseEmbedded(metadata)) {
             TikaInputStream stream = null;
@@ -289,7 +293,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
                     PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
                     PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
                     try {
-                        extractMultiOSPDEmbeddedFiles("", fileSpec, extractor);
+                        extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, extractor);
                     } catch (SAXException e) {
                         throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
                     } catch (TikaException e) {

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-test-resources/src/test/resources/test-documents/testAppleSingleFile.pdf
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testAppleSingleFile.pdf b/tika-test-resources/src/test/resources/test-documents/testAppleSingleFile.pdf
index a385313..a407ded 100644
Binary files a/tika-test-resources/src/test/resources/test-documents/testAppleSingleFile.pdf and b/tika-test-resources/src/test/resources/test-documents/testAppleSingleFile.pdf differ

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls b/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls
new file mode 100644
index 0000000..c38f64c
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls differ

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx b/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx
new file mode 100644
index 0000000..9c0d2b9
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx differ

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt b/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt
new file mode 100644
index 0000000..3129be1
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt differ

http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx b/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx
new file mode 100644
index 0000000..a96aa3c
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx differ