You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 00:54:51 UTC
[3/5] tika git commit: TIKA-2024 extract original file name/path where possible, take 1
TIKA-2024 extract original file name/path where possible, take 1
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/e62f2305
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/e62f2305
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/e62f2305
Branch: refs/heads/2.x
Commit: e62f2305783763aad0a2c587f96b162ae4be1c36
Parents: c84855f
Author: tballison <ta...@mitre.org>
Authored: Tue Jun 28 20:35:27 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jun 28 20:35:27 2016 -0400
----------------------------------------------------------------------
.../tika/metadata/TikaCoreProperties.java | 7 ++
.../parser/apple/AppleSingleFileParser.java | 4 +-
.../parser/microsoft/JackcessExtractor.java | 4 +-
.../tika/parser/microsoft/OfficeParser.java | 2 +-
.../tika/parser/microsoft/WordExtractor.java | 22 +++-
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 2 +
.../microsoft/xml/AbstractXML2003Parser.java | 6 +-
.../tika/parser/microsoft/xml/WordMLParser.java | 53 +++++++-
.../tika/parser/rtf/RTFObjDataParser.java | 2 +
.../parser/apple/AppleSingleFileParserTest.java | 3 +
.../tika/parser/microsoft/WordParserTest.java | 16 ++-
.../parser/microsoft/xml/XML2003ParserTest.java | 7 +-
.../apache/tika/parser/rtf/RTFParserTest.java | 124 +++++++------------
.../tika/parser/pdf/AbstractPDF2XHTML.java | 22 ++--
.../test-documents/testAppleSingleFile.pdf | Bin 54926 -> 1893 bytes
.../test-documents/testExcel_embeddedPDF.xls | Bin 0 -> 38400 bytes
.../test-documents/testExcel_embeddedPDF.xlsx | Bin 0 -> 25602 bytes
.../test-documents/testPPT_EmbeddedPDF.ppt | Bin 0 -> 187392 bytes
.../test-documents/testPPT_EmbeddedPDF.pptx | Bin 0 -> 108637 bytes
19 files changed, 170 insertions(+), 104 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 5052fbc..f4b97dd 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -76,6 +76,13 @@ public interface TikaCoreProperties {
Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX+"warn");
/**
+ * Some file formats can store information about their original
+ * file name/location or about their attachment's original file name/location.
+ */
+ public static final Property ORIGINAL_RESOURCE_NAME =
+ Property.internalTextBag(TIKA_META_PREFIX+"origResourceName");
+
+ /**
* This is currently used to identify Content-Type that may be
* included within a document, such as in html documents
* (e.g. <meta http-equiv="content-type" content="text/html; charset=UTF-8">)
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
index 3f548ca..0f3c044 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
@@ -32,6 +32,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
@@ -133,8 +134,7 @@ public class AppleSingleFileParser extends AbstractParser {
IOUtils.readFully(stream, buffer);
bytesRead += f.length;
String originalFileName = new String(buffer, 0, buffer.length, StandardCharsets.US_ASCII);
- //TODO: figure out correct metadata key
- //embeddedMetadata.set(TikaCoreProperties.IDENTIFIER, originalFileName);
+ embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, originalFileName);
} else if (f.entryId != DATA_FORK) {
IOUtils.skipFully(stream, f.length);
bytesRead += f.length;
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
index 345dd24..fb8a2c2 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/JackcessExtractor.java
@@ -301,7 +301,9 @@ class JackcessExtractor extends AbstractPOIFSExtractor {
break;
case SIMPLE_PACKAGE:
OleBlob.SimplePackageContent spc = (OleBlob.SimplePackageContent) content;
-
+ //TODO: find test file that has this kind of attachment
+ //and see if getFilePath or getLocalFilePath is meaningful
+ //for TikaCoreProperties.ORIGINAL_RESOURCE_NAME
handleEmbeddedResource(
TikaInputStream.get(spc.getStream()),
spc.getFileName(),//filename
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index b8deb99..f5f9f3e 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -143,7 +143,7 @@ public class OfficeParser extends AbstractParser {
xhtml.element("p", publisherTextExtractor.getText());
break;
case WORDDOCUMENT:
- new WordExtractor(context).parse(root, xhtml);
+ new WordExtractor(context, metadata).parse(root, xhtml);
break;
case POWERPOINT:
new HSLFExtractor(context).parse(root, xhtml);
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 4c950fa..8d36115 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -34,6 +34,8 @@ import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.model.FieldsDocumentPart;
import org.apache.poi.hwpf.model.PicturesTable;
+import org.apache.poi.hwpf.model.SavedByEntry;
+import org.apache.poi.hwpf.model.SavedByTable;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Field;
@@ -50,6 +52,8 @@ import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
@@ -79,8 +83,11 @@ public class WordExtractor extends AbstractPOIFSExtractor {
private boolean curBold;
private boolean curItalic;
- public WordExtractor(ParseContext context) {
+ private final Metadata metadata;
+
+ public WordExtractor(ParseContext context, Metadata metadata) {
super(context);
+ this.metadata = metadata;
}
private static int countParagraphs(Range... ranges) {
@@ -146,6 +153,9 @@ public class WordExtractor extends AbstractPOIFSExtractor {
parseWord6(root, xhtml);
return;
}
+
+ extractSavedByMetadata(document);
+
org.apache.poi.hwpf.extractor.WordExtractor wordExtractor =
new org.apache.poi.hwpf.extractor.WordExtractor(document);
HeaderStories headerFooter = new HeaderStories(document);
@@ -212,6 +222,16 @@ public class WordExtractor extends AbstractPOIFSExtractor {
}
}
+ private void extractSavedByMetadata(HWPFDocument document) {
+ SavedByTable savedByTable = document.getSavedByTable();
+ if (savedByTable == null) {
+ return;
+ }
+ for (SavedByEntry sbe : savedByTable.getEntries()) {
+ metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, sbe.getSaveLocation());
+ }
+ }
+
private void handleHeaderFooter(Range[] ranges, String type, HWPFDocument document,
PicturesSource pictures, PicturesTable pictureTable, XHTMLContentHandler xhtml)
throws SAXException, IOException, TikaException {
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 17e629f..84e9752 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -232,6 +232,8 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor {
&& root.hasEntry("\u0001CompObj")
&& root.hasEntry("\u0003ObjInfo")) {
// TIKA-704: OLE 2.0 embedded non-Office document?
+ //TODO: original file paths can be stored underneath root
+ //figure out how to extract that info for: TikaCoreProperties.ORIGINAL_RESOURCE_NAME
stream = TikaInputStream.get(
fs.createDocumentInputStream("CONTENTS"));
if (embeddedExtractor.shouldParseEmbedded(metadata)) {
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
index a12f25e..4630219 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/AbstractXML2003Parser.java
@@ -16,6 +16,9 @@
*/
package org.apache.tika.parser.microsoft.xml;
+import java.io.IOException;
+import java.io.InputStream;
+
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
@@ -37,14 +40,13 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
-import java.io.IOException;
-import java.io.InputStream;
public abstract class AbstractXML2003Parser extends AbstractParser {
final static String MS_OFFICE_PROPERTIES_URN = "urn:schemas-microsoft-com:office:office";
final static String MS_DOC_PROPERTIES_URN = "urn:schemas-microsoft-com:office:office";
final static String MS_SPREADSHEET_URN = "urn:schemas-microsoft-com:office:spreadsheet";
+ final static String MS_VML_URN = "urn:schemas-microsoft-com:vml";
final static String WORD_ML_URL = "http://schemas.microsoft.com/office/word/2003/wordml";
final static Attributes EMPTY_ATTRS = new AttributesImpl();
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
index 28b33e4..67d13a9 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
@@ -31,6 +31,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.TeeContentHandler;
@@ -186,10 +187,12 @@ public class WordMLParser extends AbstractXML2003Parser {
private class PictHandler extends DefaultHandler {
final StringBuilder buffer = new StringBuilder();
final ContentHandler handler;
+ byte[] rawBytes = null;
EmbeddedDocumentExtractor embeddedDocumentExtractor;
boolean inPict = false;
boolean inBin = false;
String pictName = null;
+ String pictSource = null;
final Base64 base64 = new Base64();
public PictHandler(ContentHandler handler, EmbeddedDocumentExtractor embeddedDocumentExtractor) {
@@ -210,6 +213,24 @@ public class WordMLParser extends AbstractXML2003Parser {
pictName = pictName.replaceFirst("wordml://", "");
}
}
+ } else if (MS_VML_URN.equals(uri)) {
+ if (localName.equals("imagedata")) {
+ //src is an internal designator with an extension
+ String src = attrs.getValue("", "src");
+ //title appears to be the original file name
+ String title = attrs.getValue(MS_OFFICE_PROPERTIES_URN, "title");
+ if (title != null && ! title.equals("")) {
+ if (src != null) {
+ //take the extension from the src and append it to the title
+ int i = src.lastIndexOf(".");
+ if (i > -1 && i +1 < src.length()) {
+ String ext = src.substring(i);
+ title += ext;
+ }
+ }
+ pictSource = title;
+ }
+ }
}
}
@@ -227,6 +248,13 @@ public class WordMLParser extends AbstractXML2003Parser {
if (!WORD_ML_URL.equals(uri)) {
return;
}
+ //somewhat tricky...
+ //can't just dump bin_data at the end of the
+ //bin_data element because there may be metadata
+ //after it, if it is within a pict element
+ //<pict><binData></binData><imagedata/></pict>.
+ //However, if you aren't in a pict (say docOLEdata), then do dump binary
+ //data at the end of the bin data.
if (PICT.equals(localName)) {
inPict = false;
AttributesImpl attrs = new AttributesImpl();
@@ -238,17 +266,29 @@ public class WordMLParser extends AbstractXML2003Parser {
IMG, IMG, attrs);
handler.endElement(
XHTMLContentHandler.XHTML, IMG, IMG);
+ handleEmbedded();
} else if (BIN_DATA.equals(localName)) {
inBin = false;
- byte[] bytes = base64.decode(buffer.toString());
- if (bytes == null) {
- return;
+ rawBytes = base64.decode(buffer.toString());
+ //reset
+ buffer.setLength(0);
+
+ if (! inPict) {
+ handleEmbedded();
}
- try (TikaInputStream is = TikaInputStream.get(bytes)) {
+ }
+ }
+
+ private void handleEmbedded() throws SAXException {
+ if (rawBytes != null) {
+ try (TikaInputStream is = TikaInputStream.get(rawBytes)) {
Metadata metadata = new Metadata();
if (pictName != null) {
metadata.set(Metadata.RESOURCE_NAME_KEY, pictName);
}
+ if (pictSource != null) {
+ metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, pictSource);
+ }
if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
embeddedDocumentExtractor.parseEmbedded(is,
handler, metadata, false);
@@ -256,8 +296,11 @@ public class WordMLParser extends AbstractXML2003Parser {
} catch (IOException e) {
//log
}
- buffer.setLength(0);
}
+ //reset
+ pictName = null;
+ pictSource = null;
+ rawBytes = null;
}
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
index 147d2e8..6426687 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
@@ -40,6 +40,7 @@ import org.apache.tika.io.EndianUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.RTFMetadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
/**
@@ -242,6 +243,7 @@ class RTFObjDataParser {
fileNameToUse = displayName == null ? "" : displayName;
pathToUse = ansiFilePath == null ? "" : ansiFilePath;
}
+ metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileNameToUse);
metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(fileNameToUse));
metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pathToUse);
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java
index c80c94a..bd8156d 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java
@@ -25,6 +25,7 @@ import java.util.List;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.junit.Test;
public class AppleSingleFileParserTest extends TikaTest {
@@ -36,5 +37,7 @@ public class AppleSingleFileParserTest extends TikaTest {
assertContains(AppleSingleFileParser.class.getName(),
Arrays.asList(list.get(0).getValues("X-Parsed-By")));
assertEquals("application/pdf", list.get(1).get(Metadata.CONTENT_TYPE));
+ assertEquals("fltsyllabussortie2rev1_2.pdf", list.get(1).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index 138120e..9d9d372 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -16,13 +16,15 @@
*/
package org.apache.tika.parser.microsoft;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Locale;
+
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
-import java.io.InputStream;
-import java.util.Locale;
-
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.tika.TikaTest;
@@ -492,5 +494,13 @@ public class WordParserTest extends TikaTest {
assertEquals("manager1", managers[0]);
assertEquals("manager2", managers[1]);
}
+
+ @Test
+ public void testOrigLocation() throws Exception {
+ Metadata metadata = getXML("testException2.doc").metadata;
+ List<String> values = Arrays.asList(metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+ assertContains("C:\\Lab Documents\\Lab Manuals\\Physics 275-6\\276-s00\\07-Force-on-a-current-S00.doc", values);
+ assertContains("Hard Drive:Course Folders:276:276-s00:07-Force-on-a-current-S00", values);
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
index 04530ce..510cd32 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/microsoft/xml/XML2003ParserTest.java
@@ -17,7 +17,6 @@
package org.apache.tika.parser.microsoft.xml;
import org.apache.tika.TikaTest;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
@@ -26,11 +25,11 @@ import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.junit.Test;
+import static org.junit.Assert.assertEquals;
+
import java.util.Arrays;
import java.util.List;
-import static org.junit.Assert.assertEquals;
-
public class XML2003ParserTest extends TikaTest {
@Test
@@ -80,6 +79,8 @@ public class XML2003ParserTest extends TikaTest {
assertContains("R1 c1 R1 c2", txt);
assertNotContained("footnoteFigure", txt);
assertContains("footnote Figure", txt);
+
+ assertEquals("testJPEG_EXIF.jpg", list.get(7).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
assertEquals("image/jpeg", list.get(7).get(Metadata.CONTENT_TYPE));
}
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
index dc75be5..d80842b 100644
--- a/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
+++ b/tika-parser-modules/tika-parser-office-module/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
@@ -18,13 +18,13 @@ package org.apache.tika.parser.rtf;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import java.io.InputStream;
-import java.util.ArrayList;
+import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
+import java.util.Map;
import java.util.Set;
import org.apache.commons.io.FilenameUtils;
@@ -381,83 +381,45 @@ public class RTFParserTest extends TikaTest {
// TIKA-1010
@Test
public void testEmbeddedMonster() throws Exception {
- Set<MediaType> skipTypes = new HashSet<MediaType>();
- skipTypes.add(MediaType.parse("application/x-emf"));
- skipTypes.add(MediaType.parse("application/x-msmetafile"));
-
-
- List<String> trueNames = new ArrayList<String>();
- trueNames.add("file_0.doc");
- trueNames.add("Hw.txt");
- trueNames.add("file_1.xlsx");
- trueNames.add("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip");
- trueNames.add("html-within-zip.zip");
- trueNames.add("text.html");
- trueNames.add("testHTML_utf8_\u666E\u6797\u65AF\u987F.html");
- trueNames.add("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
- trueNames.add("file_2.xls");
- trueNames.add("testMSG_\u666E\u6797\u65AF\u987F.msg");
- trueNames.add("file_3.pdf");
- trueNames.add("file_4.ppt");
- trueNames.add("file_5.pptx");
- trueNames.add("thumbnail.jpeg");
- trueNames.add("file_6.doc");
- trueNames.add("file_7.doc");
- trueNames.add("file_8.docx");
- trueNames.add("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
-
- List<String> trueTypes = new ArrayList<String>();
- trueTypes.add("application/msword");
- trueTypes.add("text/plain");
- trueTypes.add("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
- trueTypes.add("application/zip");
- trueTypes.add("application/zip");
- trueTypes.add("text/html");
- trueTypes.add("text/html");
- trueTypes.add("image/jpeg");
- trueTypes.add("application/vnd.ms-excel");
- trueTypes.add("application/vnd.ms-outlook");
- trueTypes.add("application/pdf");
- trueTypes.add("application/vnd.ms-powerpoint");
- trueTypes.add("application/vnd.openxmlformats-officedocument.presentationml.presentation");
- trueTypes.add("image/jpeg");
- trueTypes.add("application/msword");
- trueTypes.add("application/msword");
- trueTypes.add("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
- trueTypes.add("image/jpeg");
-
- TrackingHandler tracker = new TrackingHandler(skipTypes);
- try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf"))) {
- ContainerExtractor ex = new ParserContainerExtractor();
- assertEquals(true, ex.isSupported(tis));
- ex.extract(tis, ex, tracker);
- }
- assertEquals(trueNames.size(), tracker.filenames.size());
- assertEquals(trueTypes.size(), tracker.mediaTypes.size());
- for (int i = 0; i < tracker.filenames.size(); i++) {
- String expectedName = trueNames.get(i);
- if (expectedName == null) {
- assertNull(tracker.filenames.get(i));
- } else {
- assertNotNull(tracker.filenames.get(i));
- //necessary to getName() because MSOffice extractor includes
- //directory: _1457338524/HW.txt
- assertEquals("filename equals ",
- expectedName, FilenameUtils.getName(tracker.filenames.get(i)));
- }
- assertEquals(trueTypes.get(i), tracker.mediaTypes.get(i).toString());
- }
-
- tracker = new TrackingHandler();
- try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedFiles.rtf"))) {
- ContainerExtractor ex = new ParserContainerExtractor();
- assertEquals(true, ex.isSupported(tis));
- ex.extract(tis, ex, tracker);
+ Map<Integer, Pair> expected = new HashMap<>();
+ expected.put(2, new Pair("Hw.txt","text/plain; charset=ISO-8859-1"));
+ expected.put(3, new Pair("file_0.doc", "application/msword"));
+ expected.put(6, new Pair("file_1.xlsx",
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
+ expected.put(9, new Pair("text.html", "text/html; charset=windows-1252"));
+ expected.put(10, new Pair("html-within-zip.zip", "application/zip"));
+ expected.put(11, new Pair("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip", "application/zip"));
+ expected.put(14, new Pair("testHTML_utf8_\u666E\u6797\u65AF\u987F.html", "text/html; charset=UTF-8"));
+ expected.put(17, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
+ expected.put(20, new Pair("file_2.xls", "application/vnd.ms-excel"));
+ expected.put(23, new Pair("testMSG_\u666E\u6797\u65AF\u987F.msg", "application/vnd.ms-outlook"));
+ expected.put(26, new Pair("file_3.pdf", "application/pdf"));
+ expected.put(29, new Pair("file_4.ppt", "application/vnd.ms-powerpoint"));
+ expected.put(33, new Pair("file_5.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"));
+ expected.put(32, new Pair("thumbnail.jpeg", "image/jpeg"));
+ expected.put(36, new Pair("file_6.doc", "application/msword"));
+ expected.put(39, new Pair("file_7.doc", "application/msword"));
+ expected.put(42, new Pair("file_8.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
+ expected.put(45, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
+
+
+ List<Metadata> metadataList = getRecursiveJson("testRTFEmbeddedFiles.rtf");
+ assertEquals(48, metadataList.size());
+ for (Map.Entry<Integer, Pair> e : expected.entrySet()) {
+ Metadata metadata = metadataList.get(e.getKey());
+ Pair p = e.getValue();
+ assertNotNull(metadata.get(Metadata.RESOURCE_NAME_KEY));
+ //necessary to getName() because MSOffice extractor includes
+ //directory: _1457338524/HW.txt
+ assertEquals("filename equals ",
+ p.fileName, FilenameUtils.getName(
+ metadata.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)));
+
+ assertEquals(p.mimeType, metadata.get(Metadata.CONTENT_TYPE));
}
- assertEquals(47, tracker.filenames.size());
- assertEquals("thumbnail_26.emf", tracker.filenames.get(45));
- assertEquals("thumbnail_27.wmf", tracker.filenames.get(46));
+ assertEquals("C:\\Users\\tallison\\AppData\\Local\\Temp\\testJPEG_\u666e\u6797\u65af\u987f.jpg",
+ metadataList.get(45).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
}
//TIKA-1010 test regular (not "embedded") images/picts
@@ -537,4 +499,12 @@ public class RTFParserTest extends TikaTest {
assertEquals(2, tracker.filenames.size());
}
+ private static class Pair {
+ final String fileName;
+ final String mimeType;
+ Pair(String fileName, String mimeType) {
+ this.fileName = fileName;
+ this.mimeType = mimeType;
+ }
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 9a73bde..832b06e 100644
--- a/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parser-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -33,6 +33,8 @@ import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
+import static org.apache.tika.parser.pdf.PDFParserConfig.OCR_STRATEGY.NO_OCR;
+
import javax.xml.stream.XMLStreamException;
import org.apache.commons.io.IOExceptionWithCause;
import org.apache.commons.io.IOUtils;
@@ -176,7 +178,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
- private void extractMultiOSPDEmbeddedFiles(String defaultName,
+ private void extractMultiOSPDEmbeddedFiles(String displayName,
PDComplexFileSpecification spec,
EmbeddedDocumentExtractor extractor) throws IOException,
SAXException, TikaException {
@@ -185,13 +187,14 @@ class AbstractPDF2XHTML extends PDFTextStripper {
return;
}
//current strategy is to pull all, not just first non-null
- extractPDEmbeddedFile(defaultName, spec.getFile(), spec.getEmbeddedFile(), extractor);
- extractPDEmbeddedFile(defaultName, spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
- extractPDEmbeddedFile(defaultName, spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
- extractPDEmbeddedFile(defaultName, spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFile(), spec.getEmbeddedFile(), extractor);
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileMac(), spec.getEmbeddedFileMac(), extractor);
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileDos(), spec.getEmbeddedFileDos(), extractor);
+ extractPDEmbeddedFile(displayName, spec.getFileUnicode(), spec.getFileUnix(), spec.getEmbeddedFileUnix(), extractor);
}
- private void extractPDEmbeddedFile(String defaultName, String fileName, PDEmbeddedFile file,
+ private void extractPDEmbeddedFile(String displayName, String unicodeFileName,
+ String fileName, PDEmbeddedFile file,
EmbeddedDocumentExtractor extractor)
throws SAXException, IOException, TikaException {
@@ -199,8 +202,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
//skip silently
return;
}
-
- fileName = (fileName == null) ? defaultName : fileName;
+
+ fileName = (fileName == null) ? displayName : fileName;
// TODO: other metadata?
Metadata metadata = new Metadata();
@@ -209,6 +212,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
metadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
+ metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);
if (extractor.shouldParseEmbedded(metadata)) {
TikaInputStream stream = null;
@@ -289,7 +293,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;
PDComplexFileSpecification fileSpec = (PDComplexFileSpecification) fann.getFile();
try {
- extractMultiOSPDEmbeddedFiles("", fileSpec, extractor);
+ extractMultiOSPDEmbeddedFiles(fann.getAttachmentName(), fileSpec, extractor);
} catch (SAXException e) {
throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
} catch (TikaException e) {
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-test-resources/src/test/resources/test-documents/testAppleSingleFile.pdf
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testAppleSingleFile.pdf b/tika-test-resources/src/test/resources/test-documents/testAppleSingleFile.pdf
index a385313..a407ded 100644
Binary files a/tika-test-resources/src/test/resources/test-documents/testAppleSingleFile.pdf and b/tika-test-resources/src/test/resources/test-documents/testAppleSingleFile.pdf differ
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls b/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls
new file mode 100644
index 0000000..c38f64c
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xls differ
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx b/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx
new file mode 100644
index 0000000..9c0d2b9
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testExcel_embeddedPDF.xlsx differ
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt b/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt
new file mode 100644
index 0000000..3129be1
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.ppt differ
http://git-wip-us.apache.org/repos/asf/tika/blob/e62f2305/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx
----------------------------------------------------------------------
diff --git a/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx b/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx
new file mode 100644
index 0000000..a96aa3c
Binary files /dev/null and b/tika-test-resources/src/test/resources/test-documents/testPPT_EmbeddedPDF.pptx differ