You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/11/28 14:57:09 UTC
svn commit: r1207196 - in /tika/trunk:
tika-core/src/main/resources/org/apache/tika/mime/
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/
Author: nick
Date: Mon Nov 28 13:57:08 2011
New Revision: 1207196
URL: http://svn.apache.org/viewvc?rev=1207196&view=rev
Log:
TIKA-790: Remove the detection code that was duplicated between OfficeParser and POIFSContainerDetector. Following the pattern from TIKA-791, add a media type for OLE10Native and push the rest of the detection work to POIFSContainerDetector.
Modified:
tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
Modified: tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml?rev=1207196&r1=1207195&r2=1207196&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml (original)
+++ tika/trunk/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml Mon Nov 28 13:57:08 2011
@@ -2761,6 +2761,11 @@
</magic>
</mime-type>
+ <mime-type type="application/x-tika-msoffice-embedded">
+ <sub-class-of type="application/x-tika-msoffice"/>
+ <_comment>OLE10 Native Embedded Document</_comment>
+ </mime-type>
+
<!-- =================================================================== -->
<!-- Office Open XML file formats -->
<!-- http://www.ecma-international.org/publications/standards/Ecma-376.htm -->
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1207196&r1=1207195&r2=1207196&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Mon Nov 28 13:57:08 2011
@@ -21,10 +21,8 @@ import java.io.InputStream;
import java.security.GeneralSecurityException;
import java.util.Arrays;
import java.util.Collections;
-import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
-import java.util.Map;
import java.util.Set;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
@@ -77,7 +75,7 @@ public class OfficeParser extends Abstra
public enum POIFSDocumentType {
WORKBOOK("xls", MediaType.application("vnd.ms-excel")),
- OLE10_NATIVE("ole", MediaType.application("x-tika-msoffice")),
+ OLE10_NATIVE("ole", MediaType.application("x-tika-msoffice-embedded")),
WORDDOCUMENT("doc", MediaType.application("msword")),
UNKNOWN("unknown", MediaType.application("x-tika-msoffice")),
ENCRYPTED("ole", MediaType.application("x-tika-ooxml-protected")),
@@ -113,39 +111,15 @@ public class OfficeParser extends Abstra
}
public static POIFSDocumentType detectType(DirectoryEntry node) {
+ Set<String> names = new HashSet<String>();
for (Entry entry : node) {
- POIFSDocumentType type = detectType(entry);
- if (type!=UNKNOWN) {
- return type;
- }
+ names.add(entry.getName());
}
- return UNKNOWN;
- }
-
- // TODO Avoid this duplication with POIFSContainerDetector (TIKA-790)
- private final static Map<String,POIFSDocumentType> typeMap = new HashMap<String,POIFSDocumentType>();
- static {
- typeMap.put("Workbook", WORKBOOK);
- typeMap.put("EncryptedPackage", ENCRYPTED);
- typeMap.put("WordDocument", WORDDOCUMENT);
- typeMap.put("Quill", PUBLISHER);
- typeMap.put("PowerPoint Document", POWERPOINT);
- typeMap.put("VisioDocument", VISIO);
- typeMap.put("CONTENTS", WORKS);
- typeMap.put("\u0001Ole10Native", POIFSDocumentType.OLE10_NATIVE);
- typeMap.put("Props", PROJECT); // Project 8
- typeMap.put("Props9", PROJECT); // Project 9, 10, 11
- typeMap.put("Props12", PROJECT); // Project 12+
- }
-
- public static POIFSDocumentType detectType(Entry entry) {
- String name = entry.getName();
- POIFSDocumentType type = typeMap.get(name);
- if (type != null) {
- return type;
- }
- if (entry.getName().startsWith("__substg1.0_")) {
- return OUTLOOK;
+ MediaType type = POIFSContainerDetector.detect(names);
+ for (POIFSDocumentType poifsType : values()) {
+ if (type.equals(poifsType.type)) {
+ return poifsType;
+ }
}
return UNKNOWN;
}
@@ -193,71 +167,64 @@ public class OfficeParser extends Abstra
new SummaryExtractor(metadata).parseSummaries(root);
// Parse remaining document entries
- boolean outlookExtracted = false;
- for (Entry entry : root) {
- POIFSDocumentType type = POIFSDocumentType.detectType(entry);
+ POIFSDocumentType type = POIFSDocumentType.detectType(root);
- if (type!=POIFSDocumentType.UNKNOWN) {
- setType(metadata, type.getType());
- }
+ if (type!=POIFSDocumentType.UNKNOWN) {
+ setType(metadata, type.getType());
+ }
- switch (type) {
- case PUBLISHER:
- PublisherTextExtractor publisherTextExtractor =
- new PublisherTextExtractor(root);
- xhtml.element("p", publisherTextExtractor.getText());
- break;
- case WORDDOCUMENT:
- new WordExtractor(context).parse(root, xhtml);
- break;
- case POWERPOINT:
- new HSLFExtractor(context).parse(root, xhtml);
- break;
- case WORKBOOK:
- Locale locale = context.get(Locale.class, Locale.getDefault());
- new ExcelExtractor(context).parse(root, xhtml, locale);
- break;
- case PROJECT:
- // We currently can't do anything beyond the metadata
- break;
- case VISIO:
- VisioTextExtractor visioTextExtractor =
- new VisioTextExtractor(root);
- for (String text : visioTextExtractor.getAllText()) {
- xhtml.element("p", text);
- }
- break;
- case OUTLOOK:
- if (!outlookExtracted) {
- outlookExtracted = true;
-
- OutlookExtractor extractor =
- new OutlookExtractor(root, context);
-
- extractor.parse(xhtml, metadata);
- }
- break;
- case ENCRYPTED:
- EncryptionInfo info = new EncryptionInfo(root);
- Decryptor d = Decryptor.getInstance(info);
-
- try {
- // TODO Allow the user to specify the password via the ParseContext
- if (!d.verifyPassword(Decryptor.DEFAULT_PASSWORD)) {
- throw new EncryptedDocumentException();
- }
-
- // Decrypt the OLE2 stream, and delegate the resulting OOXML
- // file to the regular OOXML parser for normal handling
- OOXMLParser parser = new OOXMLParser();
-
- parser.parse(d.getDataStream(root), new EmbeddedContentHandler(
- new BodyContentHandler(xhtml)),
- metadata, context);
- } catch (GeneralSecurityException ex) {
- throw new EncryptedDocumentException(ex);
- }
- }
+ switch (type) {
+ case PUBLISHER:
+ PublisherTextExtractor publisherTextExtractor =
+ new PublisherTextExtractor(root);
+ xhtml.element("p", publisherTextExtractor.getText());
+ break;
+ case WORDDOCUMENT:
+ new WordExtractor(context).parse(root, xhtml);
+ break;
+ case POWERPOINT:
+ new HSLFExtractor(context).parse(root, xhtml);
+ break;
+ case WORKBOOK:
+ Locale locale = context.get(Locale.class, Locale.getDefault());
+ new ExcelExtractor(context).parse(root, xhtml, locale);
+ break;
+ case PROJECT:
+ // We currently can't do anything beyond the metadata
+ break;
+ case VISIO:
+ VisioTextExtractor visioTextExtractor =
+ new VisioTextExtractor(root);
+ for (String text : visioTextExtractor.getAllText()) {
+ xhtml.element("p", text);
+ }
+ break;
+ case OUTLOOK:
+ OutlookExtractor extractor =
+ new OutlookExtractor(root, context);
+
+ extractor.parse(xhtml, metadata);
+ break;
+ case ENCRYPTED:
+ EncryptionInfo info = new EncryptionInfo(root);
+ Decryptor d = Decryptor.getInstance(info);
+
+ try {
+ // TODO Allow the user to specify the password via the ParseContext
+ if (!d.verifyPassword(Decryptor.DEFAULT_PASSWORD)) {
+ throw new EncryptedDocumentException();
+ }
+
+ // Decrypt the OLE2 stream, and delegate the resulting OOXML
+ // file to the regular OOXML parser for normal handling
+ OOXMLParser parser = new OOXMLParser();
+
+ parser.parse(d.getDataStream(root), new EmbeddedContentHandler(
+ new BodyContentHandler(xhtml)),
+ metadata, context);
+ } catch (GeneralSecurityException ex) {
+ throw new EncryptedDocumentException(ex);
+ }
}
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1207196&r1=1207195&r2=1207196&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Mon Nov 28 13:57:08 2011
@@ -50,6 +50,9 @@ public class POIFSContainerDetector impl
/** The protected OOXML base file format */
public static final MediaType OOXML_PROTECTED = application("x-tika-ooxml-protected");
+
+ /** An OLE10 Native embedded document within another OLE2 document */
+ public static final MediaType OLE10_NATIVE = application("x-tika-msoffice-embedded");
/** Microsoft Excel */
public static final MediaType XLS = application("vnd.ms-excel");
@@ -119,7 +122,16 @@ public class POIFSContainerDetector impl
// Look for known top level entry names to detect the document type
names = getTopLevelNames(tis);
}
-
+
+ // Detect based on the names (as available)
+ return detect(names);
+ }
+
+ /**
+ * Internal detection of the specific kind of OLE2 document, based on the
+ * names of the top level streams within the file.
+ */
+ protected static MediaType detect(Set<String> names) {
if (names != null) {
if (names.contains("Workbook")) {
return XLS;
@@ -142,6 +154,8 @@ public class POIFSContainerDetector impl
return PPT;
} else if (names.contains("VisioDocument")) {
return VSD;
+ } else if (names.contains("\u0001Ole10Native")) {
+ return OLE10_NATIVE;
} else if (names.contains("CONTENTS") && names.contains("SPELLING")) {
// Newer Works files
return WPS;
@@ -161,8 +175,6 @@ public class POIFSContainerDetector impl
return MPP;
}
}
- } else if (names.contains("\u0001Ole10Native")) {
- return OLE;
} else if (names.contains("PerfectOffice_MAIN")) {
if (names.contains("SlideShow")) {
return MediaType.application("x-corelpresentations"); // .shw