You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/09/02 17:16:45 UTC
svn commit: r1164578 - in /tika/trunk:
tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
Author: jukka
Date: Fri Sep 2 15:16:44 2011
New Revision: 1164578
URL: http://svn.apache.org/viewvc?rev=1164578&view=rev
Log:
TIKA-704: PDF and Outlook docs embedded in MS Word documents not parsed
Better handling of resources embedded as OLE objects
Also better type detection and filename generation for the TikaCLI --extract option
Modified:
tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
Modified: tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=1164578&r1=1164577&r2=1164578&view=diff
==============================================================================
--- tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Fri Sep 2 15:16:44 2011
@@ -591,7 +591,9 @@ public class TikaCLI {
return handler;
}
- private static class FileEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor {
+ private class FileEmbeddedDocumentExtractor
+ implements EmbeddedDocumentExtractor {
+
private int count = 0;
private final TikaConfig config = TikaConfig.getDefaultConfig();
@@ -603,14 +605,15 @@ public class TikaCLI {
String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
if (name == null) {
- name = Integer.toString(count);
+ name = "file" + count++;
}
- String contentType = metadata.get(Metadata.CONTENT_TYPE);
+ MediaType contentType = detector.detect(inputStream, metadata);
if (name.indexOf('.')==-1 && contentType!=null) {
try {
- name += config.getMimeRepository().forName(contentType).getExtension();
+ name += config.getMimeRepository().forName(
+ contentType.toString()).getExtension();
} catch (MimeTypeException e) {
e.printStackTrace();
}
@@ -629,8 +632,6 @@ public class TikaCLI {
IOUtils.copy(inputStream, os);
os.close();
-
- count++;
}
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java?rev=1164578&r1=1164577&r2=1164578&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java Fri Sep 2 15:16:44 2011
@@ -16,18 +16,22 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
+import java.io.FileNotFoundException;
import java.io.IOException;
+import java.net.URI;
import java.util.List;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.PackagePart;
-import org.apache.poi.openxml4j.opc.PackagePartName;
import org.apache.poi.openxml4j.opc.PackageRelationship;
-import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.Ole10Native;
+import org.apache.poi.poifs.filesystem.Ole10NativeException;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
@@ -53,7 +57,10 @@ public abstract class AbstractOOXMLExtra
static final String RELATION_IMAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image";
static final String RELATION_OLE_OBJECT = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/oleObject";
static final String RELATION_PACKAGE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/package";
-
+
+ private static final String TYPE_OLE_OBJECT =
+ "application/vnd.openxmlformats-officedocument.oleObject";
+
protected POIXMLTextExtractor extractor;
private final EmbeddedDocumentExtractor embeddedExtractor;
@@ -92,70 +99,109 @@ public abstract class AbstractOOXMLExtra
* @see org.apache.tika.parser.microsoft.ooxml.OOXMLExtractor#getXHTML(org.xml.sax.ContentHandler,
* org.apache.tika.metadata.Metadata)
*/
- public void getXHTML(ContentHandler handler, Metadata metadata, ParseContext context)
+ public void getXHTML(
+ ContentHandler handler, Metadata metadata, ParseContext context)
throws SAXException, XmlException, IOException, TikaException {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
+
buildXHTML(xhtml);
- xhtml.endDocument();
-
+
// Now do any embedded parts
- List<PackagePart> mainParts = getMainDocumentParts();
- for(PackagePart part : mainParts) {
- PackageRelationshipCollection rels;
- try {
- rels = part.getRelationships();
- } catch(InvalidFormatException e) {
- throw new TikaException("Corrupt OOXML file", e);
- }
-
- for(PackageRelationship rel : rels) {
- // Is it an embedded type (not part of the document)
- if( rel.getRelationshipType().equals(RELATION_AUDIO) ||
- rel.getRelationshipType().equals(RELATION_IMAGE) ||
- rel.getRelationshipType().equals(RELATION_OLE_OBJECT) ||
- rel.getRelationshipType().equals(RELATION_PACKAGE) ) {
- if(rel.getTargetMode() == TargetMode.INTERNAL) {
- PackagePartName relName;
- try {
- relName = PackagingURIHelper.createPartName(rel.getTargetURI());
- } catch(InvalidFormatException e) {
- throw new TikaException("Broken OOXML file", e);
+ handleEmbeddedParts(handler);
+
+ xhtml.endDocument();
+ }
+
+ private void handleEmbeddedParts(ContentHandler handler)
+ throws TikaException, IOException, SAXException {
+ try {
+ for (PackagePart source : getMainDocumentParts()) {
+ for (PackageRelationship rel : source.getRelationships()) {
+ if (rel.getTargetMode() == TargetMode.INTERNAL) {
+ URI uri = rel.getTargetURI();
+ PackagePart target = rel.getPackage().getPart(
+ PackagingURIHelper.createPartName(uri));
+
+ String type = rel.getRelationshipType();
+ if (RELATION_OLE_OBJECT.equals(type)
+ && TYPE_OLE_OBJECT.equals(target.getContentType())) {
+ handleEmbeddedOLE(target, handler);
+ } else if (RELATION_AUDIO.equals(type)
+ || RELATION_IMAGE.equals(type)
+ || RELATION_PACKAGE.equals(type)
+ || RELATION_OLE_OBJECT.equals(type)) {
+ handleEmbeddedFile(target, handler);
+ }
}
- PackagePart relPart = rel.getPackage().getPart(relName);
- handleEmbedded(rel, relPart, handler, context);
- }
- }
- }
+ }
+ }
+ } catch (InvalidFormatException e) {
+ throw new TikaException("Broken OOXML file", e);
}
}
-
+
/**
- * Handles an embedded resource in the file
+ * Handles an embedded OLE object in the document
*/
- protected void handleEmbedded(PackageRelationship rel, PackagePart part,
- ContentHandler handler, ParseContext context)
- throws SAXException, XmlException, IOException, TikaException {
- // Get the name
- String name = rel.getTargetURI().toString();
- if(name.indexOf('/') > -1) {
- name = name.substring(name.lastIndexOf('/')+1);
- }
-
- // Get the content type
- String type = part.getContentType();
-
- // Call the recursing handler
- Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, name);
- metadata.set(Metadata.CONTENT_TYPE, type);
-
- if (embeddedExtractor.shouldParseEmbedded(metadata)) {
- embeddedExtractor.parseEmbedded(
- TikaInputStream.get(part.getInputStream()),
- new EmbeddedContentHandler(handler),
- metadata, false);
- }
+ private void handleEmbeddedOLE(PackagePart part, ContentHandler handler)
+ throws IOException, SAXException {
+ POIFSFileSystem fs = new POIFSFileSystem(part.getInputStream());
+ try {
+ Metadata metadata = new Metadata();
+ TikaInputStream stream = null;
+
+ DirectoryNode root = fs.getRoot();
+ if (root.hasEntry("CONTENTS")) {
+ stream = TikaInputStream.get(
+ fs.createDocumentInputStream("CONTENTS"));
+ } else if (root.hasEntry("\u0001Ole10Native")) {
+ Ole10Native ole =
+ Ole10Native.createFromEmbeddedOleObject(fs);
+ metadata.set(Metadata.RESOURCE_NAME_KEY, ole.getLabel());
+ byte[] data = ole.getDataBuffer();
+ if (data != null) {
+ stream = TikaInputStream.get(data);
+ }
+ }
+
+ if (stream != null
+ && embeddedExtractor.shouldParseEmbedded(metadata)) {
+ embeddedExtractor.parseEmbedded(
+ stream, new EmbeddedContentHandler(handler),
+ metadata, false);
+ }
+ } catch (FileNotFoundException e) {
+ // There was no CONTENTS entry, so skip this part
+ } catch (Ole10NativeException e) {
+ // Could not process an OLE 1.0 entry, so skip this part
+ }
+ }
+
+ /**
+ * Handles an embedded file in the document
+ */
+ protected void handleEmbeddedFile(PackagePart part, ContentHandler handler)
+ throws SAXException, IOException {
+ Metadata metadata = new Metadata();
+
+ // Get the name
+ String name = part.getPartName().getName();
+ metadata.set(
+ Metadata.RESOURCE_NAME_KEY,
+ name.substring(name.lastIndexOf('/') + 1));
+
+ // Get the content type
+ metadata.set(
+ Metadata.CONTENT_TYPE, part.getContentType());
+
+ // Call the recursing handler
+ if (embeddedExtractor.shouldParseEmbedded(metadata)) {
+ embeddedExtractor.parseEmbedded(
+ TikaInputStream.get(part.getInputStream()),
+ new EmbeddedContentHandler(handler),
+ metadata, false);
+ }
}
/**