You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2011/10/17 17:00:22 UTC
svn commit: r1185234 - in /tika/trunk: ./
tika-core/src/main/java/org/apache/tika/extractor/
tika-core/src/main/java/org/apache/tika/io/
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/
Author: mikemccand
Date: Mon Oct 17 15:00:22 2011
New Revision: 1185234
URL: http://svn.apache.org/viewvc?rev=1185234&view=rev
Log:
TIKA-753: speed up processing of embedded office docs
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Mon Oct 17 15:00:22 2011
@@ -15,6 +15,8 @@ Release 0.11 - Current Development
* TIKA-742: Paragraphs are now extracted within each page of a PDF
document.
+ * TIKA-753: Improve performance when extracting embedded office docs.
+
Release 0.10 - 09/25/2011
The most notable changes in Tika 0.10 over previous releases are:
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java Mon Oct 17 15:00:22 2011
@@ -16,8 +16,6 @@
*/
package org.apache.tika.extractor;
-import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
-
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
@@ -37,6 +35,8 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
/**
* Helper class for parsers of package archives or other compound document
* formats that support embedded or attached component documents.
@@ -92,11 +92,26 @@ public class ParsingEmbeddedDocumentExtr
// Use the delegate parser to parse this entry
TemporaryResources tmp = new TemporaryResources();
try {
+ final TikaInputStream newStream = TikaInputStream.get(new CloseShieldInputStream(stream), tmp);
+ if (stream instanceof TikaInputStream) {
+ final Object container = ((TikaInputStream) stream).getOpenContainer();
+
+ // TODO: we can't let ZipPackage through,
+ // because of POI bug 51949. This is less
+ // efficient because the inner parser will
+ // have to re-open the zip archive again.
+ // Once we upgrade to POI 3.8 beta 5 we can
+ // remove this:
+ if ((container != null && !(container.getClass().getSimpleName().equals("ZipPackage")))) {
+ newStream.setOpenContainer(container);
+ }
+ }
DELEGATING_PARSER.parse(
- TikaInputStream.get(new CloseShieldInputStream(stream), tmp),
- new EmbeddedContentHandler(new BodyContentHandler(handler)),
- metadata, context);
+ newStream,
+ new EmbeddedContentHandler(new BodyContentHandler(handler)),
+ metadata, context);
} catch (TikaException e) {
+ // TODO: can we log a warning somehow?
// Could not parse the entry, just skip the content
} finally {
tmp.close();
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java Mon Oct 17 15:00:22 2011
@@ -649,12 +649,15 @@ public class TikaInputStream extends Tag
}
public String toString() {
- String str = "TikaInputStream of ";
- if(hasFile()) {
- str += file.toString();
- } else {
- str += in.toString();
- }
- return str;
+ String str = "TikaInputStream of ";
+ if (hasFile()) {
+ str += file.toString();
+ } else {
+ str += in.toString();
+ }
+ if (openContainer != null) {
+ str += " (in " + openContainer + ")";
+ }
+ return str;
}
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Mon Oct 17 15:00:22 2011
@@ -17,11 +17,7 @@
package org.apache.tika.parser.microsoft;
import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
import java.io.IOException;
-import java.io.InputStream;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DocumentEntry;
@@ -29,12 +25,10 @@ import org.apache.poi.poifs.filesystem.D
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -64,11 +58,11 @@ abstract class AbstractPOIFSExtractor {
try {
Metadata metadata = new Metadata();
if(filename != null) {
- metadata.set(Metadata.TIKA_MIME_FILE, filename);
- metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+ metadata.set(Metadata.TIKA_MIME_FILE, filename);
+ metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
}
if(mediaType != null) {
- metadata.set(Metadata.CONTENT_TYPE, mediaType);
+ metadata.set(Metadata.CONTENT_TYPE, mediaType);
}
if (extractor.shouldParseEmbedded(metadata)) {
@@ -85,11 +79,13 @@ abstract class AbstractPOIFSExtractor {
protected void handleEmbeddedOfficeDoc(
DirectoryEntry dir, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
+
// Is it an embedded OLE2 document, or an embedded OOXML document?
- try {
+
+ if (dir.hasEntry("Package")) {
+ // It's OOXML (has a ZipFile):
Entry ooxml = dir.getEntry("Package");
- // It's OOXML
TikaInputStream stream = TikaInputStream.get(
new DocumentInputStream((DocumentEntry) ooxml));
try {
@@ -100,34 +96,35 @@ abstract class AbstractPOIFSExtractor {
} finally {
stream.close();
}
- } catch(FileNotFoundException e) {
- // It's regular OLE2
}
+ // It's regular OLE2:
+
// What kind of document is it?
Metadata metadata = new Metadata();
POIFSDocumentType type = POIFSDocumentType.detectType(dir);
TikaInputStream embedded = null;
- TemporaryResources tmp = new TemporaryResources();
try {
if (type == POIFSDocumentType.OLE10_NATIVE) {
Entry entry = dir.getEntry(Ole10Native.OLE10_NATIVE);
ByteArrayOutputStream bos = new ByteArrayOutputStream();
+
+ // TODO: once we upgrade to POI 3.8 beta 5
+ // we can avoid this full copy/serialize by
+ // passing the DirectoryNode instead:
IOUtils.copy(new DocumentInputStream((DocumentEntry) entry), bos);
byte[] data = bos.toByteArray();
try {
+ // Maybe unwrap OLE10Native record:
Ole10Native ole = new Ole10Native(data, 0);
- byte[] dataBuffer = ole.getDataBuffer();
-
- metadata.set("resourceName", dir.getName() + '/' + ole.getLabel());
-
- embedded = TikaInputStream.get(dataBuffer);
+ data = ole.getDataBuffer();
+ metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
} catch (Ole10NativeException ex) {
- embedded = TikaInputStream.get(data);
+ // Not an OLE10Native record
}
- tmp.addResource(embedded);
+ embedded = TikaInputStream.get(data);
} else {
metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
@@ -136,45 +133,18 @@ abstract class AbstractPOIFSExtractor {
// Should we parse it?
if (extractor.shouldParseEmbedded(metadata)) {
if (embedded == null) {
- // Need to dump the directory out to a new temp file, so
- // it's stand alone
-
- // TODO: can/should we use NPOIFileSystem here?
- POIFSFileSystem newFS = new POIFSFileSystem();
- copy(dir, newFS.getRoot());
- File tmpFile = tmp.createTemporaryFile();
- FileOutputStream out = new FileOutputStream(tmpFile);
- try {
- newFS.writeFilesystem(out);
- } finally {
- out.close();
- }
-
- embedded = TikaInputStream.get(tmpFile);
- tmp.addResource(embedded);
+ // Make a TikaInputStream that just
+ // passes the root directory of the
+ // embedded document, and is otherwise
+ // empty (byte[0]):
+ embedded = TikaInputStream.get(new byte[0]);
+ embedded.setOpenContainer(dir);
}
extractor.parseEmbedded(embedded, xhtml, metadata, true);
}
} finally {
- tmp.dispose();
- }
- }
-
- protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
- throws IOException {
- for (Entry entry : sourceDir) {
- if (entry instanceof DirectoryEntry) {
- // Need to recurse
- DirectoryEntry newDir = destDir.createDirectory(entry.getName());
- copy((DirectoryEntry)entry, newDir);
- } else {
- // Copy entry
- InputStream contents = new DocumentInputStream((DocumentEntry)entry);
- try {
- destDir.createDocument(entry.getName(), contents);
- } finally {
- contents.close();
- }
+ if (embedded != null) {
+ embedded.close();
}
}
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Mon Oct 17 15:00:22 2011
@@ -57,6 +57,7 @@ import org.apache.poi.hssf.record.TextOb
import org.apache.poi.hssf.record.chart.SeriesTextRecord;
import org.apache.poi.hssf.record.common.UnicodeString;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
@@ -133,11 +134,17 @@ public class ExcelExtractor extends Abst
protected void parse(
NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml,
Locale locale) throws IOException, SAXException, TikaException {
+ parse(filesystem.getRoot(), xhtml, locale);
+ }
+
+ protected void parse(
+ DirectoryNode root, XHTMLContentHandler xhtml,
+ Locale locale) throws IOException, SAXException, TikaException {
TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this);
- listener.processFile(filesystem, isListenForAllRecords());
+ listener.processFile(root, isListenForAllRecords());
listener.throwStoredException();
- for (Entry entry : filesystem.getRoot()) {
+ for (Entry entry : root) {
if (entry.getName().startsWith("MBD")
&& entry instanceof DirectoryEntry) {
try {
@@ -246,6 +253,11 @@ public class ExcelExtractor extends Abst
*/
public void processFile(NPOIFSFileSystem filesystem, boolean listenForAllRecords)
throws IOException, SAXException, TikaException {
+ processFile(filesystem.getRoot(), listenForAllRecords);
+ }
+
+ public void processFile(DirectoryNode root, boolean listenForAllRecords)
+ throws IOException, SAXException, TikaException {
// Set up listener and register the records we want to process
HSSFRequest hssfRequest = new HSSFRequest();
@@ -272,7 +284,7 @@ public class ExcelExtractor extends Abst
}
// Create event factory and process Workbook (fire events)
- DocumentInputStream documentInputStream = filesystem.createDocumentInputStream("Workbook");
+ DocumentInputStream documentInputStream = root.createDocumentInputStream("Workbook");
HSSFEventFactory eventFactory = new HSSFEventFactory();
try {
eventFactory.processEvents(hssfRequest, documentInputStream);
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Mon Oct 17 15:00:22 2011
@@ -29,6 +29,7 @@ import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.ObjectData;
import org.apache.poi.hslf.usermodel.SlideShow;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
@@ -44,7 +45,13 @@ public class HSLFExtractor extends Abstr
protected void parse(
NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
- HSLFSlideShow ss = new HSLFSlideShow(filesystem.getRoot());
+ parse(filesystem.getRoot(), xhtml);
+ }
+
+ protected void parse(
+ DirectoryNode root, XHTMLContentHandler xhtml)
+ throws IOException, SAXException, TikaException {
+ HSLFSlideShow ss = new HSLFSlideShow(root);
SlideShow _show = new SlideShow(ss);
Slide[] _slides = _show.getSlides();
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Mon Oct 17 15:00:22 2011
@@ -21,8 +21,10 @@ import java.io.InputStream;
import java.security.GeneralSecurityException;
import java.util.Arrays;
import java.util.Collections;
+import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
+import java.util.Map;
import java.util.Set;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
@@ -30,6 +32,7 @@ import org.apache.poi.hpbf.extractor.Pub
import org.apache.poi.poifs.crypt.Decryptor;
import org.apache.poi.poifs.crypt.EncryptionInfo;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -115,37 +118,27 @@ public class OfficeParser extends Abstra
return UNKNOWN;
}
+ private final static Map<String,POIFSDocumentType> typeMap = new HashMap<String,POIFSDocumentType>();
+ static {
+ typeMap.put("Workbook", WORKBOOK);
+ typeMap.put("EncryptedPackage", ENCRYPTED);
+ typeMap.put("WordDocument", WORDDOCUMENT);
+ typeMap.put("Quill", PUBLISHER);
+ typeMap.put("PowerPoint Document", POWERPOINT);
+ typeMap.put("VisioDocument", VISIO);
+ typeMap.put("CONTENTS", WORKS);
+ typeMap.put("\u0001Ole10Native", POIFSDocumentType.OLE10_NATIVE);
+ }
+
public static POIFSDocumentType detectType(Entry entry) {
String name = entry.getName();
-
- if ("Workbook".equals(name)) {
- return WORKBOOK;
- }
- if ("EncryptedPackage".equals(name)) {
- return ENCRYPTED;
- }
- if ("WordDocument".equals(name)) {
- return WORDDOCUMENT;
+ POIFSDocumentType type = typeMap.get(name);
+ if (type != null) {
+ return type;
}
- if ("Quill".equals(name)) {
- return PUBLISHER;
- }
- if ("PowerPoint Document".equals(entry.getName())) {
- return POWERPOINT;
- }
- if ("VisioDocument".equals(entry.getName())) {
- return VISIO;
- }
- if ("CONTENTS".equals(entry.getName())) {
- return WORKS;
- }
if (entry.getName().startsWith("__substg1.0_")) {
return OUTLOOK;
}
- if ("\u0001Ole10Native".equals(name)) {
- return POIFSDocumentType.OLE10_NATIVE;
- }
-
return UNKNOWN;
}
}
@@ -164,26 +157,36 @@ public class OfficeParser extends Abstra
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
- NPOIFSFileSystem filesystem;
+ final DirectoryNode root;
TikaInputStream tstream = TikaInputStream.cast(stream);
if (tstream == null) {
- filesystem =
- new NPOIFSFileSystem(new CloseShieldInputStream(stream));
- } else if (tstream.getOpenContainer() instanceof NPOIFSFileSystem) {
- filesystem = (NPOIFSFileSystem) tstream.getOpenContainer();
- } else if (tstream.hasFile()) {
- filesystem = new NPOIFSFileSystem(tstream.getFileChannel());
+ root = new NPOIFSFileSystem(new CloseShieldInputStream(stream)).getRoot();
} else {
- filesystem =
- new NPOIFSFileSystem(new CloseShieldInputStream(tstream));
+ final Object container = tstream.getOpenContainer();
+ if (container instanceof NPOIFSFileSystem) {
+ root = ((NPOIFSFileSystem) container).getRoot();
+ } else if (container instanceof DirectoryNode) {
+ root = (DirectoryNode) container;
+ } else if (tstream.hasFile()) {
+ root = new NPOIFSFileSystem(tstream.getFileChannel()).getRoot();
+ } else {
+ root = new NPOIFSFileSystem(new CloseShieldInputStream(tstream)).getRoot();
+ }
}
+ parse(root, context, metadata, xhtml);
+ xhtml.endDocument();
+ }
+
+ protected void parse(
+ DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml)
+ throws IOException, SAXException, TikaException {
// Parse summary entries first, to make metadata available early
- new SummaryExtractor(metadata).parseSummaries(filesystem);
+ new SummaryExtractor(metadata).parseSummaries(root);
// Parse remaining document entries
boolean outlookExtracted = false;
- for (Entry entry : filesystem.getRoot()) {
+ for (Entry entry : root) {
POIFSDocumentType type = POIFSDocumentType.detectType(entry);
if (type!=POIFSDocumentType.UNKNOWN) {
@@ -193,22 +196,22 @@ public class OfficeParser extends Abstra
switch (type) {
case PUBLISHER:
PublisherTextExtractor publisherTextExtractor =
- new PublisherTextExtractor(filesystem);
+ new PublisherTextExtractor(root);
xhtml.element("p", publisherTextExtractor.getText());
break;
case WORDDOCUMENT:
- new WordExtractor(context).parse(filesystem, xhtml);
+ new WordExtractor(context).parse(root, xhtml);
break;
case POWERPOINT:
- new HSLFExtractor(context).parse(filesystem, xhtml);
+ new HSLFExtractor(context).parse(root, xhtml);
break;
case WORKBOOK:
Locale locale = context.get(Locale.class, Locale.getDefault());
- new ExcelExtractor(context).parse(filesystem, xhtml, locale);
+ new ExcelExtractor(context).parse(root, xhtml, locale);
break;
case VISIO:
VisioTextExtractor visioTextExtractor =
- new VisioTextExtractor(filesystem);
+ new VisioTextExtractor(root);
for (String text : visioTextExtractor.getAllText()) {
xhtml.element("p", text);
}
@@ -218,13 +221,13 @@ public class OfficeParser extends Abstra
outlookExtracted = true;
OutlookExtractor extractor =
- new OutlookExtractor(filesystem, context);
+ new OutlookExtractor(root, context);
extractor.parse(xhtml, metadata);
}
break;
case ENCRYPTED:
- EncryptionInfo info = new EncryptionInfo(filesystem);
+ EncryptionInfo info = new EncryptionInfo(root);
Decryptor d = Decryptor.getInstance(info);
try {
@@ -234,7 +237,7 @@ public class OfficeParser extends Abstra
OOXMLParser parser = new OOXMLParser();
- parser.parse(d.getDataStream(filesystem), new EmbeddedContentHandler(
+ parser.parse(d.getDataStream(root), new EmbeddedContentHandler(
new BodyContentHandler(xhtml)),
metadata, context);
} catch (GeneralSecurityException ex) {
@@ -242,8 +245,6 @@ public class OfficeParser extends Abstra
}
}
}
-
- xhtml.endDocument();
}
private void setType(Metadata metadata, MediaType type) {
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Mon Oct 17 15:00:22 2011
@@ -30,6 +30,7 @@ import org.apache.poi.hsmf.datatypes.MAP
import org.apache.poi.hsmf.datatypes.StringChunk;
import org.apache.poi.hsmf.datatypes.Types;
import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
@@ -51,10 +52,14 @@ public class OutlookExtractor extends Ab
private final MAPIMessage msg;
public OutlookExtractor(NPOIFSFileSystem filesystem, ParseContext context) throws TikaException {
+ this(filesystem.getRoot(), context);
+ }
+
+ public OutlookExtractor(DirectoryNode root, ParseContext context) throws TikaException {
super(context);
try {
- this.msg = new MAPIMessage(filesystem.getRoot());
+ this.msg = new MAPIMessage(root);
} catch (IOException e) {
throw new TikaException("Failed to parse Outlook message", e);
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Mon Oct 17 15:00:22 2011
@@ -16,8 +16,6 @@
*/
package org.apache.tika.parser.microsoft;
-import static org.apache.tika.mime.MediaType.application;
-
import java.io.IOException;
import java.io.InputStream;
import java.nio.channels.FileChannel;
@@ -25,6 +23,7 @@ import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.detect.Detector;
@@ -32,6 +31,8 @@ import org.apache.tika.io.TikaInputStrea
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import static org.apache.tika.mime.MediaType.application;
+
/**
* A detector that works on a POIFS OLE2 document
* to figure out exactly what the file is.
@@ -76,24 +77,42 @@ public class POIFSContainerDetector impl
return MediaType.OCTET_STREAM;
}
- // Check if the document starts with the OLE header
- input.mark(8);
- try {
- if (input.read() != 0xd0 || input.read() != 0xcf
+ // If this is a TikaInputStream wrapping an already
+ // parsed NPOIFSFileSystem/DirectoryNode, just get the
+ // names from the root:
+ TikaInputStream tis = TikaInputStream.cast(input);
+ Set<String> names = null;
+ if (tis != null) {
+ Object container = tis.getOpenContainer();
+ if (container instanceof NPOIFSFileSystem) {
+ names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot());
+ } else if (container instanceof DirectoryNode) {
+ names = getTopLevelNames((DirectoryNode) container);
+ }
+ }
+
+ if (names == null) {
+ // Check if the document starts with the OLE header
+ input.mark(8);
+ try {
+ if (input.read() != 0xd0 || input.read() != 0xcf
|| input.read() != 0x11 || input.read() != 0xe0
|| input.read() != 0xa1 || input.read() != 0xb1
|| input.read() != 0x1a || input.read() != 0xe1) {
- return MediaType.OCTET_STREAM;
+ return MediaType.OCTET_STREAM;
+ }
+ } finally {
+ input.reset();
}
- } finally {
- input.reset();
}
// We can only detect the exact type when given a TikaInputStream
- TikaInputStream tis = TikaInputStream.cast(input);
- if (tis != null) {
+ if (names == null && tis != null) {
// Look for known top level entry names to detect the document type
- Set<String> names = getTopLevelNames(tis);
+ names = getTopLevelNames(tis);
+ }
+
+ if (names != null) {
if (names.contains("Workbook")) {
return XLS;
} else if (names.contains("EncryptedPackage")) {
@@ -149,11 +168,7 @@ public class POIFSContainerDetector impl
// a reference to the already opened POI file system
stream.setOpenContainer(fs);
- Set<String> names = new HashSet<String>();
- for (Entry entry : fs.getRoot()) {
- names.add(entry.getName());
- }
- return names;
+ return getTopLevelNames(fs.getRoot());
} catch (IOException e) {
// Parse error in POI, so we don't know the file type
return Collections.emptySet();
@@ -163,4 +178,11 @@ public class POIFSContainerDetector impl
}
}
+ private static Set<String> getTopLevelNames(DirectoryNode root) {
+ Set<String> names = new HashSet<String>();
+ for (Entry entry : root) {
+ names.add(entry.getName());
+ }
+ return names;
+ }
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java Mon Oct 17 15:00:22 2011
@@ -27,6 +27,7 @@ import org.apache.poi.hpsf.NoPropertySet
import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
@@ -54,16 +55,21 @@ class SummaryExtractor {
public void parseSummaries(NPOIFSFileSystem filesystem)
throws IOException, TikaException {
- parseSummaryEntryIfExists(filesystem, SUMMARY_INFORMATION);
- parseSummaryEntryIfExists(filesystem, DOCUMENT_SUMMARY_INFORMATION);
+ parseSummaries(filesystem.getRoot());
+ }
+
+ public void parseSummaries(DirectoryNode root)
+ throws IOException, TikaException {
+ parseSummaryEntryIfExists(root, SUMMARY_INFORMATION);
+ parseSummaryEntryIfExists(root, DOCUMENT_SUMMARY_INFORMATION);
}
private void parseSummaryEntryIfExists(
- NPOIFSFileSystem filesystem, String entryName)
+ DirectoryNode root, String entryName)
throws IOException, TikaException {
try {
DocumentEntry entry =
- (DocumentEntry) filesystem.getRoot().getEntry(entryName);
+ (DocumentEntry) root.getEntry(entryName);
PropertySet properties =
new PropertySet(new DocumentInputStream(entry));
if (properties.isSummaryInformation()) {
@@ -134,7 +140,7 @@ class SummaryExtractor {
* Attempt to parse custom document properties and add to the collection of metadata
* @param customProperties
*/
- private void parse(CustomProperties customProperties){
+ private void parse(CustomProperties customProperties) {
if (customProperties != null) {
for (String name : customProperties.nameSet()) {
// Apply the custom prefix
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Mon Oct 17 15:00:22 2011
@@ -39,6 +39,7 @@ import org.apache.poi.hwpf.usermodel.Tab
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableRow;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.exception.TikaException;
@@ -65,11 +66,17 @@ public class WordExtractor extends Abstr
protected void parse(
NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
+ parse(filesystem.getRoot(), xhtml);
+ }
+
+ protected void parse(
+ DirectoryNode root, XHTMLContentHandler xhtml)
+ throws IOException, SAXException, TikaException {
HWPFDocument document;
try {
- document = new HWPFDocument(filesystem.getRoot());
+ document = new HWPFDocument(root);
} catch(OldWordFileFormatException e) {
- parseWord6(filesystem, xhtml);
+ parseWord6(root, xhtml);
return;
}
org.apache.poi.hwpf.extractor.WordExtractor wordExtractor =
@@ -115,8 +122,7 @@ public class WordExtractor extends Abstr
// Handle any embeded office documents
try {
- DirectoryEntry op =
- (DirectoryEntry) filesystem.getRoot().getEntry("ObjectPool");
+ DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
for (Entry entry : op) {
if (entry.getName().startsWith("_")
&& entry instanceof DirectoryEntry) {
@@ -418,50 +424,63 @@ public class WordExtractor extends Abstr
protected void parseWord6(
NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
- HWPFOldDocument doc = new HWPFOldDocument(filesystem.getRoot());
+ parseWord6(filesystem.getRoot(), xhtml);
+ }
+
+ protected void parseWord6(
+ DirectoryNode root, XHTMLContentHandler xhtml)
+ throws IOException, SAXException, TikaException {
+ HWPFOldDocument doc = new HWPFOldDocument(root);
Word6Extractor extractor = new Word6Extractor(doc);
for(String p : extractor.getParagraphText()) {
xhtml.element("p", p);
}
}
+
+ private static final Map<String,TagAndStyle> fixedParagraphStyles = new HashMap<String,TagAndStyle>();
+ private static final TagAndStyle defaultParagraphStyle = new TagAndStyle("p", null);
+ static {
+ fixedParagraphStyles.put("Default", defaultParagraphStyle);
+ fixedParagraphStyles.put("Normal", defaultParagraphStyle);
+ fixedParagraphStyles.put("heading", new TagAndStyle("h1", null));
+ fixedParagraphStyles.put("Heading", new TagAndStyle("h1", null));
+ fixedParagraphStyles.put("Title", new TagAndStyle("h1", "title"));
+ fixedParagraphStyles.put("Subtitle", new TagAndStyle("h2", "subtitle"));
+ fixedParagraphStyles.put("HTML Preformatted", new TagAndStyle("pre", null));
+ }
/**
* Given a style name, return what tag should be used, and
* what style should be applied to it.
*/
public static TagAndStyle buildParagraphTagAndStyle(String styleName, boolean isTable) {
+ TagAndStyle tagAndStyle = fixedParagraphStyles.get(styleName);
+ if (tagAndStyle != null) {
+ return tagAndStyle;
+ }
+
+ if (styleName.equals("Table Contents") && isTable) {
+ return defaultParagraphStyle;
+ }
+
String tag = "p";
String styleClass = null;
-
- if(styleName.equals("Default") || styleName.equals("Normal")) {
- // Already setup
- } else if(styleName.equals("Table Contents") && isTable) {
- // Already setup
- } else if(styleName.equals("heading") || styleName.equals("Heading")) {
- tag = "h1";
- } else if(styleName.startsWith("heading") || styleName.startsWith("Heading")) {
- // "Heading 3" or "Heading2" or "heading 4"
- int num = 1;
- try {
- num = Integer.parseInt(
- styleName.substring(styleName.length()-1)
- );
- } catch(NumberFormatException e) {}
- // Turn it into a H1 - H6 (H7+ isn't valid!)
- tag = "h" + Math.min(num, 6);
- } else if(styleName.equals("Title")) {
- tag = "h1";
- styleClass = "title";
- } else if(styleName.equals("Subtitle")) {
- tag = "h2";
- styleClass = "subtitle";
- } else if(styleName.equals("HTML Preformatted")) {
- tag = "pre";
+
+ if(styleName.startsWith("heading") || styleName.startsWith("Heading")) {
+ // "Heading 3" or "Heading2" or "heading 4"
+ int num = 1;
+ try {
+ num = Integer.parseInt(
+ styleName.substring(styleName.length()-1)
+ );
+ } catch(NumberFormatException e) {}
+ // Turn it into a H1 - H6 (H7+ isn't valid!)
+ tag = "h" + Math.min(num, 6);
} else {
- styleClass = styleName.replace(' ', '_');
- styleClass = styleClass.substring(0,1).toLowerCase() +
- styleClass.substring(1);
+ styleClass = styleName.replace(' ', '_');
+ styleClass = styleClass.substring(0,1).toLowerCase() +
+ styleClass.substring(1);
}
return new TagAndStyle(tag,styleClass);