You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2009/07/18 01:03:25 UTC
svn commit: r795266 - /lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
Author: jukka
Date: Fri Jul 17 23:03:25 2009
New Revision: 795266
URL: http://svn.apache.org/viewvc?rev=795266&view=rev
Log:
TIKA-262: ParsingReader does not parse metadata for larger MS Office documents
Changes based on patch contributed by Daan de Wit.
Modified:
lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
Modified: lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=795266&r1=795265&r2=795266&view=diff
==============================================================================
--- lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ lucene/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Fri Jul 17 23:03:25 2009
@@ -16,6 +16,7 @@
*/
package org.apache.tika.parser.microsoft;
+import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.Date;
@@ -62,17 +63,22 @@
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
- boolean outlookExtracted = false;
POIFSFileSystem filesystem = new POIFSFileSystem(stream);
+
+ // Parse summary entries first, to make metadata available early
+ parseSummaryEntryIfExists(
+ filesystem, SUMMARY_INFORMATION, metadata);
+ parseSummaryEntryIfExists(
+ filesystem, DOCUMENT_SUMMARY_INFORMATION, metadata);
+
+ // Parse remaining document entries
+ boolean outlookExtracted = false;
Iterator<?> entries = filesystem.getRoot().getEntries();
while (entries.hasNext()) {
Entry entry = (Entry) entries.next();
String name = entry.getName();
if (!(entry instanceof DocumentEntry)) {
// Skip directory entries
- } else if (SUMMARY_INFORMATION.equals(name)
- || DOCUMENT_SUMMARY_INFORMATION.equals(name)) {
- parse((DocumentEntry) entry, metadata);
} else if ("WordDocument".equals(name)) {
setType(metadata, "application/msword");
WordExtractor extractor = new WordExtractor(filesystem);
@@ -107,9 +113,12 @@
xhtml.endDocument();
}
- public void parse(DocumentEntry entry, Metadata metadata)
+ private void parseSummaryEntryIfExists(
+ POIFSFileSystem filesystem, String entryName, Metadata metadata)
throws IOException, TikaException {
try {
+ DocumentEntry entry =
+ (DocumentEntry) filesystem.getRoot().getEntry(entryName);
PropertySet properties =
new PropertySet(new DocumentInputStream(entry));
if (properties.isSummaryInformation()) {
@@ -118,6 +127,8 @@
if (properties.isDocumentSummaryInformation()) {
parse(new DocumentSummaryInformation(properties), metadata);
}
+ } catch (FileNotFoundException e) {
+ // entry does not exist, just skip it
} catch (NoPropertySetStreamException e) {
throw new TikaException("Not a HPSF document", e);
} catch (UnexpectedPropertySetTypeException e) {