You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2011/10/12 21:18:46 UTC
svn commit: r1182534 - in
/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft:
AbstractPOIFSExtractor.java ExcelExtractor.java OutlookExtractor.java
WordExtractor.java
Author: mikemccand
Date: Wed Oct 12 19:18:45 2011
New Revision: 1182534
URL: http://svn.apache.org/viewvc?rev=1182534&view=rev
Log:
TIKA-751: some initial improvements to embedded office doc handling in AbstractPOIFSExtractor
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1182534&r1=1182533&r2=1182534&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Wed Oct 12 19:18:45 2011
@@ -34,6 +34,7 @@ import org.apache.poi.util.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -81,7 +82,7 @@ abstract class AbstractPOIFSExtractor {
/**
* Handle an office document that's embedded at the POIFS level
*/
- protected void handleEmbededOfficeDoc(
+ protected void handleEmbeddedOfficeDoc(
DirectoryEntry dir, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
// Is it an embedded OLE2 document, or an embedded OOXML document?
@@ -103,56 +104,60 @@ abstract class AbstractPOIFSExtractor {
// It's regular OLE2
}
- // Need to dump the directory out to a new temp file, so
- // it's stand along
- POIFSFileSystem newFS = new POIFSFileSystem();
- copy(dir, newFS.getRoot());
+ // What kind of document is it?
+ Metadata metadata = new Metadata();
+ POIFSDocumentType type = POIFSDocumentType.detectType(dir);
+ TikaInputStream embedded = null;
- File tmpFile = File.createTempFile("tika", ".ole2");
- try {
- FileOutputStream out = new FileOutputStream(tmpFile);
- newFS.writeFilesystem(out);
- out.close();
-
- // What kind of document is it?
- Metadata metadata = new Metadata();
- POIFSDocumentType type = POIFSDocumentType.detectType(dir);
-
- TikaInputStream embedded;
-
- if (type==POIFSDocumentType.OLE10_NATIVE) {
- Entry entry = dir.getEntry(Ole10Native.OLE10_NATIVE);
- ByteArrayOutputStream bos = new ByteArrayOutputStream();
- IOUtils.copy(new DocumentInputStream((DocumentEntry) entry), bos);
- byte[] data = bos.toByteArray();
+ TemporaryResources tmp = new TemporaryResources();
+ try {
+ if (type == POIFSDocumentType.OLE10_NATIVE) {
+ Entry entry = dir.getEntry(Ole10Native.OLE10_NATIVE);
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ IOUtils.copy(new DocumentInputStream((DocumentEntry) entry), bos);
+ byte[] data = bos.toByteArray();
- try {
+ try {
Ole10Native ole = new Ole10Native(data, 0);
byte[] dataBuffer = ole.getDataBuffer();
metadata.set("resourceName", dir.getName() + '/' + ole.getLabel());
embedded = TikaInputStream.get(dataBuffer);
- } catch (Ole10NativeException ex) {
- embedded = TikaInputStream.get(data);
- }
- } else {
- metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
- metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
-
- embedded = TikaInputStream.get(tmpFile);
- }
+ } catch (Ole10NativeException ex) {
+ embedded = TikaInputStream.get(data);
+ }
+ tmp.addResource(embedded);
+ } else {
+ metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
+ metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
+ }
- try {
- if (extractor.shouldParseEmbedded(metadata)) {
- extractor.parseEmbedded(embedded, xhtml, metadata, true);
- }
- } finally {
- embedded.close();
- }
- } finally {
- tmpFile.delete();
- }
+ // Should we parse it?
+ if (extractor.shouldParseEmbedded(metadata)) {
+ if (embedded == null) {
+ // Need to dump the directory out to a new temp file, so
+ // it's stand alone
+
+ // TODO: can/should we use NPOIFSFileSystem here?
+ POIFSFileSystem newFS = new POIFSFileSystem();
+ copy(dir, newFS.getRoot());
+ File tmpFile = tmp.createTemporaryFile();
+ FileOutputStream out = new FileOutputStream(tmpFile);
+ try {
+ newFS.writeFilesystem(out);
+ } finally {
+ out.close();
+ }
+
+ embedded = TikaInputStream.get(tmpFile);
+ tmp.addResource(embedded);
+ }
+ extractor.parseEmbedded(embedded, xhtml, metadata, true);
+ }
+ } finally {
+ tmp.dispose();
+ }
}
protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
@@ -164,8 +169,12 @@ abstract class AbstractPOIFSExtractor {
copy((DirectoryEntry)entry, newDir);
} else {
// Copy entry
- InputStream contents = new DocumentInputStream((DocumentEntry)entry);
- destDir.createDocument(entry.getName(), contents);
+ InputStream contents = new DocumentInputStream((DocumentEntry)entry);
+ try {
+ destDir.createDocument(entry.getName(), contents);
+ } finally {
+ contents.close();
+ }
}
}
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=1182534&r1=1182533&r2=1182534&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Wed Oct 12 19:18:45 2011
@@ -141,7 +141,7 @@ public class ExcelExtractor extends Abst
if (entry.getName().startsWith("MBD")
&& entry instanceof DirectoryEntry) {
try {
- handleEmbededOfficeDoc((DirectoryEntry) entry, xhtml);
+ handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
} catch (TikaException e) {
// ignore parse errors from embedded documents
}
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1182534&r1=1182533&r2=1182534&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Wed Oct 12 19:18:45 2011
@@ -231,7 +231,7 @@ public class OutlookExtractor extends Ab
);
}
if(attachment.attachmentDirectory != null) {
- handleEmbededOfficeDoc(
+ handleEmbeddedOfficeDoc(
attachment.attachmentDirectory.getDirectory(),
xhtml
);
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1182534&r1=1182533&r2=1182534&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Wed Oct 12 19:18:45 2011
@@ -50,7 +50,6 @@ import org.xml.sax.helpers.AttributesImp
public class WordExtractor extends AbstractPOIFSExtractor {
- private static final char RECORD_SEPARATOR = 30;
private static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011';
private static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b';
@@ -121,7 +120,7 @@ public class WordExtractor extends Abstr
for (Entry entry : op) {
if (entry.getName().startsWith("_")
&& entry instanceof DirectoryEntry) {
- handleEmbededOfficeDoc((DirectoryEntry) entry, xhtml);
+ handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
}
}
} catch(FileNotFoundException e) {