You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2011/10/12 21:18:46 UTC

svn commit: r1182534 - in /tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft: AbstractPOIFSExtractor.java ExcelExtractor.java OutlookExtractor.java WordExtractor.java

Author: mikemccand
Date: Wed Oct 12 19:18:45 2011
New Revision: 1182534

URL: http://svn.apache.org/viewvc?rev=1182534&view=rev
Log:
TIKA-751: some initial improvements to embedded office doc handling in AbstractPOIFSExtractor

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1182534&r1=1182533&r2=1182534&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Wed Oct 12 19:18:45 2011
@@ -34,6 +34,7 @@ import org.apache.poi.util.IOUtils;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -81,7 +82,7 @@ abstract class AbstractPOIFSExtractor {
     /**
      * Handle an office document that's embedded at the POIFS level
      */
-    protected void handleEmbededOfficeDoc(
+    protected void handleEmbeddedOfficeDoc(
             DirectoryEntry dir, XHTMLContentHandler xhtml)
             throws IOException, SAXException, TikaException {
         // Is it an embedded OLE2 document, or an embedded OOXML document?
@@ -103,56 +104,60 @@ abstract class AbstractPOIFSExtractor {
             // It's regular OLE2
         }
 
-       // Need to dump the directory out to a new temp file, so
-       //  it's stand along
-       POIFSFileSystem newFS = new POIFSFileSystem();
-       copy(dir, newFS.getRoot());
+        // What kind of document is it?
+        Metadata metadata = new Metadata();
+        POIFSDocumentType type = POIFSDocumentType.detectType(dir);
+        TikaInputStream embedded = null;
 
-       File tmpFile = File.createTempFile("tika", ".ole2");
-       try {
-           FileOutputStream out = new FileOutputStream(tmpFile);
-           newFS.writeFilesystem(out);
-           out.close();
-
-           // What kind of document is it?
-           Metadata metadata = new Metadata();
-           POIFSDocumentType type = POIFSDocumentType.detectType(dir);
-
-           TikaInputStream embedded;
-
-           if (type==POIFSDocumentType.OLE10_NATIVE) {
-               Entry entry = dir.getEntry(Ole10Native.OLE10_NATIVE);
-               ByteArrayOutputStream bos = new ByteArrayOutputStream();
-               IOUtils.copy(new DocumentInputStream((DocumentEntry) entry), bos);
-               byte[] data = bos.toByteArray();
+        TemporaryResources tmp = new TemporaryResources();
+        try {
+            if (type == POIFSDocumentType.OLE10_NATIVE) {
+                Entry entry = dir.getEntry(Ole10Native.OLE10_NATIVE);
+                ByteArrayOutputStream bos = new ByteArrayOutputStream();
+                IOUtils.copy(new DocumentInputStream((DocumentEntry) entry), bos);
+                byte[] data = bos.toByteArray();
 
-               try {
+                try {
                     Ole10Native ole = new Ole10Native(data, 0);
                     byte[] dataBuffer = ole.getDataBuffer();
 
                     metadata.set("resourceName", dir.getName() + '/' + ole.getLabel());
 
                     embedded = TikaInputStream.get(dataBuffer);
-               } catch (Ole10NativeException ex) {
-                 embedded = TikaInputStream.get(data);
-               }
-           } else {
-               metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
-               metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
-
-               embedded = TikaInputStream.get(tmpFile);
-           }
+                } catch (Ole10NativeException ex) {
+                    embedded = TikaInputStream.get(data);
+                }
+                tmp.addResource(embedded);
+            } else {
+                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
+                metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
+            }
 
-           try {
-               if (extractor.shouldParseEmbedded(metadata)) {
-                   extractor.parseEmbedded(embedded, xhtml, metadata, true);
-               }
-           } finally {
-               embedded.close();
-           }
-       } finally {
-           tmpFile.delete();
-       }
+            // Should we parse it?
+            if (extractor.shouldParseEmbedded(metadata)) {
+                if (embedded == null) {
+                    // Need to dump the directory out to a new temp file, so
+                    // that it is standalone
+
+                    // TODO: can/should we use NPOIFSFileSystem here?
+                    POIFSFileSystem newFS = new POIFSFileSystem();
+                    copy(dir, newFS.getRoot());
+                    File tmpFile = tmp.createTemporaryFile();
+                    FileOutputStream out = new FileOutputStream(tmpFile);
+                    try {
+                        newFS.writeFilesystem(out);
+                    } finally {
+                        out.close();
+                    }
+
+                    embedded = TikaInputStream.get(tmpFile);
+                    tmp.addResource(embedded);
+                }
+                extractor.parseEmbedded(embedded, xhtml, metadata, true);
+            }
+        } finally {
+            tmp.dispose();
+        }
     }
 
     protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
@@ -164,8 +169,12 @@ abstract class AbstractPOIFSExtractor {
                 copy((DirectoryEntry)entry, newDir);
             } else {
                 // Copy entry
-                InputStream contents = new DocumentInputStream((DocumentEntry)entry); 
-                destDir.createDocument(entry.getName(), contents);
+                InputStream contents = new DocumentInputStream((DocumentEntry)entry);
+                try {
+                    destDir.createDocument(entry.getName(), contents);
+                } finally {
+                    contents.close();
+                }
             }
         }
     }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=1182534&r1=1182533&r2=1182534&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Wed Oct 12 19:18:45 2011
@@ -141,7 +141,7 @@ public class ExcelExtractor extends Abst
             if (entry.getName().startsWith("MBD")
                     && entry instanceof DirectoryEntry) {
                 try {
-                    handleEmbededOfficeDoc((DirectoryEntry) entry, xhtml);
+                    handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
                 } catch (TikaException e) {
                     // ignore parse errors from embedded documents
                 }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1182534&r1=1182533&r2=1182534&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Wed Oct 12 19:18:45 2011
@@ -231,7 +231,7 @@ public class OutlookExtractor extends Ab
                   );
                }
                if(attachment.attachmentDirectory != null) {
-                  handleEmbededOfficeDoc(
+                  handleEmbeddedOfficeDoc(
                         attachment.attachmentDirectory.getDirectory(),
                         xhtml
                   );

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1182534&r1=1182533&r2=1182534&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Wed Oct 12 19:18:45 2011
@@ -50,7 +50,6 @@ import org.xml.sax.helpers.AttributesImp
 
 public class WordExtractor extends AbstractPOIFSExtractor {
 
-    private static final char RECORD_SEPARATOR = 30;
     private static final char UNICODECHAR_NONBREAKING_HYPHEN = '\u2011';
     private static final char UNICODECHAR_ZERO_WIDTH_SPACE = '\u200b';
 
@@ -121,7 +120,7 @@ public class WordExtractor extends Abstr
             for (Entry entry : op) {
                 if (entry.getName().startsWith("_")
                         && entry instanceof DirectoryEntry) {
-                    handleEmbededOfficeDoc((DirectoryEntry) entry, xhtml);
+                    handleEmbeddedOfficeDoc((DirectoryEntry) entry, xhtml);
                 }
             }
         } catch(FileNotFoundException e) {