You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by mi...@apache.org on 2011/10/17 17:00:22 UTC

svn commit: r1185234 - in /tika/trunk: ./ tika-core/src/main/java/org/apache/tika/extractor/ tika-core/src/main/java/org/apache/tika/io/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/

Author: mikemccand
Date: Mon Oct 17 15:00:22 2011
New Revision: 1185234

URL: http://svn.apache.org/viewvc?rev=1185234&view=rev
Log:
TIKA-753: speed up processing of embedded office docs

Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java

Modified: tika/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Mon Oct 17 15:00:22 2011
@@ -15,6 +15,8 @@ Release 0.11 - Current Development
  * TIKA-742: Paragraphs are now extracted within each page of a PDF
    document.
 
+ * TIKA-753: Improve performance when extracting embedded office docs.
+
 Release 0.10 - 09/25/2011
 
 The most notable changes in Tika 0.10 over previous releases are:

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java Mon Oct 17 15:00:22 2011
@@ -16,8 +16,6 @@
  */
 package org.apache.tika.extractor;
 
-import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
-
 import java.io.File;
 import java.io.FilenameFilter;
 import java.io.IOException;
@@ -37,6 +35,8 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
+import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
+
 /**
  * Helper class for parsers of package archives or other compound document
  * formats that support embedded or attached component documents.
@@ -92,11 +92,26 @@ public class ParsingEmbeddedDocumentExtr
         // Use the delegate parser to parse this entry
         TemporaryResources tmp = new TemporaryResources();
         try {
+            final TikaInputStream newStream = TikaInputStream.get(new CloseShieldInputStream(stream), tmp);
+            if (stream instanceof TikaInputStream) {
+                final Object container = ((TikaInputStream) stream).getOpenContainer();
+
+                // TODO: we can't let ZipPackage through,
+                // because of POI bug 51949.  This is less
+                // efficient because the inner parser will
+                // have to re-open the zip archive again.
+                // Once we upgrade to POI 3.8 beta 5 we can
+                // remove this:
+                if ((container != null && !(container.getClass().getSimpleName().equals("ZipPackage")))) {
+                    newStream.setOpenContainer(container);
+                }
+            }
             DELEGATING_PARSER.parse(
-                    TikaInputStream.get(new CloseShieldInputStream(stream), tmp),
-                    new EmbeddedContentHandler(new BodyContentHandler(handler)),
-                    metadata, context);
+                                    newStream,
+                                    new EmbeddedContentHandler(new BodyContentHandler(handler)),
+                                    metadata, context);
         } catch (TikaException e) {
+            // TODO: can we log a warning somehow?
             // Could not parse the entry, just skip the content
         } finally {
             tmp.close();

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java Mon Oct 17 15:00:22 2011
@@ -649,12 +649,15 @@ public class TikaInputStream extends Tag
     }
 
     public String toString() {
-       String str = "TikaInputStream of ";
-       if(hasFile()) {
-          str += file.toString();
-       } else {
-          str += in.toString();
-       }
-       return str;
+        String str = "TikaInputStream of ";
+        if (hasFile()) {
+            str += file.toString();
+        } else {
+            str += in.toString();
+        }
+        if (openContainer != null) {
+            str += " (in " + openContainer + ")";
+        }
+        return str;
     }
 }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Mon Oct 17 15:00:22 2011
@@ -17,11 +17,7 @@
 package org.apache.tika.parser.microsoft;
 
 import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
 import java.io.IOException;
-import java.io.InputStream;
 
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DocumentEntry;
@@ -29,12 +25,10 @@ import org.apache.poi.poifs.filesystem.D
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.Ole10Native;
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.IOUtils;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
-import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -64,11 +58,11 @@ abstract class AbstractPOIFSExtractor {
        try {
            Metadata metadata = new Metadata();
            if(filename != null) {
-              metadata.set(Metadata.TIKA_MIME_FILE, filename);
-              metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+               metadata.set(Metadata.TIKA_MIME_FILE, filename);
+               metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
            }
            if(mediaType != null) {
-              metadata.set(Metadata.CONTENT_TYPE, mediaType);
+               metadata.set(Metadata.CONTENT_TYPE, mediaType);
            }
 
            if (extractor.shouldParseEmbedded(metadata)) {
@@ -85,11 +79,13 @@ abstract class AbstractPOIFSExtractor {
     protected void handleEmbeddedOfficeDoc(
             DirectoryEntry dir, XHTMLContentHandler xhtml)
             throws IOException, SAXException, TikaException {
+
         // Is it an embedded OLE2 document, or an embedded OOXML document?
-        try {
+
+        if (dir.hasEntry("Package")) {
+            // It's OOXML (has a ZipFile):
             Entry ooxml = dir.getEntry("Package");
 
-            // It's OOXML
             TikaInputStream stream = TikaInputStream.get(
                     new DocumentInputStream((DocumentEntry) ooxml));
             try {
@@ -100,34 +96,35 @@ abstract class AbstractPOIFSExtractor {
             } finally {
                 stream.close();
             }
-        } catch(FileNotFoundException e) {
-            // It's regular OLE2
         }
 
+        // It's regular OLE2:
+
         // What kind of document is it?
         Metadata metadata = new Metadata();
         POIFSDocumentType type = POIFSDocumentType.detectType(dir);
         TikaInputStream embedded = null;
 
-        TemporaryResources tmp = new TemporaryResources();
         try {
             if (type == POIFSDocumentType.OLE10_NATIVE) {
                 Entry entry = dir.getEntry(Ole10Native.OLE10_NATIVE);
                 ByteArrayOutputStream bos = new ByteArrayOutputStream();
+
+                // TODO: once we upgrade to POI 3.8 beta 5
+                // we can avoid this full copy/serialize by
+                // passing the DirectoryNode instead:
                 IOUtils.copy(new DocumentInputStream((DocumentEntry) entry), bos);
                 byte[] data = bos.toByteArray();
 
                 try {
+                    // Maybe unwrap OLE10Native record:
                     Ole10Native ole = new Ole10Native(data, 0);
-                    byte[] dataBuffer = ole.getDataBuffer();
-
-                    metadata.set("resourceName", dir.getName() + '/' + ole.getLabel());
-
-                    embedded = TikaInputStream.get(dataBuffer);
+                    data = ole.getDataBuffer();
+                    metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '/' + ole.getLabel());
                 } catch (Ole10NativeException ex) {
-                    embedded = TikaInputStream.get(data);
+                    // Not an OLE10Native record
                 }
-                tmp.addResource(embedded);
+                embedded = TikaInputStream.get(data);
             } else {
                 metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
                 metadata.set(Metadata.RESOURCE_NAME_KEY, dir.getName() + '.' + type.getExtension());
@@ -136,45 +133,18 @@ abstract class AbstractPOIFSExtractor {
             // Should we parse it?
             if (extractor.shouldParseEmbedded(metadata)) {
                 if (embedded == null) {
-                    // Need to dump the directory out to a new temp file, so
-                    // it's stand alone
-
-                    // TODO: can/should we use NPOIFileSystem here?
-                    POIFSFileSystem newFS = new POIFSFileSystem();
-                    copy(dir, newFS.getRoot());
-                    File tmpFile = tmp.createTemporaryFile();
-                    FileOutputStream out = new FileOutputStream(tmpFile);
-                    try {
-                        newFS.writeFilesystem(out);
-                    } finally {
-                        out.close();
-                    }
-
-                    embedded = TikaInputStream.get(tmpFile);
-                    tmp.addResource(embedded);
+                    // Make a TikaInputStream that just
+                    // passes the root directory of the
+                    // embedded document, and is otherwise
+                    // empty (byte[0]):
+                    embedded = TikaInputStream.get(new byte[0]);
+                    embedded.setOpenContainer(dir);
                 }
                 extractor.parseEmbedded(embedded, xhtml, metadata, true);
             }
         } finally {
-            tmp.dispose();
-        }
-    }
-
-    protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir)
-            throws IOException {
-        for (Entry entry : sourceDir) {
-            if (entry instanceof DirectoryEntry) {
-                // Need to recurse
-                DirectoryEntry newDir = destDir.createDirectory(entry.getName());
-                copy((DirectoryEntry)entry, newDir);
-            } else {
-                // Copy entry
-                InputStream contents = new DocumentInputStream((DocumentEntry)entry);
-                try {
-                    destDir.createDocument(entry.getName(), contents);
-                } finally {
-                    contents.close();
-                }
+            if (embedded != null) {
+                embedded.close();
             }
         }
     }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Mon Oct 17 15:00:22 2011
@@ -57,6 +57,7 @@ import org.apache.poi.hssf.record.TextOb
 import org.apache.poi.hssf.record.chart.SeriesTextRecord;
 import org.apache.poi.hssf.record.common.UnicodeString;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
@@ -133,11 +134,17 @@ public class ExcelExtractor extends Abst
     protected void parse(
             NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml,
             Locale locale) throws IOException, SAXException, TikaException {
+        parse(filesystem.getRoot(), xhtml, locale);
+    }
+
+    protected void parse(
+            DirectoryNode root, XHTMLContentHandler xhtml,
+            Locale locale) throws IOException, SAXException, TikaException {
         TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this);
-        listener.processFile(filesystem, isListenForAllRecords());
+        listener.processFile(root, isListenForAllRecords());
         listener.throwStoredException();
 
-        for (Entry entry : filesystem.getRoot()) {
+        for (Entry entry : root) {
             if (entry.getName().startsWith("MBD")
                     && entry instanceof DirectoryEntry) {
                 try {
@@ -246,6 +253,11 @@ public class ExcelExtractor extends Abst
          */
     	public void processFile(NPOIFSFileSystem filesystem, boolean listenForAllRecords)
     		throws IOException, SAXException, TikaException {
+            processFile(filesystem.getRoot(), listenForAllRecords);
+        }
+
+    	public void processFile(DirectoryNode root, boolean listenForAllRecords)
+    		throws IOException, SAXException, TikaException {
 
     		// Set up listener and register the records we want to process
             HSSFRequest hssfRequest = new HSSFRequest();
@@ -272,7 +284,7 @@ public class ExcelExtractor extends Abst
             }
 
             // Create event factory and process Workbook (fire events)
-            DocumentInputStream documentInputStream = filesystem.createDocumentInputStream("Workbook");
+            DocumentInputStream documentInputStream = root.createDocumentInputStream("Workbook");
             HSSFEventFactory eventFactory = new HSSFEventFactory();
             try {
                 eventFactory.processEvents(hssfRequest, documentInputStream);

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Mon Oct 17 15:00:22 2011
@@ -29,6 +29,7 @@ import org.apache.poi.hslf.model.Slide;
 import org.apache.poi.hslf.model.TextRun;
 import org.apache.poi.hslf.usermodel.ObjectData;
 import org.apache.poi.hslf.usermodel.SlideShow;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
@@ -44,7 +45,13 @@ public class HSLFExtractor extends Abstr
    protected void parse(
          NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
          throws IOException, SAXException, TikaException {
-      HSLFSlideShow ss = new HSLFSlideShow(filesystem.getRoot());
+       parse(filesystem.getRoot(), xhtml);
+   }
+    
+   protected void parse(
+         DirectoryNode root, XHTMLContentHandler xhtml)
+         throws IOException, SAXException, TikaException {
+      HSLFSlideShow ss = new HSLFSlideShow(root);
       SlideShow _show = new SlideShow(ss);
       Slide[] _slides = _show.getSlides();
 

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java Mon Oct 17 15:00:22 2011
@@ -21,8 +21,10 @@ import java.io.InputStream;
 import java.security.GeneralSecurityException;
 import java.util.Arrays;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Locale;
+import java.util.Map;
 import java.util.Set;
 
 import org.apache.poi.hdgf.extractor.VisioTextExtractor;
@@ -30,6 +32,7 @@ import org.apache.poi.hpbf.extractor.Pub
 import org.apache.poi.poifs.crypt.Decryptor;
 import org.apache.poi.poifs.crypt.EncryptionInfo;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -115,37 +118,27 @@ public class OfficeParser extends Abstra
             return UNKNOWN;
         }
 
+        private final static Map<String,POIFSDocumentType> typeMap = new HashMap<String,POIFSDocumentType>();
+        static {
+            typeMap.put("Workbook", WORKBOOK);
+            typeMap.put("EncryptedPackage", ENCRYPTED);
+            typeMap.put("WordDocument", WORDDOCUMENT);
+            typeMap.put("Quill", PUBLISHER);
+            typeMap.put("PowerPoint Document", POWERPOINT);
+            typeMap.put("VisioDocument", VISIO);
+            typeMap.put("CONTENTS", WORKS);
+            typeMap.put("\u0001Ole10Native", POIFSDocumentType.OLE10_NATIVE);
+        }
+
         public static POIFSDocumentType detectType(Entry entry) {
             String name = entry.getName();
-
-            if ("Workbook".equals(name)) {
-                return WORKBOOK;
-            }
-            if ("EncryptedPackage".equals(name)) {
-                return ENCRYPTED;
-            }
-            if ("WordDocument".equals(name)) {
-                return WORDDOCUMENT;
+            POIFSDocumentType type = typeMap.get(name);
+            if (type != null) {
+                return type;
             }
-            if ("Quill".equals(name)) {
-                return PUBLISHER;
-            }
-            if ("PowerPoint Document".equals(entry.getName())) {
-                return POWERPOINT;
-            }
-            if ("VisioDocument".equals(entry.getName())) {
-                return VISIO;
-            }
-            if ("CONTENTS".equals(entry.getName())) {
-               return WORKS;
-           }
             if (entry.getName().startsWith("__substg1.0_")) {
                 return OUTLOOK;
             }
-            if ("\u0001Ole10Native".equals(name)) {
-              return POIFSDocumentType.OLE10_NATIVE;
-            }
-
             return UNKNOWN;
         }
     }
@@ -164,26 +157,36 @@ public class OfficeParser extends Abstra
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
 
-        NPOIFSFileSystem filesystem;
+        final DirectoryNode root;
         TikaInputStream tstream = TikaInputStream.cast(stream);
         if (tstream == null) {
-            filesystem =
-                new NPOIFSFileSystem(new CloseShieldInputStream(stream));
-        } else if (tstream.getOpenContainer() instanceof NPOIFSFileSystem) {
-            filesystem = (NPOIFSFileSystem) tstream.getOpenContainer();
-        } else if (tstream.hasFile()) {
-            filesystem = new NPOIFSFileSystem(tstream.getFileChannel());
+            root = new NPOIFSFileSystem(new CloseShieldInputStream(stream)).getRoot();
         } else {
-            filesystem =
-                new NPOIFSFileSystem(new CloseShieldInputStream(tstream));
+            final Object container = tstream.getOpenContainer();
+            if (container instanceof NPOIFSFileSystem) {
+                root = ((NPOIFSFileSystem) container).getRoot();
+            } else if (container instanceof DirectoryNode) {
+                root = (DirectoryNode) container;
+            } else if (tstream.hasFile()) {
+                root = new NPOIFSFileSystem(tstream.getFileChannel()).getRoot();
+            } else {
+                root = new NPOIFSFileSystem(new CloseShieldInputStream(tstream)).getRoot();
+            }
         }
+        parse(root, context, metadata, xhtml);
+        xhtml.endDocument();
+    }
+
+    protected void parse(
+            DirectoryNode root, ParseContext context, Metadata metadata, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
 
         // Parse summary entries first, to make metadata available early
-        new SummaryExtractor(metadata).parseSummaries(filesystem);
+        new SummaryExtractor(metadata).parseSummaries(root);
 
         // Parse remaining document entries
         boolean outlookExtracted = false;
-        for (Entry entry : filesystem.getRoot()) {
+        for (Entry entry : root) {
             POIFSDocumentType type = POIFSDocumentType.detectType(entry);
 
             if (type!=POIFSDocumentType.UNKNOWN) {
@@ -193,22 +196,22 @@ public class OfficeParser extends Abstra
             switch (type) {
                 case PUBLISHER:
                     PublisherTextExtractor publisherTextExtractor =
-                        new PublisherTextExtractor(filesystem);
+                        new PublisherTextExtractor(root);
                     xhtml.element("p", publisherTextExtractor.getText());
                     break;
                 case WORDDOCUMENT:
-                    new WordExtractor(context).parse(filesystem, xhtml);
+                    new WordExtractor(context).parse(root, xhtml);
                     break;
                 case POWERPOINT:
-                    new HSLFExtractor(context).parse(filesystem, xhtml);
+                    new HSLFExtractor(context).parse(root, xhtml);
                     break;
                 case WORKBOOK:
                     Locale locale = context.get(Locale.class, Locale.getDefault());
-                    new ExcelExtractor(context).parse(filesystem, xhtml, locale);
+                    new ExcelExtractor(context).parse(root, xhtml, locale);
                     break;
                 case VISIO:
                     VisioTextExtractor visioTextExtractor =
-                        new VisioTextExtractor(filesystem);
+                        new VisioTextExtractor(root);
                     for (String text : visioTextExtractor.getAllText()) {
                         xhtml.element("p", text);
                     }
@@ -218,13 +221,13 @@ public class OfficeParser extends Abstra
                         outlookExtracted = true;
 
                         OutlookExtractor extractor =
-                            new OutlookExtractor(filesystem, context);
+                            new OutlookExtractor(root, context);
 
                         extractor.parse(xhtml, metadata);
                     }
                     break;
                 case ENCRYPTED:
-                    EncryptionInfo info = new EncryptionInfo(filesystem);
+                    EncryptionInfo info = new EncryptionInfo(root);
                     Decryptor d = Decryptor.getInstance(info);
 
                     try {
@@ -234,7 +237,7 @@ public class OfficeParser extends Abstra
 
                         OOXMLParser parser = new OOXMLParser();
 
-                        parser.parse(d.getDataStream(filesystem), new EmbeddedContentHandler(
+                        parser.parse(d.getDataStream(root), new EmbeddedContentHandler(
                                         new BodyContentHandler(xhtml)),
                                         metadata, context);
                     } catch (GeneralSecurityException ex) {
@@ -242,8 +245,6 @@ public class OfficeParser extends Abstra
                     }
             }
         }
-
-        xhtml.endDocument();
     }
 
     private void setType(Metadata metadata, MediaType type) {

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java Mon Oct 17 15:00:22 2011
@@ -30,6 +30,7 @@ import org.apache.poi.hsmf.datatypes.MAP
 import org.apache.poi.hsmf.datatypes.StringChunk;
 import org.apache.poi.hsmf.datatypes.Types;
 import org.apache.poi.hsmf.exceptions.ChunkNotFoundException;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
@@ -51,10 +52,14 @@ public class OutlookExtractor extends Ab
     private final MAPIMessage msg;
 
     public OutlookExtractor(NPOIFSFileSystem filesystem, ParseContext context) throws TikaException {
+        this(filesystem.getRoot(), context);
+    }
+
+    public OutlookExtractor(DirectoryNode root, ParseContext context) throws TikaException {
         super(context);
         
         try {
-            this.msg = new MAPIMessage(filesystem.getRoot());
+            this.msg = new MAPIMessage(root);
         } catch (IOException e) {
             throw new TikaException("Failed to parse Outlook message", e);
         }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java Mon Oct 17 15:00:22 2011
@@ -16,8 +16,6 @@
  */
 package org.apache.tika.parser.microsoft;
 
-import static org.apache.tika.mime.MediaType.application;
-
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.channels.FileChannel;
@@ -25,6 +23,7 @@ import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;
 
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.tika.detect.Detector;
@@ -32,6 +31,8 @@ import org.apache.tika.io.TikaInputStrea
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 
+import static org.apache.tika.mime.MediaType.application;
+
 /**
  * A detector that works on a POIFS OLE2 document
  *  to figure out exactly what the file is.
@@ -76,24 +77,42 @@ public class POIFSContainerDetector impl
             return MediaType.OCTET_STREAM;
         }
 
-        // Check if the document starts with the OLE header
-        input.mark(8);
-        try {
-            if (input.read() != 0xd0 || input.read() != 0xcf
+        // If this is a TikaInputStream wrapping an already
+        // parsed NPOIFSFileSystem/DirectoryNode, just get the
+        // names from the root:
+        TikaInputStream tis = TikaInputStream.cast(input);
+        Set<String> names = null;
+        if (tis != null) {
+            Object container = tis.getOpenContainer();
+            if (container instanceof NPOIFSFileSystem) {
+                names = getTopLevelNames(((NPOIFSFileSystem) container).getRoot());
+            } else if (container instanceof DirectoryNode) {
+                names = getTopLevelNames((DirectoryNode) container);
+            }
+        }
+
+        if (names == null) {
+            // Check if the document starts with the OLE header
+            input.mark(8);
+            try {
+                if (input.read() != 0xd0 || input.read() != 0xcf
                     || input.read() != 0x11 || input.read() != 0xe0
                     || input.read() != 0xa1 || input.read() != 0xb1
                     || input.read() != 0x1a || input.read() != 0xe1) {
-                return MediaType.OCTET_STREAM;
+                    return MediaType.OCTET_STREAM;
+                }
+            } finally {
+                input.reset();
             }
-        } finally {
-            input.reset();
         }
 
         // We can only detect the exact type when given a TikaInputStream
-        TikaInputStream tis = TikaInputStream.cast(input);
-        if (tis != null) {
+        if (names == null && tis != null) {
             // Look for known top level entry names to detect the document type
-            Set<String> names = getTopLevelNames(tis);
+            names = getTopLevelNames(tis);
+        }
+
+        if (names != null) {
             if (names.contains("Workbook")) {
                 return XLS;
             } else if (names.contains("EncryptedPackage")) {
@@ -149,11 +168,7 @@ public class POIFSContainerDetector impl
             // a reference to the already opened POI file system
             stream.setOpenContainer(fs);
 
-            Set<String> names = new HashSet<String>();
-            for (Entry entry : fs.getRoot()) {
-                names.add(entry.getName());
-            }
-            return names;
+            return getTopLevelNames(fs.getRoot());
         } catch (IOException e) {
             // Parse error in POI, so we don't know the file type
             return Collections.emptySet();
@@ -163,4 +178,11 @@ public class POIFSContainerDetector impl
         }
     }
 
+    private static Set<String> getTopLevelNames(DirectoryNode root) {
+        Set<String> names = new HashSet<String>();
+        for (Entry entry : root) {
+            names.add(entry.getName());
+        }
+        return names;
+    }
 }

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java Mon Oct 17 15:00:22 2011
@@ -27,6 +27,7 @@ import org.apache.poi.hpsf.NoPropertySet
 import org.apache.poi.hpsf.PropertySet;
 import org.apache.poi.hpsf.SummaryInformation;
 import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.DocumentEntry;
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
@@ -54,16 +55,21 @@ class SummaryExtractor {
 
     public void parseSummaries(NPOIFSFileSystem filesystem)
             throws IOException, TikaException {
-        parseSummaryEntryIfExists(filesystem, SUMMARY_INFORMATION);
-        parseSummaryEntryIfExists(filesystem, DOCUMENT_SUMMARY_INFORMATION);
+        parseSummaries(filesystem.getRoot());
+    }
+
+    public void parseSummaries(DirectoryNode root)
+            throws IOException, TikaException {
+        parseSummaryEntryIfExists(root, SUMMARY_INFORMATION);
+        parseSummaryEntryIfExists(root, DOCUMENT_SUMMARY_INFORMATION);
     }
 
     private void parseSummaryEntryIfExists(
-            NPOIFSFileSystem filesystem, String entryName)
+            DirectoryNode root, String entryName)
             throws IOException, TikaException {
         try {
             DocumentEntry entry =
-                (DocumentEntry) filesystem.getRoot().getEntry(entryName);
+                (DocumentEntry) root.getEntry(entryName);
             PropertySet properties =
                 new PropertySet(new DocumentInputStream(entry));
             if (properties.isSummaryInformation()) {
@@ -134,7 +140,7 @@ class SummaryExtractor {
      * Attempt to parse custom document properties and add to the collection of metadata
      * @param customProperties
      */
-    private void parse(CustomProperties customProperties){
+    private void parse(CustomProperties customProperties) {
         if (customProperties != null) {
             for (String name : customProperties.nameSet()) {
                 // Apply the custom prefix

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1185234&r1=1185233&r2=1185234&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Mon Oct 17 15:00:22 2011
@@ -39,6 +39,7 @@ import org.apache.poi.hwpf.usermodel.Tab
 import org.apache.poi.hwpf.usermodel.TableCell;
 import org.apache.poi.hwpf.usermodel.TableRow;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.tika.exception.TikaException;
@@ -65,11 +66,17 @@ public class WordExtractor extends Abstr
     protected void parse(
             NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
             throws IOException, SAXException, TikaException {
+        parse(filesystem.getRoot(), xhtml);
+    }
+
+    protected void parse(
+            DirectoryNode root, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
         HWPFDocument document;
         try {
-            document = new HWPFDocument(filesystem.getRoot());
+            document = new HWPFDocument(root);
         } catch(OldWordFileFormatException e) {
-            parseWord6(filesystem, xhtml);
+            parseWord6(root, xhtml);
             return;
         }
         org.apache.poi.hwpf.extractor.WordExtractor wordExtractor =
@@ -115,8 +122,7 @@ public class WordExtractor extends Abstr
         
         // Handle any embeded office documents
         try {
-            DirectoryEntry op =
-                (DirectoryEntry) filesystem.getRoot().getEntry("ObjectPool");
+            DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool");
             for (Entry entry : op) {
                 if (entry.getName().startsWith("_")
                         && entry instanceof DirectoryEntry) {
@@ -418,50 +424,63 @@ public class WordExtractor extends Abstr
     protected void parseWord6(
             NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
             throws IOException, SAXException, TikaException {
-        HWPFOldDocument doc = new HWPFOldDocument(filesystem.getRoot());
+        parseWord6(filesystem.getRoot(), xhtml);
+    }
+
+    protected void parseWord6(
+            DirectoryNode root, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException {
+        HWPFOldDocument doc = new HWPFOldDocument(root);
         Word6Extractor extractor = new Word6Extractor(doc);
         
         for(String p : extractor.getParagraphText()) {
             xhtml.element("p", p);
         }
     }
+
+    private static final Map<String,TagAndStyle> fixedParagraphStyles = new HashMap<String,TagAndStyle>();
+    private static final TagAndStyle defaultParagraphStyle = new TagAndStyle("p", null);
+    static {
+        fixedParagraphStyles.put("Default", defaultParagraphStyle);
+        fixedParagraphStyles.put("Normal", defaultParagraphStyle);
+        fixedParagraphStyles.put("heading", new TagAndStyle("h1", null));
+        fixedParagraphStyles.put("Heading", new TagAndStyle("h1", null));
+        fixedParagraphStyles.put("Title", new TagAndStyle("h1", "title"));
+        fixedParagraphStyles.put("Subtitle", new TagAndStyle("h2", "subtitle"));
+        fixedParagraphStyles.put("HTML Preformatted", new TagAndStyle("pre", null));
+    }
     
     /**
      * Given a style name, return what tag should be used, and
      *  what style should be applied to it. 
      */
     public static TagAndStyle buildParagraphTagAndStyle(String styleName, boolean isTable) {
+       TagAndStyle tagAndStyle = fixedParagraphStyles.get(styleName);
+       if (tagAndStyle != null) {
+           return tagAndStyle;
+       }
+
+       if (styleName.equals("Table Contents") && isTable) {
+           return defaultParagraphStyle;
+       }
+
        String tag = "p";
        String styleClass = null;
-       
-       if(styleName.equals("Default") || styleName.equals("Normal")) {
-          // Already setup
-       } else if(styleName.equals("Table Contents") && isTable) {
-          // Already setup
-       } else if(styleName.equals("heading") || styleName.equals("Heading")) {
-          tag = "h1";
-       } else if(styleName.startsWith("heading") || styleName.startsWith("Heading")) {
-          // "Heading 3" or "Heading2" or "heading 4"
-          int num = 1;
-          try {
-             num = Integer.parseInt( 
-                   styleName.substring(styleName.length()-1)
-             );
-          } catch(NumberFormatException e) {}
-          // Turn it into a H1 - H6 (H7+ isn't valid!)
-          tag = "h" + Math.min(num, 6);
-       } else if(styleName.equals("Title")) {
-          tag = "h1";
-          styleClass = "title";
-       } else if(styleName.equals("Subtitle")) {
-          tag = "h2";
-          styleClass = "subtitle";
-       } else if(styleName.equals("HTML Preformatted")) {
-          tag = "pre";
+
+       if(styleName.startsWith("heading") || styleName.startsWith("Heading")) {
+           // "Heading 3" or "Heading2" or "heading 4"
+           int num = 1;
+           try {
+               num = Integer.parseInt(
+                                      styleName.substring(styleName.length()-1)
+                                      );
+           } catch(NumberFormatException e) {}
+           // Turn it into a H1 - H6 (H7+ isn't valid!)
+           tag = "h" + Math.min(num, 6);
        } else {
-          styleClass = styleName.replace(' ', '_');
-          styleClass = styleClass.substring(0,1).toLowerCase() +
-                         styleClass.substring(1);
+           styleClass = styleName.replace(' ', '_');
+           styleClass = styleClass.substring(0,1).toLowerCase() +
+               styleClass.substring(1);
        }
 
        return new TagAndStyle(tag,styleClass);