You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/09/09 16:11:47 UTC

svn commit: r995438 - in /tika/trunk: tika-core/src/main/java/org/apache/tika/config/ tika-core/src/main/java/org/apache/tika/extractor/ tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ tika-parsers/src/test/java/org/apache/tika/parser/micr...

Author: nick
Date: Thu Sep  9 14:11:46 2010
New Revision: 995438

URL: http://svn.apache.org/viewvc?rev=995438&view=rev
Log:
Support for container extraction of Images in .xls, and OOXML files embedded in OLE2 documents (TIKA-509)
Also rename ContainerEmbededResourceHandler to EmbededResourceHandler as suggested by Jukka, fix ParserContainerExtractor recursion, and remove ContainerExtractor from TikaConfig now that we have ParserContainerExtractor.

Added:
    tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbededResourceHandler.java
      - copied, changed from r995359, tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerEmbededResourceHandler.java
Removed:
    tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerEmbededResourceHandler.java
Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=995438&r1=995437&r2=995438&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Thu Sep  9 14:11:46 2010
@@ -20,10 +20,8 @@ import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
-import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Iterator;
-import java.util.List;
 import java.util.Map;
 
 import javax.imageio.spi.ServiceRegistry;
@@ -32,7 +30,6 @@ import javax.xml.parsers.DocumentBuilder
 import javax.xml.parsers.ParserConfigurationException;
 
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.ContainerExtractor;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MediaTypeRegistry;
 import org.apache.tika.mime.MimeTypeException;
@@ -54,9 +51,6 @@ public class TikaConfig {
     private final Map<MediaType, Parser> parsers =
         new HashMap<MediaType, Parser>();
     
-    private final List<ContainerExtractor> containerExtractors =
-        new ArrayList<ContainerExtractor>();
-
     private final MimeTypes mimeTypes;
 
     public TikaConfig(String file)
@@ -254,10 +248,6 @@ public class TikaConfig {
         return parsers;
     }
     
-    public List<ContainerExtractor> getContainerExtractors() {
-        return containerExtractors;
-    }
-
     public MimeTypes getMimeRepository(){
         return mimeTypes;
     }

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java?rev=995438&r1=995437&r2=995438&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java Thu Sep  9 14:11:46 2010
@@ -45,7 +45,7 @@ public interface ContainerExtractor exte
      * Processes a container file, and extracts all the embeded
      * resources from within it.
      * <p>
-     * The {@link ContainerEmbededResourceHandler} you supply will
+     * The {@link EmbededResourceHandler} you supply will
      * be called for each embeded resource in the container. It is
      * up to you whether you process the contents of the resource or not. 
      * <p>
@@ -66,6 +66,6 @@ public interface ContainerExtractor exte
      */
     void extract(
             TikaInputStream stream, ContainerExtractor recurseExtractor,
-            ContainerEmbededResourceHandler handler)
+            EmbededResourceHandler handler)
             throws IOException, TikaException;
 }

Copied: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbededResourceHandler.java (from r995359, tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerEmbededResourceHandler.java)
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbededResourceHandler.java?p2=tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbededResourceHandler.java&p1=tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerEmbededResourceHandler.java&r1=995359&r2=995438&rev=995438&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerEmbededResourceHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbededResourceHandler.java Thu Sep  9 14:11:46 2010
@@ -25,7 +25,7 @@ import org.apache.tika.mime.MediaType;
  * To work with a {@link ContainerExtractor}, your code needs
  *  to implement this interface.
  */
-public interface ContainerEmbededResourceHandler {
+public interface EmbededResourceHandler {
     /**
      * Called to process an embeded resource within the container.
      * This will be called once per embeded resource within the

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java?rev=995438&r1=995437&r2=995438&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java Thu Sep  9 14:11:46 2010
@@ -71,8 +71,8 @@ public class ParserContainerExtractor im
     }
 
     public void extract(
-            TikaInputStream stream, ContainerExtractor recurseExtractor,
-            final ContainerEmbededResourceHandler handler)
+            TikaInputStream stream, final ContainerExtractor recurseExtractor,
+            final EmbededResourceHandler handler)
             throws IOException, TikaException {
         ParseContext context = new ParseContext();
         context.set(Parser.class, new Parser() {
@@ -82,9 +82,24 @@ public class ParserContainerExtractor im
             public void parse(InputStream stream, ContentHandler ignored,
                     Metadata metadata, ParseContext context)
                     throws IOException, SAXException, TikaException {
+                // Figure out what we have to process
                 String filename = metadata.get(Metadata.RESOURCE_NAME_KEY);
-                MediaType type = detector.detect(stream, metadata);
+                MediaType type;
+                if(metadata.get(Metadata.CONTENT_TYPE) != null) {
+                   type = MediaType.parse( metadata.get(Metadata.CONTENT_TYPE) );
+                } else {
+                   type = detector.detect(stream, metadata);
+                }
+                
+                // Let the handler process the embeded resource 
                 handler.handle(filename, type, stream);
+                
+                // Recurse if requested
+                if(recurseExtractor != null) {
+                   recurseExtractor.extract(
+                         TikaInputStream.get(stream), recurseExtractor, handler
+                   );
+                }
             }
             public void parse(InputStream stream, ContentHandler handler,
                     Metadata metadata) throws IOException, SAXException,

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=995438&r1=995437&r2=995438&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Thu Sep  9 14:11:46 2010
@@ -17,6 +17,7 @@
 package org.apache.tika.parser.microsoft;
 
 import java.io.File;
+import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
@@ -26,9 +27,11 @@ import org.apache.poi.poifs.filesystem.D
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.tika.detect.ZipContainerDetector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
@@ -44,6 +47,44 @@ abstract class AbstractPOIFSExtractor {
     protected AbstractPOIFSExtractor(ParseContext context) {
         this.context = context;
     }
+    
+    /**
+     * Passes a single embedded resource to the {@link Parser} registered
+     * in the parse context, falling back to {@link EmptyParser#INSTANCE}
+     * when none is registered.
+     *
+     * @param resource stream over the embedded resource; always closed
+     *                 before this method returns
+     * @param filename name of the resource, or <code>null</code> if unknown
+     * @param mediaType media type of the resource, or <code>null</code>
+     *                  if unknown
+     * @param xhtml handler receiving the embedded content, wrapped in an
+     *              {@link EmbeddedContentHandler}
+     */
+    protected void handleEmbededResource(TikaInputStream resource,
+          String filename, String mediaType, XHTMLContentHandler xhtml)
+          throws IOException, SAXException, TikaException {
+       try {
+           Metadata metadata = new Metadata();
+           if(filename != null) {
+              metadata.set(Metadata.TIKA_MIME_FILE, filename);
+              // Embedded-resource parsers such as ParserContainerExtractor
+              //  look the name up under RESOURCE_NAME_KEY, so set that too
+              metadata.set(Metadata.RESOURCE_NAME_KEY, filename);
+           }
+           if(mediaType != null) {
+              metadata.set(Metadata.CONTENT_TYPE, mediaType);
+           }
+           
+           Parser parser = context.get(Parser.class, EmptyParser.INSTANCE);
+           parser.parse(
+                   resource, new EmbeddedContentHandler(xhtml),
+                   metadata, context
+           );
+       } finally {
+           resource.close();
+       }
+    }
 
     /**
      * Handle an office document that's embedded at the POIFS level
@@ -51,6 +92,22 @@ abstract class AbstractPOIFSExtractor {
     protected void handleEmbededOfficeDoc(
             DirectoryEntry dir, XHTMLContentHandler xhtml)
             throws IOException, SAXException, TikaException {
+       // Is it an embeded OLE2 document, or an embeded OOXML document?
+       try {
+          Entry ooxml = dir.getEntry("Package");
+          
+          // It's OOXML
+          TikaInputStream ooxmlStream = TikaInputStream.get(
+                new DocumentInputStream((DocumentEntry)ooxml)
+          );
+          ZipContainerDetector detector = new ZipContainerDetector();
+          MediaType type = detector.detect(ooxmlStream, new Metadata());
+          handleEmbededResource(ooxmlStream, null, type.toString(), xhtml);
+          return;
+       } catch(FileNotFoundException e) {
+          // It's regular OLE2
+       }
+       
        // Need to dump the directory out to a new temp file, so
        //  it's stand along
        POIFSFileSystem newFS = new POIFSFileSystem();

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=995438&r1=995437&r2=995438&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Thu Sep  9 14:11:46 2010
@@ -27,15 +27,22 @@ import java.util.Map;
 import java.util.SortedMap;
 import java.util.TreeMap;
 
+import org.apache.poi.ddf.EscherBSERecord;
+import org.apache.poi.ddf.EscherBitmapBlip;
+import org.apache.poi.ddf.EscherBlipRecord;
+import org.apache.poi.ddf.EscherMetafileBlip;
+import org.apache.poi.ddf.EscherRecord;
 import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
 import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
 import org.apache.poi.hssf.eventusermodel.HSSFListener;
 import org.apache.poi.hssf.eventusermodel.HSSFRequest;
+import org.apache.poi.hssf.record.AbstractEscherHolderRecord;
 import org.apache.poi.hssf.record.BOFRecord;
 import org.apache.poi.hssf.record.BoundSheetRecord;
 import org.apache.poi.hssf.record.CellValueRecordInterface;
 import org.apache.poi.hssf.record.CountryRecord;
 import org.apache.poi.hssf.record.DateWindow1904Record;
+import org.apache.poi.hssf.record.DrawingGroupRecord;
 import org.apache.poi.hssf.record.EOFRecord;
 import org.apache.poi.hssf.record.ExtendedFormatRecord;
 import org.apache.poi.hssf.record.FormatRecord;
@@ -50,11 +57,13 @@ import org.apache.poi.hssf.record.SSTRec
 import org.apache.poi.hssf.record.TextObjectRecord;
 import org.apache.poi.hssf.record.chart.SeriesTextRecord;
 import org.apache.poi.hssf.record.common.UnicodeString;
+import org.apache.poi.hssf.usermodel.HSSFPictureData;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
@@ -124,8 +133,8 @@ public class ExcelExtractor extends Abst
      */
     protected void parse(
             POIFSFileSystem filesystem, XHTMLContentHandler xhtml,
-            Locale locale) throws IOException, SAXException {
-        TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale);
+            Locale locale) throws IOException, SAXException, TikaException {
+        TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this);
         listener.processFile(filesystem, isListenForAllRecords());
         listener.throwStoredException();
 
@@ -152,6 +161,11 @@ public class ExcelExtractor extends Abst
          * XHTML content handler to which the document content is rendered.
          */
         private final XHTMLContentHandler handler;
+        
+        /**
+         * The POIFS Extractor, used for embeded resources.
+         */
+        private final AbstractPOIFSExtractor extractor;
 
         /**
          * Potential exception thrown by the content handler. When set to
@@ -159,7 +173,7 @@ public class ExcelExtractor extends Abst
          * ignored and the stored exception to be thrown when
          * {@link #throwStoredException()} is invoked.
          */
-        private SAXException exception = null;
+        private Exception exception = null;
 
         private SSTRecord sstRecord;
         
@@ -201,6 +215,13 @@ public class ExcelExtractor extends Abst
          * @see <a href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a>
          */
         private final NumberFormat format;
+        
+        /**
+         * These aren't complete when we first see them, as they
+         *  depend on continue records that aren't always
+         *  contiguous. Collect them for later processing.
+         */
+        private List<DrawingGroupRecord> drawingGroups = new ArrayList<DrawingGroupRecord>();
 
         /**
          * Construct a new listener instance outputting parsed data to
@@ -208,8 +229,9 @@ public class ExcelExtractor extends Abst
          *
          * @param handler Destination to write the parsed output to
          */
-        private TikaHSSFListener(XHTMLContentHandler handler, Locale locale) {
+        private TikaHSSFListener(XHTMLContentHandler handler, Locale locale, AbstractPOIFSExtractor extractor) {
             this.handler = handler;
+            this.extractor = extractor;
             this.format = NumberFormat.getInstance(locale);
             this.formatListener = new FormatTrackingHSSFListener(this, locale);
         }
@@ -224,7 +246,7 @@ public class ExcelExtractor extends Abst
          * @throws SAXException on any SAX parsing errors.
          */
     	public void processFile(POIFSFileSystem filesystem, boolean listenForAllRecords)
-    		throws IOException,	SAXException {
+    		throws IOException, SAXException, TikaException {
 
     		// Set up listener and register the records we want to process
             HSSFRequest hssfRequest = new HSSFRequest();
@@ -247,6 +269,7 @@ public class ExcelExtractor extends Abst
                 hssfRequest.addListener(formatListener, SeriesTextRecord.sid);
                 hssfRequest.addListener(formatListener, FormatRecord.sid);
                 hssfRequest.addListener(formatListener, ExtendedFormatRecord.sid);
+                hssfRequest.addListener(formatListener, DrawingGroupRecord.sid);
             }
 
             // Create event factory and process Workbook (fire events)
@@ -256,6 +279,13 @@ public class ExcelExtractor extends Abst
             
             // Output any extra text that came after all the sheets
             processExtraText(); 
+            
+            // Look for embeded images, now that the drawing records
+            //  have been fully matched with their continue data
+            for(DrawingGroupRecord dgr : drawingGroups) {
+               dgr.decode();
+               findPictures(dgr.getEscherRecords());
+            }
     	}
 
         /**
@@ -267,19 +297,29 @@ public class ExcelExtractor extends Abst
             if (exception == null) {
                 try {
                     internalProcessRecord(record);
-                } catch (SAXException e) {
-                    exception = e;
+                } catch (TikaException te) {
+                   exception = te;
+                } catch (IOException ie) {
+                    exception = ie;
+                } catch (SAXException se) {
+                    exception = se;
                 }
             }
         }
 
-        public void throwStoredException() throws SAXException {
+        public void throwStoredException() throws TikaException, SAXException, IOException {
             if (exception != null) {
-                throw exception;
+                if(exception instanceof IOException)
+                   throw (IOException)exception;
+                if(exception instanceof SAXException)
+                   throw (SAXException)exception;
+                if(exception instanceof TikaException)
+                   throw (TikaException)exception;
+                throw new TikaException(exception.getMessage());
             }
         }
 
-        private void internalProcessRecord(Record record) throws SAXException {
+        private void internalProcessRecord(Record record) throws SAXException, TikaException, IOException {
             switch (record.getSid()) {
             case BOFRecord.sid: // start of workbook, worksheet etc. records
                 BOFRecord bof = (BOFRecord) record;
@@ -366,6 +406,13 @@ public class ExcelExtractor extends Abst
                 SeriesTextRecord str = (SeriesTextRecord) record;
                 addTextCell(record, str.getText());
                 break;
+                
+            case DrawingGroupRecord.sid:
+               // Collect this now, we'll process later when all
+               //  the continue records are in
+               drawingGroups.add( (DrawingGroupRecord)record );
+               break;
+           
             }
             
             previousSid = record.getSid();
@@ -478,6 +525,55 @@ public class ExcelExtractor extends Abst
             handler.endElement("div");
         }
 
+        private void findPictures(List<EscherRecord> records) throws IOException, SAXException, TikaException {
+           for(EscherRecord escherRecord : records) {
+              if (escherRecord instanceof EscherBSERecord) {
+                 EscherBlipRecord blip = ((EscherBSERecord) escherRecord).getBlipRecord();
+                 if (blip != null) {
+                    // TODO When we have upgraded POI, we can use this code instead
+                    //HSSFPictureData picture = new HSSFPictureData(blip);
+                    //String mimeType = picture.getMimeType();
+                    //TikaInputStream stream = TikaInputStream.get(picture.getData());
+                    
+                    // This code is cut'n'paste from a newer version of POI
+                    String mimeType = "";
+                    switch (blip.getRecordId()) {
+                    case EscherMetafileBlip.RECORD_ID_WMF:
+                       mimeType =  "application/x-wmf";
+                       break;
+                    case EscherMetafileBlip.RECORD_ID_EMF:
+                       mimeType =  "application/x-emf";
+                       break;
+                    case EscherMetafileBlip.RECORD_ID_PICT:
+                       mimeType =  "image/x-pict";
+                       break;
+                    case EscherBitmapBlip.RECORD_ID_PNG:
+                       mimeType =  "image/png";
+                       break;
+                    case EscherBitmapBlip.RECORD_ID_JPEG:
+                       mimeType =  "image/jpeg";
+                       break;
+                    case EscherBitmapBlip.RECORD_ID_DIB:
+                       mimeType =  "image/bmp";
+                       break;
+                    default:
+                       mimeType =  "image/unknown";
+                       break;
+                    }
+                    TikaInputStream stream = TikaInputStream.get(blip.getPicturedata());
+                    
+                    // Handle the embeded resource
+                    extractor.handleEmbededResource(
+                          stream, null, mimeType,
+                          handler
+                    );
+                 }
+              }
+
+              // Recursive call.
+              findPictures(escherRecord.getChildRecords());
+           }
+        }
     }
 
     /**

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java?rev=995438&r1=995437&r2=995438&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java Thu Sep  9 14:11:46 2010
@@ -22,8 +22,8 @@ import java.util.List;
 
 import junit.framework.TestCase;
 
-import org.apache.tika.extractor.ContainerEmbededResourceHandler;
 import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.EmbededResourceHandler;
 import org.apache.tika.extractor.ParserContainerExtractor;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.mime.MediaType;
@@ -36,6 +36,13 @@ public class POIContainerExtractionTest 
     private static final MediaType TYPE_DOC = MediaType.application("msword");
     private static final MediaType TYPE_PPT = MediaType.application("vnd.ms-powerpoint");
     private static final MediaType TYPE_XLS = MediaType.application("vnd.ms-excel");
+    private static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
+    private static final MediaType TYPE_PPTX = MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
+    private static final MediaType TYPE_XLSX = MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+    
+    private static final MediaType TYPE_JPG = MediaType.image("jpg");
+    private static final MediaType TYPE_PNG = MediaType.image("png");
+    private static final MediaType TYPE_EMF = MediaType.application("x-emf");
    
     /**
      * For office files which don't have anything embeded in them
@@ -72,9 +79,11 @@ public class POIContainerExtractionTest 
        
        // Excel with 1 image
        handler = process("testEXCEL_1img.xls", extractor, false);
-       // TODO
-       assertEquals(0, handler.filenames.size());
-       assertEquals(0, handler.mediaTypes.size());
+       assertEquals(1, handler.filenames.size());
+       assertEquals(1, handler.mediaTypes.size());
+       
+       assertEquals(null, handler.filenames.get(0));
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
        
        // PowerPoint with 2 images + sound
        // TODO
@@ -103,20 +112,27 @@ public class POIContainerExtractionTest 
        ContainerExtractor extractor = new ParserContainerExtractor();
        TrackingHandler handler;
        
+       
        // Excel with a word doc and a powerpoint doc, both of which have images in them
-       // Without recursion, should see both
+       // Without recursion, should see both documents + the images
        handler = process("testEXCEL_embeded.xls", extractor, false);
-       assertEquals(2, handler.filenames.size());
-       assertEquals(2, handler.mediaTypes.size());
+       assertEquals(5, handler.filenames.size());
+       assertEquals(5, handler.mediaTypes.size());
        
        // We don't know their filenames
        assertEquals(null, handler.filenames.get(0));
        assertEquals(null, handler.filenames.get(1));
+       assertEquals(null, handler.filenames.get(2));
+       assertEquals(null, handler.filenames.get(3));
+       assertEquals(null, handler.filenames.get(4));
        // But we do know their types
-       assertEquals(TYPE_PPT, handler.mediaTypes.get(0));
-       assertEquals(TYPE_DOC, handler.mediaTypes.get(1));
+       assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embeded office doc
+       assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embeded office doc
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embeded image
+       assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embeded office doc
+       assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embeded office doc
        
-       // With recursion, should get their images too
+       // With recursion, should get the images embeded in the office files too
        handler = process("testEXCEL_embeded.xls", extractor, true);
        // TODO
        
@@ -131,14 +147,27 @@ public class POIContainerExtractionTest 
        assertEquals(null, handler.filenames.get(1));
        assertEquals(null, handler.filenames.get(2));
        // But we do know their types
-       assertEquals(MediaType.application("x-tika-msoffice"), handler.mediaTypes.get(0)); // TODO
+       assertEquals(TYPE_DOCX, handler.mediaTypes.get(0));
        assertEquals(TYPE_PPT, handler.mediaTypes.get(1));
        assertEquals(TYPE_XLS, handler.mediaTypes.get(2));
        
+       
        // With recursion, should get their images too
        handler = process("testWORD_embeded.doc", extractor, true);
-       // TODO
+       // TODO - Not all resources of embeded files are currently extracted 
+       assertEquals(4, handler.filenames.size());
+       assertEquals(4, handler.mediaTypes.size());
        
+       // We don't know their filenames
+       assertEquals(null, handler.filenames.get(0));
+       assertEquals(null, handler.filenames.get(1));
+       assertEquals(null, handler.filenames.get(2));
+       assertEquals(null, handler.filenames.get(3));
+       // But we do know their types
+       assertEquals(TYPE_DOCX, handler.mediaTypes.get(0));
+       assertEquals(TYPE_PPT, handler.mediaTypes.get(1));
+       assertEquals(TYPE_XLS, handler.mediaTypes.get(2));
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // From xls
        
        // PowerPoint with excel and word
        // TODO
@@ -161,13 +190,17 @@ public class POIContainerExtractionTest 
         
         // Process it
         TrackingHandler handler = new TrackingHandler();
-        extractor.extract(stream, null, handler);
+        if(recurse) {
+           extractor.extract(stream, extractor, handler);
+        } else {
+           extractor.extract(stream, null, handler);
+        }
         
         // So they can check what happened
         return handler;
     }
     
-    private static class TrackingHandler implements ContainerEmbededResourceHandler {
+    private static class TrackingHandler implements EmbededResourceHandler {
        private List<String> filenames = new ArrayList<String>();
        private List<MediaType> mediaTypes = new ArrayList<MediaType>();