You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/09/09 17:29:38 UTC

svn commit: r995462 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/WordExtractor.java test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java

Author: nick
Date: Thu Sep  9 15:29:37 2010
New Revision: 995462

URL: http://svn.apache.org/viewvc?rev=995462&view=rev
Log:
Add support for extracting images embedded in Word .doc files - TIKA-509

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=995462&r1=995461&r2=995462&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Thu Sep  9 15:29:37 2010
@@ -18,11 +18,16 @@ package org.apache.tika.parser.microsoft
 
 import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.util.List;
 
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.PicturesTable;
+import org.apache.poi.hwpf.usermodel.Picture;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
@@ -36,8 +41,9 @@ public class WordExtractor extends Abstr
     protected void parse(
             POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
             throws IOException, SAXException, TikaException {
+        HWPFDocument document = new HWPFDocument(filesystem);
         org.apache.poi.hwpf.extractor.WordExtractor wordExtractor =
-            new org.apache.poi.hwpf.extractor.WordExtractor(filesystem);
+            new org.apache.poi.hwpf.extractor.WordExtractor(document);
 
         addTextIfAny(xhtml, "header", wordExtractor.getHeaderText());
 
@@ -59,6 +65,45 @@ public class WordExtractor extends Abstr
 
         addTextIfAny(xhtml, "footer", wordExtractor.getFooterText());
 
+        // Handle any embeded images
+        PicturesTable pictureTable = document.getPicturesTable();
+        if(pictureTable != null) {
+           List<Picture> pictures = (List<Picture>)pictureTable.getAllPictures(); // TODO Generics fixed in newer version
+           for(Picture picture : pictures) {
+              // TODO When we have upgraded POI, we can use this code instead
+              //String mimeType = picture.getMimeType();
+              
+              // This code is cut'n'paste from a newer version of POI
+              String mimeType = "image/unknown";
+              String extension = picture.suggestFileExtension();
+              if("jpg".equals(extension)) {
+                 mimeType =  "image/jpeg";
+              }
+              if("png".equals(extension)) {
+                 mimeType =  "image/png";
+              }
+              if("gif".equals(extension)) {
+                 mimeType =  "image/gif";
+              }
+              if("bmp".equals(extension)) {
+                 mimeType =  "image/bmp";
+              }
+              if("tiff".equals(extension)) {
+                 mimeType =  "image/tiff";
+              }
+              if("wmf".equals(extension)) {
+                 mimeType =  "application/x-wmf";
+              }
+              if("emf".equals(extension)) {
+                 mimeType =  "application/x-emf";
+              }
+              
+              TikaInputStream stream = TikaInputStream.get(picture.getContent());
+              handleEmbededResource(stream, null, mimeType, xhtml);
+           }
+        }
+        
+        // Handle any embeded office documents
         try {
             DirectoryEntry op =
                 (DirectoryEntry) filesystem.getRoot().getEntry("ObjectPool");

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java?rev=995462&r1=995461&r2=995462&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java Thu Sep  9 15:29:37 2010
@@ -40,7 +40,7 @@ public class POIContainerExtractionTest 
     private static final MediaType TYPE_PPTX = MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
     private static final MediaType TYPE_XLSX = MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
     
-    private static final MediaType TYPE_JPG = MediaType.image("jpg");
+    private static final MediaType TYPE_JPG = MediaType.image("jpeg");
     private static final MediaType TYPE_PNG = MediaType.image("png");
     private static final MediaType TYPE_EMF = MediaType.application("x-emf");
    
@@ -84,15 +84,31 @@ public class POIContainerExtractionTest 
        
        assertEquals(null, handler.filenames.get(0));
        assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+
        
        // PowerPoint with 2 images + sound
        // TODO
        
        // Word with 1 image
-       // TODO
+       handler = process("testWORD_1img.doc", extractor, false);
+       assertEquals(1, handler.filenames.size());
+       assertEquals(1, handler.mediaTypes.size());
+       
+       assertEquals(null, handler.filenames.get(0));
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+
        
        // Word with 3 images
-       // TODO
+       handler = process("testWORD_3imgs.doc", extractor, false);
+       assertEquals(3, handler.filenames.size());
+       assertEquals(3, handler.mediaTypes.size());
+       
+       assertEquals(null, handler.filenames.get(0));
+       assertEquals(null, handler.filenames.get(1));
+       assertEquals(null, handler.filenames.get(2));
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+       assertEquals(TYPE_JPG, handler.mediaTypes.get(1));
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
     }
     
     /**
@@ -132,42 +148,65 @@ public class POIContainerExtractionTest 
        assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embeded office doc
        assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embeded office doc
        
+       
        // With recursion, should get the images embeded in the office files too
        handler = process("testEXCEL_embeded.xls", extractor, true);
-       // TODO
+       assertEquals(6, handler.filenames.size());
+       assertEquals(6, handler.mediaTypes.size());
+       
+       assertEquals(null, handler.filenames.get(0));
+       assertEquals(null, handler.filenames.get(1));
+       assertEquals(null, handler.filenames.get(2));
+       assertEquals(null, handler.filenames.get(3));
+       assertEquals(null, handler.filenames.get(4));
+       assertEquals(null, handler.filenames.get(5));
+       
+       assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embeded office doc
+       assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embeded office doc
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embeded image
+       assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embeded office doc
+       assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embeded office doc
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); //   PNG inside .doc
        
        
        // Word with .docx, powerpoint and excel
        handler = process("testWORD_embeded.doc", extractor, false);
-       assertEquals(3, handler.filenames.size());
-       assertEquals(3, handler.mediaTypes.size());
+       assertEquals(8, handler.filenames.size());
+       assertEquals(8, handler.mediaTypes.size());
        
        // We don't know their filenames
-       assertEquals(null, handler.filenames.get(0));
-       assertEquals(null, handler.filenames.get(1));
-       assertEquals(null, handler.filenames.get(2));
+       for(String filename : handler.filenames)
+          assertEquals(null, filename);
        // But we do know their types
-       assertEquals(TYPE_DOCX, handler.mediaTypes.get(0));
-       assertEquals(TYPE_PPT, handler.mediaTypes.get(1));
-       assertEquals(TYPE_XLS, handler.mediaTypes.get(2));
+       assertEquals(MediaType.parse("image/unknown"), handler.mediaTypes.get(0)); // Icon of embeded office doc?
+       assertEquals(MediaType.parse("image/unknown"), handler.mediaTypes.get(1)); // Icon of embeded office doc?
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embeded image
+       assertEquals(TYPE_JPG, handler.mediaTypes.get(3)); // Embeded image
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(4)); // Embeded image
+       assertEquals(TYPE_DOCX, handler.mediaTypes.get(5)); // Embeded office doc
+       assertEquals(TYPE_PPT, handler.mediaTypes.get(6)); // Embeded office doc
+       assertEquals(TYPE_XLS, handler.mediaTypes.get(7)); // Embeded office doc
        
        
        // With recursion, should get their images too
        handler = process("testWORD_embeded.doc", extractor, true);
        // TODO - Not all resources of embeded files are currently extracted 
-       assertEquals(4, handler.filenames.size());
-       assertEquals(4, handler.mediaTypes.size());
+       assertEquals(9, handler.filenames.size());
+       assertEquals(9, handler.mediaTypes.size());
        
        // We don't know their filenames
-       assertEquals(null, handler.filenames.get(0));
-       assertEquals(null, handler.filenames.get(1));
-       assertEquals(null, handler.filenames.get(2));
-       assertEquals(null, handler.filenames.get(3));
+       for(String filename : handler.filenames)
+          assertEquals(null, filename);
        // But we do know their types
-       assertEquals(TYPE_DOCX, handler.mediaTypes.get(0));
-       assertEquals(TYPE_PPT, handler.mediaTypes.get(1));
-       assertEquals(TYPE_XLS, handler.mediaTypes.get(2));
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // From xls
+       assertEquals(MediaType.parse("image/unknown"), handler.mediaTypes.get(0)); // Icon of embeded office doc?
+       assertEquals(MediaType.parse("image/unknown"), handler.mediaTypes.get(1)); // Icon of embeded office doc?
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embeded image
+       assertEquals(TYPE_JPG, handler.mediaTypes.get(3)); // Embeded image
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(4)); // Embeded image
+       assertEquals(TYPE_DOCX, handler.mediaTypes.get(5)); // Embeded office doc
+       assertEquals(TYPE_PPT, handler.mediaTypes.get(6)); // Embeded office doc
+       assertEquals(TYPE_XLS, handler.mediaTypes.get(7)); // Embeded office doc
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(8)); //    PNG inside .xls
        
        // PowerPoint with excel and word
        // TODO
@@ -182,8 +221,9 @@ public class POIContainerExtractionTest 
     }
     
     private TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception {
-       InputStream input = POIContainerExtractionTest.class.getResourceAsStream(
+        InputStream input = POIContainerExtractionTest.class.getResourceAsStream(
              "/test-documents/" + filename);
+        assertNotNull(filename + " not found", input);
         TikaInputStream stream = TikaInputStream.get(input);
         
         assertEquals(true, extractor.isSupported(stream));