You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2012/03/23 13:05:22 UTC

svn commit: r1304294 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java test/resources/test-documents/pictures.ppt

Author: maxcom
Date: Fri Mar 23 12:05:21 2012
New Revision: 1304294

URL: http://svn.apache.org/viewvc?rev=1304294&view=rev
Log:
TIKA-883 - Extract embedded images in PPT

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/pictures.ppt   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1304294&r1=1304293&r2=1304294&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Fri Mar 23 12:05:21 2012
@@ -16,18 +16,10 @@
  */
 package org.apache.tika.parser.microsoft;
 
-import java.io.IOException;
-import java.util.HashSet;
-
 import org.apache.poi.hslf.HSLFSlideShow;
-import org.apache.poi.hslf.model.Comment;
-import org.apache.poi.hslf.model.HeadersFooters;
-import org.apache.poi.hslf.model.Notes;
-import org.apache.poi.hslf.model.OLEShape;
-import org.apache.poi.hslf.model.Shape;
-import org.apache.poi.hslf.model.Slide;
-import org.apache.poi.hslf.model.TextRun;
+import org.apache.poi.hslf.model.*;
 import org.apache.poi.hslf.usermodel.ObjectData;
+import org.apache.poi.hslf.usermodel.PictureData;
 import org.apache.poi.hslf.usermodel.SlideShow;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
@@ -37,6 +29,9 @@ import org.apache.tika.parser.ParseConte
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.xml.sax.SAXException;
 
+import java.io.IOException;
+import java.util.HashSet;
+
 public class HSLFExtractor extends AbstractPOIFSExtractor {
    public HSLFExtractor(ParseContext context) {
       super(context);
@@ -164,6 +159,8 @@ public class HSLFExtractor extends Abstr
          }
       }
 
+      handleSlideEmbeddedPictures(_show, xhtml);
+
       xhtml.endElement("div");
    }
 
@@ -181,7 +178,36 @@ public class HSLFExtractor extends Abstr
       }
    }
 
-   private void handleSlideEmbeddedResources(Slide slide, XHTMLContentHandler xhtml) 
+    private void handleSlideEmbeddedPictures(SlideShow slideshow, XHTMLContentHandler xhtml)
+            throws TikaException, SAXException, IOException {
+        for (PictureData pic : slideshow.getPictureData()) {
+            String mediaType = null;
+
+            switch (pic.getType()) {
+                case Picture.EMF:
+                    mediaType = "application/x-emf";
+                    break;
+                case Picture.JPEG:
+                    mediaType = "image/jpeg";
+                    break;
+                case Picture.PNG:
+                    mediaType = "image/png";
+                    break;
+                case Picture.WMF:
+                    mediaType = "application/x-msmetafile";
+                    break;
+                case Picture.DIB:
+                    mediaType = "image/bmp";
+                    break;
+            }
+
+            handleEmbeddedResource(
+                  TikaInputStream.get(pic.getData()), null,
+                  mediaType, xhtml, false);
+        }
+    }
+
+    private void handleSlideEmbeddedResources(Slide slide, XHTMLContentHandler xhtml)
                 throws TikaException, SAXException, IOException {
       Shape[] shapes;
       try {

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java?rev=1304294&r1=1304293&r2=1304294&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java Fri Mar 23 12:05:21 2012
@@ -134,8 +134,8 @@ public class POIContainerExtractionTest 
        
        // With recursion, should get the images embedded in the office files too
        handler = process("testEXCEL_embeded.xls", extractor, true);
-       assertEquals(12, handler.filenames.size());
-       assertEquals(12, handler.mediaTypes.size());
+       assertEquals(17, handler.filenames.size());
+       assertEquals(17, handler.mediaTypes.size());
        
        assertEquals(null, handler.filenames.get(0));
        assertEquals(null, handler.filenames.get(1));
@@ -147,7 +147,7 @@ public class POIContainerExtractionTest 
        assertEquals("image1.png", handler.filenames.get(7));
        assertEquals("image2.jpg", handler.filenames.get(8));
        assertEquals("image3.png", handler.filenames.get(9));
-       assertEquals("image1.png", handler.filenames.get(11));
+       assertEquals("image1.png", handler.filenames.get(16));
 
        assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
        assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
@@ -159,8 +159,8 @@ public class POIContainerExtractionTest 
        assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // Embedded image
        assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // Embedded image
        assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
-       assertEquals(TYPE_DOC, handler.mediaTypes.get(10)); // Embedded office doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(11)); // Embedded image
+       assertEquals(TYPE_DOC, handler.mediaTypes.get(15)); // Embedded office doc
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(16)); // Embedded image
 
        // Word with .docx, powerpoint and excel
        handler = process("testWORD_embeded.doc", extractor, false);
@@ -193,8 +193,8 @@ public class POIContainerExtractionTest 
        
        // With recursion, should get their images too
        handler = process("testWORD_embeded.doc", extractor, true);
-       assertEquals(13, handler.filenames.size());
-       assertEquals(13, handler.mediaTypes.size());
+       assertEquals(16, handler.filenames.size());
+       assertEquals(16, handler.mediaTypes.size());
        
        // We don't know their filenames, except for doc images + docx
        assertEquals("image1.emf", handler.filenames.get(0));
@@ -207,7 +207,7 @@ public class POIContainerExtractionTest 
        assertEquals("image2.png", handler.filenames.get(7));
        assertEquals("image3.jpeg", handler.filenames.get(8));
        assertEquals("image4.png", handler.filenames.get(9));
-       for(int i=12; i<handler.filenames.size(); i++) {
+       for(int i=11; i<14; i++) {
           assertNull(handler.filenames.get(i));
        }
        // But we do know their types
@@ -222,8 +222,8 @@ public class POIContainerExtractionTest 
        assertEquals(TYPE_JPG, handler.mediaTypes.get(8));  //    JPG inside .docx
        assertEquals(TYPE_PNG, handler.mediaTypes.get(9));  //    PNG inside .docx
        assertEquals(TYPE_PPT, handler.mediaTypes.get(10)); // Embedded office doc
-       assertEquals(TYPE_XLS, handler.mediaTypes.get(11)); // Embedded office doc
-       assertEquals(TYPE_PNG, handler.mediaTypes.get(12)); //    PNG inside .xls
+       assertEquals(TYPE_XLS, handler.mediaTypes.get(14)); // Embedded office doc
+       assertEquals(TYPE_PNG, handler.mediaTypes.get(15)); //    PNG inside .xls
        
        
        // PowerPoint with excel and word
@@ -262,4 +262,13 @@ public class POIContainerExtractionTest 
         assertTrue(handler.filenames.contains("Microsoft_Office_Excel_97-2003_Worksheet1.bin"));
         assertEquals(2, handler.filenames.size());
     }
+
+    public void testPowerpointImages() throws Exception {
+        ContainerExtractor extractor = new ParserContainerExtractor();
+        TrackingHandler handler;
+
+        handler = process("pictures.ppt", extractor, false);
+        assertTrue(handler.mediaTypes.contains(new MediaType("image", "jpeg")));
+        assertTrue(handler.mediaTypes.contains(new MediaType("image", "png")));
+    }
 }

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/pictures.ppt
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/pictures.ppt?rev=1304294&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/pictures.ppt
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream