You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2012/03/23 13:05:22 UTC
svn commit: r1304294 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
test/resources/test-documents/pictures.ppt
Author: maxcom
Date: Fri Mar 23 12:05:21 2012
New Revision: 1304294
URL: http://svn.apache.org/viewvc?rev=1304294&view=rev
Log:
TIKA-883 - Extract embedded images in PPT
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/pictures.ppt (with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java?rev=1304294&r1=1304293&r2=1304294&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java Fri Mar 23 12:05:21 2012
@@ -16,18 +16,10 @@
*/
package org.apache.tika.parser.microsoft;
-import java.io.IOException;
-import java.util.HashSet;
-
import org.apache.poi.hslf.HSLFSlideShow;
-import org.apache.poi.hslf.model.Comment;
-import org.apache.poi.hslf.model.HeadersFooters;
-import org.apache.poi.hslf.model.Notes;
-import org.apache.poi.hslf.model.OLEShape;
-import org.apache.poi.hslf.model.Shape;
-import org.apache.poi.hslf.model.Slide;
-import org.apache.poi.hslf.model.TextRun;
+import org.apache.poi.hslf.model.*;
import org.apache.poi.hslf.usermodel.ObjectData;
+import org.apache.poi.hslf.usermodel.PictureData;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
@@ -37,6 +29,9 @@ import org.apache.tika.parser.ParseConte
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
+import java.io.IOException;
+import java.util.HashSet;
+
public class HSLFExtractor extends AbstractPOIFSExtractor {
public HSLFExtractor(ParseContext context) {
super(context);
@@ -164,6 +159,8 @@ public class HSLFExtractor extends Abstr
}
}
+ handleSlideEmbeddedPictures(_show, xhtml);
+
xhtml.endElement("div");
}
@@ -181,7 +178,36 @@ public class HSLFExtractor extends Abstr
}
}
- private void handleSlideEmbeddedResources(Slide slide, XHTMLContentHandler xhtml)
+ private void handleSlideEmbeddedPictures(SlideShow slideshow, XHTMLContentHandler xhtml)
+ throws TikaException, SAXException, IOException {
+ for (PictureData pic : slideshow.getPictureData()) {
+ String mediaType = null;
+
+ switch (pic.getType()) {
+ case Picture.EMF:
+ mediaType = "application/x-emf";
+ break;
+ case Picture.JPEG:
+ mediaType = "image/jpeg";
+ break;
+ case Picture.PNG:
+ mediaType = "image/png";
+ break;
+ case Picture.WMF:
+ mediaType = "application/x-msmetafile";
+ break;
+ case Picture.DIB:
+ mediaType = "image/bmp";
+ break;
+ }
+
+ handleEmbeddedResource(
+ TikaInputStream.get(pic.getData()), null,
+ mediaType, xhtml, false);
+ }
+ }
+
+ private void handleSlideEmbeddedResources(Slide slide, XHTMLContentHandler xhtml)
throws TikaException, SAXException, IOException {
Shape[] shapes;
try {
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java?rev=1304294&r1=1304293&r2=1304294&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java Fri Mar 23 12:05:21 2012
@@ -134,8 +134,8 @@ public class POIContainerExtractionTest
// With recursion, should get the images embedded in the office files too
handler = process("testEXCEL_embeded.xls", extractor, true);
- assertEquals(12, handler.filenames.size());
- assertEquals(12, handler.mediaTypes.size());
+ assertEquals(17, handler.filenames.size());
+ assertEquals(17, handler.mediaTypes.size());
assertEquals(null, handler.filenames.get(0));
assertEquals(null, handler.filenames.get(1));
@@ -147,7 +147,7 @@ public class POIContainerExtractionTest
assertEquals("image1.png", handler.filenames.get(7));
assertEquals("image2.jpg", handler.filenames.get(8));
assertEquals("image3.png", handler.filenames.get(9));
- assertEquals("image1.png", handler.filenames.get(11));
+ assertEquals("image1.png", handler.filenames.get(16));
assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embedded office doc
assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embedded office doc
@@ -159,8 +159,8 @@ public class POIContainerExtractionTest
assertEquals(TYPE_PNG, handler.mediaTypes.get(7)); // Embedded image
assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // Embedded image
assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // Embedded image
- assertEquals(TYPE_DOC, handler.mediaTypes.get(10)); // Embedded office doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(11)); // Embedded image
+ assertEquals(TYPE_DOC, handler.mediaTypes.get(15)); // Embedded office doc
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(16)); // Embedded image
// Word with .docx, powerpoint and excel
handler = process("testWORD_embeded.doc", extractor, false);
@@ -193,8 +193,8 @@ public class POIContainerExtractionTest
// With recursion, should get their images too
handler = process("testWORD_embeded.doc", extractor, true);
- assertEquals(13, handler.filenames.size());
- assertEquals(13, handler.mediaTypes.size());
+ assertEquals(16, handler.filenames.size());
+ assertEquals(16, handler.mediaTypes.size());
// We don't know their filenames, except for doc images + docx
assertEquals("image1.emf", handler.filenames.get(0));
@@ -207,7 +207,7 @@ public class POIContainerExtractionTest
assertEquals("image2.png", handler.filenames.get(7));
assertEquals("image3.jpeg", handler.filenames.get(8));
assertEquals("image4.png", handler.filenames.get(9));
- for(int i=12; i<handler.filenames.size(); i++) {
+ for(int i=11; i<14; i++) {
assertNull(handler.filenames.get(i));
}
// But we do know their types
@@ -222,8 +222,8 @@ public class POIContainerExtractionTest
assertEquals(TYPE_JPG, handler.mediaTypes.get(8)); // JPG inside .docx
assertEquals(TYPE_PNG, handler.mediaTypes.get(9)); // PNG inside .docx
assertEquals(TYPE_PPT, handler.mediaTypes.get(10)); // Embedded office doc
- assertEquals(TYPE_XLS, handler.mediaTypes.get(11)); // Embedded office doc
- assertEquals(TYPE_PNG, handler.mediaTypes.get(12)); // PNG inside .xls
+ assertEquals(TYPE_XLS, handler.mediaTypes.get(14)); // Embedded office doc
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(15)); // PNG inside .xls
// PowerPoint with excel and word
@@ -262,4 +262,13 @@ public class POIContainerExtractionTest
assertTrue(handler.filenames.contains("Microsoft_Office_Excel_97-2003_Worksheet1.bin"));
assertEquals(2, handler.filenames.size());
}
+
+ public void testPowerpointImages() throws Exception {
+ ContainerExtractor extractor = new ParserContainerExtractor();
+ TrackingHandler handler;
+
+ handler = process("pictures.ppt", extractor, false);
+ assertTrue(handler.mediaTypes.contains(new MediaType("image", "jpeg")));
+ assertTrue(handler.mediaTypes.contains(new MediaType("image", "png")));
+ }
}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/pictures.ppt
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/pictures.ppt?rev=1304294&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/pictures.ppt
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream