You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/09/09 17:29:38 UTC
svn commit: r995462 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/microsoft/WordExtractor.java
test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
Author: nick
Date: Thu Sep 9 15:29:37 2010
New Revision: 995462
URL: http://svn.apache.org/viewvc?rev=995462&view=rev
Log:
Add support for extracting images embedded in Word .doc files - TIKA-509
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=995462&r1=995461&r2=995462&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Thu Sep 9 15:29:37 2010
@@ -18,11 +18,16 @@ package org.apache.tika.parser.microsoft
import java.io.FileNotFoundException;
import java.io.IOException;
+import java.util.List;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.model.PicturesTable;
+import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
@@ -36,8 +41,9 @@ public class WordExtractor extends Abstr
protected void parse(
POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
+ HWPFDocument document = new HWPFDocument(filesystem);
org.apache.poi.hwpf.extractor.WordExtractor wordExtractor =
- new org.apache.poi.hwpf.extractor.WordExtractor(filesystem);
+ new org.apache.poi.hwpf.extractor.WordExtractor(document);
addTextIfAny(xhtml, "header", wordExtractor.getHeaderText());
@@ -59,6 +65,45 @@ public class WordExtractor extends Abstr
addTextIfAny(xhtml, "footer", wordExtractor.getFooterText());
+ // Handle any embedded images
+ PicturesTable pictureTable = document.getPicturesTable();
+ if(pictureTable != null) {
+ List<Picture> pictures = (List<Picture>)pictureTable.getAllPictures(); // TODO Generics fixed in newer version
+ for(Picture picture : pictures) {
+ // TODO When we have upgraded POI, we can use this code instead
+ //String mimeType = picture.getMimeType();
+
+ // This code is cut'n'paste from a newer version of POI
+ String mimeType = "image/unknown";
+ String extension = picture.suggestFileExtension();
+ if("jpg".equals(extension)) {
+ mimeType = "image/jpeg";
+ }
+ if("png".equals(extension)) {
+ mimeType = "image/png";
+ }
+ if("gif".equals(extension)) {
+ mimeType = "image/gif";
+ }
+ if("bmp".equals(extension)) {
+ mimeType = "image/bmp";
+ }
+ if("tiff".equals(extension)) {
+ mimeType = "image/tiff";
+ }
+ if("wmf".equals(extension)) {
+ mimeType = "application/x-wmf";
+ }
+ if("emf".equals(extension)) {
+ mimeType = "application/x-emf";
+ }
+
+ TikaInputStream stream = TikaInputStream.get(picture.getContent());
+ handleEmbededResource(stream, null, mimeType, xhtml);
+ }
+ }
+
+ // Handle any embedded office documents
try {
DirectoryEntry op =
(DirectoryEntry) filesystem.getRoot().getEntry("ObjectPool");
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java?rev=995462&r1=995461&r2=995462&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java Thu Sep 9 15:29:37 2010
@@ -40,7 +40,7 @@ public class POIContainerExtractionTest
private static final MediaType TYPE_PPTX = MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
private static final MediaType TYPE_XLSX = MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
- private static final MediaType TYPE_JPG = MediaType.image("jpg");
+ private static final MediaType TYPE_JPG = MediaType.image("jpeg");
private static final MediaType TYPE_PNG = MediaType.image("png");
private static final MediaType TYPE_EMF = MediaType.application("x-emf");
@@ -84,15 +84,31 @@ public class POIContainerExtractionTest
assertEquals(null, handler.filenames.get(0));
assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+
// PowerPoint with 2 images + sound
// TODO
// Word with 1 image
- // TODO
+ handler = process("testWORD_1img.doc", extractor, false);
+ assertEquals(1, handler.filenames.size());
+ assertEquals(1, handler.mediaTypes.size());
+
+ assertEquals(null, handler.filenames.get(0));
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+
// Word with 3 images
- // TODO
+ handler = process("testWORD_3imgs.doc", extractor, false);
+ assertEquals(3, handler.filenames.size());
+ assertEquals(3, handler.mediaTypes.size());
+
+ assertEquals(null, handler.filenames.get(0));
+ assertEquals(null, handler.filenames.get(1));
+ assertEquals(null, handler.filenames.get(2));
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
+ assertEquals(TYPE_JPG, handler.mediaTypes.get(1));
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
}
/**
@@ -132,42 +148,65 @@ public class POIContainerExtractionTest
assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embeded office doc
assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embeded office doc
+
// With recursion, should get the images embeded in the office files too
handler = process("testEXCEL_embeded.xls", extractor, true);
- // TODO
+ assertEquals(6, handler.filenames.size());
+ assertEquals(6, handler.mediaTypes.size());
+
+ assertEquals(null, handler.filenames.get(0));
+ assertEquals(null, handler.filenames.get(1));
+ assertEquals(null, handler.filenames.get(2));
+ assertEquals(null, handler.filenames.get(3));
+ assertEquals(null, handler.filenames.get(4));
+ assertEquals(null, handler.filenames.get(5));
+
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embeded office doc
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embeded office doc
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embeded image
+ assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embeded office doc
+ assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embeded office doc
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(5)); // PNG inside .doc
// Word with .docx, powerpoint and excel
handler = process("testWORD_embeded.doc", extractor, false);
- assertEquals(3, handler.filenames.size());
- assertEquals(3, handler.mediaTypes.size());
+ assertEquals(8, handler.filenames.size());
+ assertEquals(8, handler.mediaTypes.size());
// We don't know their filenames
- assertEquals(null, handler.filenames.get(0));
- assertEquals(null, handler.filenames.get(1));
- assertEquals(null, handler.filenames.get(2));
+ for(String filename : handler.filenames)
+ assertEquals(null, filename);
// But we do know their types
- assertEquals(TYPE_DOCX, handler.mediaTypes.get(0));
- assertEquals(TYPE_PPT, handler.mediaTypes.get(1));
- assertEquals(TYPE_XLS, handler.mediaTypes.get(2));
+ assertEquals(MediaType.parse("image/unknown"), handler.mediaTypes.get(0)); // Icon of embeded office doc?
+ assertEquals(MediaType.parse("image/unknown"), handler.mediaTypes.get(1)); // Icon of embeded office doc?
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embeded image
+ assertEquals(TYPE_JPG, handler.mediaTypes.get(3)); // Embeded image
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(4)); // Embeded image
+ assertEquals(TYPE_DOCX, handler.mediaTypes.get(5)); // Embeded office doc
+ assertEquals(TYPE_PPT, handler.mediaTypes.get(6)); // Embeded office doc
+ assertEquals(TYPE_XLS, handler.mediaTypes.get(7)); // Embeded office doc
// With recursion, should get their images too
handler = process("testWORD_embeded.doc", extractor, true);
// TODO - Not all resources of embeded files are currently extracted
- assertEquals(4, handler.filenames.size());
- assertEquals(4, handler.mediaTypes.size());
+ assertEquals(9, handler.filenames.size());
+ assertEquals(9, handler.mediaTypes.size());
// We don't know their filenames
- assertEquals(null, handler.filenames.get(0));
- assertEquals(null, handler.filenames.get(1));
- assertEquals(null, handler.filenames.get(2));
- assertEquals(null, handler.filenames.get(3));
+ for(String filename : handler.filenames)
+ assertEquals(null, filename);
// But we do know their types
- assertEquals(TYPE_DOCX, handler.mediaTypes.get(0));
- assertEquals(TYPE_PPT, handler.mediaTypes.get(1));
- assertEquals(TYPE_XLS, handler.mediaTypes.get(2));
- assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // From xls
+ assertEquals(MediaType.parse("image/unknown"), handler.mediaTypes.get(0)); // Icon of embeded office doc?
+ assertEquals(MediaType.parse("image/unknown"), handler.mediaTypes.get(1)); // Icon of embeded office doc?
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embeded image
+ assertEquals(TYPE_JPG, handler.mediaTypes.get(3)); // Embeded image
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(4)); // Embeded image
+ assertEquals(TYPE_DOCX, handler.mediaTypes.get(5)); // Embeded office doc
+ assertEquals(TYPE_PPT, handler.mediaTypes.get(6)); // Embeded office doc
+ assertEquals(TYPE_XLS, handler.mediaTypes.get(7)); // Embeded office doc
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(8)); // PNG inside .xls
// PowerPoint with excel and word
// TODO
@@ -182,8 +221,9 @@ public class POIContainerExtractionTest
}
private TrackingHandler process(String filename, ContainerExtractor extractor, boolean recurse) throws Exception {
- InputStream input = POIContainerExtractionTest.class.getResourceAsStream(
+ InputStream input = POIContainerExtractionTest.class.getResourceAsStream(
"/test-documents/" + filename);
+ assertNotNull(filename + " not found", input);
TikaInputStream stream = TikaInputStream.get(input);
assertEquals(true, extractor.isSupported(stream));