You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/09/09 16:11:47 UTC
svn commit: r995438 - in /tika/trunk:
tika-core/src/main/java/org/apache/tika/config/
tika-core/src/main/java/org/apache/tika/extractor/
tika-parsers/src/main/java/org/apache/tika/parser/microsoft/
tika-parsers/src/test/java/org/apache/tika/parser/micr...
Author: nick
Date: Thu Sep 9 14:11:46 2010
New Revision: 995438
URL: http://svn.apache.org/viewvc?rev=995438&view=rev
Log:
Support for container extraction of Images in .xls, and OOXML files embedded in OLE2 documents (TIKA-509)
Also rename ContainerEmbededResourceHandler to EmbededResourceHandler as suggested by Jukka, fix ParserContainerExtractor recursion, and remove ContainerExtractor from TikaConfig now that we have ParserContainerExtractor.
Added:
tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbededResourceHandler.java
- copied, changed from r995359, tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerEmbededResourceHandler.java
Removed:
tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerEmbededResourceHandler.java
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java
tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java?rev=995438&r1=995437&r2=995438&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java Thu Sep 9 14:11:46 2010
@@ -20,10 +20,8 @@ import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
-import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
-import java.util.List;
import java.util.Map;
import javax.imageio.spi.ServiceRegistry;
@@ -32,7 +30,6 @@ import javax.xml.parsers.DocumentBuilder
import javax.xml.parsers.ParserConfigurationException;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.mime.MimeTypeException;
@@ -54,9 +51,6 @@ public class TikaConfig {
private final Map<MediaType, Parser> parsers =
new HashMap<MediaType, Parser>();
- private final List<ContainerExtractor> containerExtractors =
- new ArrayList<ContainerExtractor>();
-
private final MimeTypes mimeTypes;
public TikaConfig(String file)
@@ -254,10 +248,6 @@ public class TikaConfig {
return parsers;
}
- public List<ContainerExtractor> getContainerExtractors() {
- return containerExtractors;
- }
-
public MimeTypes getMimeRepository(){
return mimeTypes;
}
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java?rev=995438&r1=995437&r2=995438&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java Thu Sep 9 14:11:46 2010
@@ -45,7 +45,7 @@ public interface ContainerExtractor exte
* Processes a container file, and extracts all the embeded
* resources from within it.
* <p>
- * The {@link ContainerEmbededResourceHandler} you supply will
+ * The {@link EmbededResourceHandler} you supply will
* be called for each embeded resource in the container. It is
* up to you whether you process the contents of the resource or not.
* <p>
@@ -66,6 +66,6 @@ public interface ContainerExtractor exte
*/
void extract(
TikaInputStream stream, ContainerExtractor recurseExtractor,
- ContainerEmbededResourceHandler handler)
+ EmbededResourceHandler handler)
throws IOException, TikaException;
}
Copied: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbededResourceHandler.java (from r995359, tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerEmbededResourceHandler.java)
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbededResourceHandler.java?p2=tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbededResourceHandler.java&p1=tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerEmbededResourceHandler.java&r1=995359&r2=995438&rev=995438&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ContainerEmbededResourceHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/EmbededResourceHandler.java Thu Sep 9 14:11:46 2010
@@ -25,7 +25,7 @@ import org.apache.tika.mime.MediaType;
* To work with a {@link ContainerExtractor}, your code needs
* to implement this interface.
*/
-public interface ContainerEmbededResourceHandler {
+public interface EmbededResourceHandler {
/**
* Called to process an embeded resource within the container.
* This will be called once per embeded resource within the
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java?rev=995438&r1=995437&r2=995438&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java Thu Sep 9 14:11:46 2010
@@ -71,8 +71,8 @@ public class ParserContainerExtractor im
}
public void extract(
- TikaInputStream stream, ContainerExtractor recurseExtractor,
- final ContainerEmbededResourceHandler handler)
+ TikaInputStream stream, final ContainerExtractor recurseExtractor,
+ final EmbededResourceHandler handler)
throws IOException, TikaException {
ParseContext context = new ParseContext();
context.set(Parser.class, new Parser() {
@@ -82,9 +82,24 @@ public class ParserContainerExtractor im
public void parse(InputStream stream, ContentHandler ignored,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
+ // Figure out what we have to process
String filename = metadata.get(Metadata.RESOURCE_NAME_KEY);
- MediaType type = detector.detect(stream, metadata);
+ MediaType type;
+ if(metadata.get(Metadata.CONTENT_TYPE) != null) {
+ type = MediaType.parse( metadata.get(Metadata.CONTENT_TYPE) );
+ } else {
+ type = detector.detect(stream, metadata);
+ }
+
+ // Let the handler process the embeded resource
handler.handle(filename, type, stream);
+
+ // Recurse if requested
+ if(recurseExtractor != null) {
+ recurseExtractor.extract(
+ TikaInputStream.get(stream), recurseExtractor, handler
+ );
+ }
}
public void parse(InputStream stream, ContentHandler handler,
Metadata metadata) throws IOException, SAXException,
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=995438&r1=995437&r2=995438&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Thu Sep 9 14:11:46 2010
@@ -17,6 +17,7 @@
package org.apache.tika.parser.microsoft;
import java.io.File;
+import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
@@ -26,9 +27,11 @@ import org.apache.poi.poifs.filesystem.D
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.tika.detect.ZipContainerDetector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -44,6 +47,28 @@ abstract class AbstractPOIFSExtractor {
protected AbstractPOIFSExtractor(ParseContext context) {
this.context = context;
}
+
+ protected void handleEmbededResource(TikaInputStream resource,
+ String filename, String mediaType, XHTMLContentHandler xhtml)
+ throws IOException, SAXException, TikaException {
+ try {
+ Metadata metadata = new Metadata();
+ if(filename != null) {
+ metadata.set(Metadata.TIKA_MIME_FILE, filename);
+ }
+ if(mediaType != null) {
+ metadata.set(Metadata.CONTENT_TYPE, mediaType);
+ }
+
+ Parser parser = context.get(Parser.class, EmptyParser.INSTANCE);
+ parser.parse(
+ resource, new EmbeddedContentHandler(xhtml),
+ metadata, context
+ );
+ } finally {
+ resource.close();
+ }
+ }
/**
* Handle an office document that's embedded at the POIFS level
@@ -51,6 +76,22 @@ abstract class AbstractPOIFSExtractor {
protected void handleEmbededOfficeDoc(
DirectoryEntry dir, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
+ // Is it an embeded OLE2 document, or an embeded OOXML document?
+ try {
+ Entry ooxml = dir.getEntry("Package");
+
+ // It's OOXML
+ TikaInputStream ooxmlStream = TikaInputStream.get(
+ new DocumentInputStream((DocumentEntry)ooxml)
+ );
+ ZipContainerDetector detector = new ZipContainerDetector();
+ MediaType type = detector.detect(ooxmlStream, new Metadata());
+ handleEmbededResource(ooxmlStream, null, type.toString(), xhtml);
+ return;
+ } catch(FileNotFoundException e) {
+ // It's regular OLE2
+ }
+
// Need to dump the directory out to a new temp file, so
// it's stand along
POIFSFileSystem newFS = new POIFSFileSystem();
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=995438&r1=995437&r2=995438&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Thu Sep 9 14:11:46 2010
@@ -27,15 +27,22 @@ import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
+import org.apache.poi.ddf.EscherBSERecord;
+import org.apache.poi.ddf.EscherBitmapBlip;
+import org.apache.poi.ddf.EscherBlipRecord;
+import org.apache.poi.ddf.EscherMetafileBlip;
+import org.apache.poi.ddf.EscherRecord;
import org.apache.poi.hssf.eventusermodel.FormatTrackingHSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
+import org.apache.poi.hssf.record.AbstractEscherHolderRecord;
import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.BoundSheetRecord;
import org.apache.poi.hssf.record.CellValueRecordInterface;
import org.apache.poi.hssf.record.CountryRecord;
import org.apache.poi.hssf.record.DateWindow1904Record;
+import org.apache.poi.hssf.record.DrawingGroupRecord;
import org.apache.poi.hssf.record.EOFRecord;
import org.apache.poi.hssf.record.ExtendedFormatRecord;
import org.apache.poi.hssf.record.FormatRecord;
@@ -50,11 +57,13 @@ import org.apache.poi.hssf.record.SSTRec
import org.apache.poi.hssf.record.TextObjectRecord;
import org.apache.poi.hssf.record.chart.SeriesTextRecord;
import org.apache.poi.hssf.record.common.UnicodeString;
+import org.apache.poi.hssf.usermodel.HSSFPictureData;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.SAXException;
@@ -124,8 +133,8 @@ public class ExcelExtractor extends Abst
*/
protected void parse(
POIFSFileSystem filesystem, XHTMLContentHandler xhtml,
- Locale locale) throws IOException, SAXException {
- TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale);
+ Locale locale) throws IOException, SAXException, TikaException {
+ TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this);
listener.processFile(filesystem, isListenForAllRecords());
listener.throwStoredException();
@@ -152,6 +161,11 @@ public class ExcelExtractor extends Abst
* XHTML content handler to which the document content is rendered.
*/
private final XHTMLContentHandler handler;
+
+ /**
+ * The POIFS Extractor, used for embeded resources.
+ */
+ private final AbstractPOIFSExtractor extractor;
/**
* Potential exception thrown by the content handler. When set to
@@ -159,7 +173,7 @@ public class ExcelExtractor extends Abst
* ignored and the stored exception to be thrown when
* {@link #throwStoredException()} is invoked.
*/
- private SAXException exception = null;
+ private Exception exception = null;
private SSTRecord sstRecord;
@@ -201,6 +215,13 @@ public class ExcelExtractor extends Abst
* @see <a href="https://issues.apache.org/jira/browse/TIKA-103">TIKA-103</a>
*/
private final NumberFormat format;
+
+ /**
+ * These aren't complete when we first see them, as the
+ * depend on continue records that aren't always
+ * contiguous. Collect them for later processing.
+ */
+ private List<DrawingGroupRecord> drawingGroups = new ArrayList<DrawingGroupRecord>();
/**
* Construct a new listener instance outputting parsed data to
@@ -208,8 +229,9 @@ public class ExcelExtractor extends Abst
*
* @param handler Destination to write the parsed output to
*/
- private TikaHSSFListener(XHTMLContentHandler handler, Locale locale) {
+ private TikaHSSFListener(XHTMLContentHandler handler, Locale locale, AbstractPOIFSExtractor extractor) {
this.handler = handler;
+ this.extractor = extractor;
this.format = NumberFormat.getInstance(locale);
this.formatListener = new FormatTrackingHSSFListener(this, locale);
}
@@ -224,7 +246,7 @@ public class ExcelExtractor extends Abst
* @throws SAXException on any SAX parsing errors.
*/
public void processFile(POIFSFileSystem filesystem, boolean listenForAllRecords)
- throws IOException, SAXException {
+ throws IOException, SAXException, TikaException {
// Set up listener and register the records we want to process
HSSFRequest hssfRequest = new HSSFRequest();
@@ -247,6 +269,7 @@ public class ExcelExtractor extends Abst
hssfRequest.addListener(formatListener, SeriesTextRecord.sid);
hssfRequest.addListener(formatListener, FormatRecord.sid);
hssfRequest.addListener(formatListener, ExtendedFormatRecord.sid);
+ hssfRequest.addListener(formatListener, DrawingGroupRecord.sid);
}
// Create event factory and process Workbook (fire events)
@@ -256,6 +279,13 @@ public class ExcelExtractor extends Abst
// Output any extra text that came after all the sheets
processExtraText();
+
+ // Look for embeded images, now that the drawing records
+ // have been fully matched with their continue data
+ for(DrawingGroupRecord dgr : drawingGroups) {
+ dgr.decode();
+ findPictures(dgr.getEscherRecords());
+ }
}
/**
@@ -267,19 +297,29 @@ public class ExcelExtractor extends Abst
if (exception == null) {
try {
internalProcessRecord(record);
- } catch (SAXException e) {
- exception = e;
+ } catch (TikaException te) {
+ exception = te;
+ } catch (IOException ie) {
+ exception = ie;
+ } catch (SAXException se) {
+ exception = se;
}
}
}
- public void throwStoredException() throws SAXException {
+ public void throwStoredException() throws TikaException, SAXException, IOException {
if (exception != null) {
- throw exception;
+ if(exception instanceof IOException)
+ throw (IOException)exception;
+ if(exception instanceof SAXException)
+ throw (SAXException)exception;
+ if(exception instanceof TikaException)
+ throw (TikaException)exception;
+ throw new TikaException(exception.getMessage());
}
}
- private void internalProcessRecord(Record record) throws SAXException {
+ private void internalProcessRecord(Record record) throws SAXException, TikaException, IOException {
switch (record.getSid()) {
case BOFRecord.sid: // start of workbook, worksheet etc. records
BOFRecord bof = (BOFRecord) record;
@@ -366,6 +406,13 @@ public class ExcelExtractor extends Abst
SeriesTextRecord str = (SeriesTextRecord) record;
addTextCell(record, str.getText());
break;
+
+ case DrawingGroupRecord.sid:
+ // Collect this now, we'll process later when all
+ // the continue records are in
+ drawingGroups.add( (DrawingGroupRecord)record );
+ break;
+
}
previousSid = record.getSid();
@@ -478,6 +525,55 @@ public class ExcelExtractor extends Abst
handler.endElement("div");
}
+ private void findPictures(List<EscherRecord> records) throws IOException, SAXException, TikaException {
+ for(EscherRecord escherRecord : records) {
+ if (escherRecord instanceof EscherBSERecord) {
+ EscherBlipRecord blip = ((EscherBSERecord) escherRecord).getBlipRecord();
+ if (blip != null) {
+ // TODO When we have upgraded POI, we can use this code instead
+ //HSSFPictureData picture = new HSSFPictureData(blip);
+ //String mimeType = picture.getMimeType();
+ //TikaInputStream stream = TikaInputStream.get(picture.getData());
+
+ // This code is cut'n'paste from a newer version of POI
+ String mimeType = "";
+ switch (blip.getRecordId()) {
+ case EscherMetafileBlip.RECORD_ID_WMF:
+ mimeType = "application/x-wmf";
+ break;
+ case EscherMetafileBlip.RECORD_ID_EMF:
+ mimeType = "application/x-emf";
+ break;
+ case EscherMetafileBlip.RECORD_ID_PICT:
+ mimeType = "image/x-pict";
+ break;
+ case EscherBitmapBlip.RECORD_ID_PNG:
+ mimeType = "image/png";
+ break;
+ case EscherBitmapBlip.RECORD_ID_JPEG:
+ mimeType = "image/jpeg";
+ break;
+ case EscherBitmapBlip.RECORD_ID_DIB:
+ mimeType = "image/bmp";
+ break;
+ default:
+ mimeType = "image/unknown";
+ break;
+ }
+ TikaInputStream stream = TikaInputStream.get(blip.getPicturedata());
+
+ // Handle the embeded resource
+ extractor.handleEmbededResource(
+ stream, null, mimeType,
+ handler
+ );
+ }
+ }
+
+ // Recursive call.
+ findPictures(escherRecord.getChildRecords());
+ }
+ }
}
/**
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java?rev=995438&r1=995437&r2=995438&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java Thu Sep 9 14:11:46 2010
@@ -22,8 +22,8 @@ import java.util.List;
import junit.framework.TestCase;
-import org.apache.tika.extractor.ContainerEmbededResourceHandler;
import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.EmbededResourceHandler;
import org.apache.tika.extractor.ParserContainerExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.mime.MediaType;
@@ -36,6 +36,13 @@ public class POIContainerExtractionTest
private static final MediaType TYPE_DOC = MediaType.application("msword");
private static final MediaType TYPE_PPT = MediaType.application("vnd.ms-powerpoint");
private static final MediaType TYPE_XLS = MediaType.application("vnd.ms-excel");
+ private static final MediaType TYPE_DOCX = MediaType.application("vnd.openxmlformats-officedocument.wordprocessingml.document");
+ private static final MediaType TYPE_PPTX = MediaType.application("vnd.openxmlformats-officedocument.presentationml.presentation");
+ private static final MediaType TYPE_XLSX = MediaType.application("vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+
+ private static final MediaType TYPE_JPG = MediaType.image("jpg");
+ private static final MediaType TYPE_PNG = MediaType.image("png");
+ private static final MediaType TYPE_EMF = MediaType.application("x-emf");
/**
* For office files which don't have anything embeded in them
@@ -72,9 +79,11 @@ public class POIContainerExtractionTest
// Excel with 1 image
handler = process("testEXCEL_1img.xls", extractor, false);
- // TODO
- assertEquals(0, handler.filenames.size());
- assertEquals(0, handler.mediaTypes.size());
+ assertEquals(1, handler.filenames.size());
+ assertEquals(1, handler.mediaTypes.size());
+
+ assertEquals(null, handler.filenames.get(0));
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
// PowerPoint with 2 images + sound
// TODO
@@ -103,20 +112,27 @@ public class POIContainerExtractionTest
ContainerExtractor extractor = new ParserContainerExtractor();
TrackingHandler handler;
+
// Excel with a word doc and a powerpoint doc, both of which have images in them
- // Without recursion, should see both
+ // Without recursion, should see both documents + the images
handler = process("testEXCEL_embeded.xls", extractor, false);
- assertEquals(2, handler.filenames.size());
- assertEquals(2, handler.mediaTypes.size());
+ assertEquals(5, handler.filenames.size());
+ assertEquals(5, handler.mediaTypes.size());
// We don't know their filenames
assertEquals(null, handler.filenames.get(0));
assertEquals(null, handler.filenames.get(1));
+ assertEquals(null, handler.filenames.get(2));
+ assertEquals(null, handler.filenames.get(3));
+ assertEquals(null, handler.filenames.get(4));
// But we do know their types
- assertEquals(TYPE_PPT, handler.mediaTypes.get(0));
- assertEquals(TYPE_DOC, handler.mediaTypes.get(1));
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(0)); // Icon of embeded office doc
+ assertEquals(TYPE_EMF, handler.mediaTypes.get(1)); // Icon of embeded office doc
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(2)); // Embeded image
+ assertEquals(TYPE_PPT, handler.mediaTypes.get(3)); // Embeded office doc
+ assertEquals(TYPE_DOC, handler.mediaTypes.get(4)); // Embeded office doc
- // With recursion, should get their images too
+ // With recursion, should get the images embeded in the office files too
handler = process("testEXCEL_embeded.xls", extractor, true);
// TODO
@@ -131,14 +147,27 @@ public class POIContainerExtractionTest
assertEquals(null, handler.filenames.get(1));
assertEquals(null, handler.filenames.get(2));
// But we do know their types
- assertEquals(MediaType.application("x-tika-msoffice"), handler.mediaTypes.get(0)); // TODO
+ assertEquals(TYPE_DOCX, handler.mediaTypes.get(0));
assertEquals(TYPE_PPT, handler.mediaTypes.get(1));
assertEquals(TYPE_XLS, handler.mediaTypes.get(2));
+
// With recursion, should get their images too
handler = process("testWORD_embeded.doc", extractor, true);
- // TODO
+ // TODO - Not all resources of embeded files are currently extracted
+ assertEquals(4, handler.filenames.size());
+ assertEquals(4, handler.mediaTypes.size());
+ // We don't know their filenames
+ assertEquals(null, handler.filenames.get(0));
+ assertEquals(null, handler.filenames.get(1));
+ assertEquals(null, handler.filenames.get(2));
+ assertEquals(null, handler.filenames.get(3));
+ // But we do know their types
+ assertEquals(TYPE_DOCX, handler.mediaTypes.get(0));
+ assertEquals(TYPE_PPT, handler.mediaTypes.get(1));
+ assertEquals(TYPE_XLS, handler.mediaTypes.get(2));
+ assertEquals(TYPE_PNG, handler.mediaTypes.get(3)); // From xls
// PowerPoint with excel and word
// TODO
@@ -161,13 +190,17 @@ public class POIContainerExtractionTest
// Process it
TrackingHandler handler = new TrackingHandler();
- extractor.extract(stream, null, handler);
+ if(recurse) {
+ extractor.extract(stream, extractor, handler);
+ } else {
+ extractor.extract(stream, null, handler);
+ }
// So they can check what happened
return handler;
}
- private static class TrackingHandler implements ContainerEmbededResourceHandler {
+ private static class TrackingHandler implements EmbededResourceHandler {
private List<String> filenames = new ArrayList<String>();
private List<MediaType> mediaTypes = new ArrayList<MediaType>();