You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2010/11/11 14:32:22 UTC
svn commit: r1033938 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
test/java/org/apache/tika/parser/microsoft/WordParserTest.java
test/resources/test-documents/Doc1_ole.doc
Author: maxcom
Date: Thu Nov 11 13:32:21 2010
New Revision: 1033938
URL: http://svn.apache.org/viewvc?rev=1033938&view=rev
Log:
Extract embedded Ole10Native files from POIFS
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/Doc1_ole.doc (with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java?rev=1033938&r1=1033937&r2=1033938&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java Thu Nov 11 13:32:21 2010
@@ -16,17 +16,10 @@
*/
package org.apache.tika.parser.microsoft;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.poi.poifs.filesystem.DirectoryEntry;
-import org.apache.poi.poifs.filesystem.DocumentEntry;
-import org.apache.poi.poifs.filesystem.DocumentInputStream;
-import org.apache.poi.poifs.filesystem.Entry;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import java.io.*;
+
+import org.apache.poi.poifs.filesystem.*;
+import org.apache.poi.util.IOUtils;
import org.apache.tika.detect.ZipContainerDetector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
@@ -110,10 +103,31 @@ abstract class AbstractPOIFSExtractor {
// What kind of document is it?
Metadata metadata = new Metadata();
POIFSDocumentType type = POIFSDocumentType.detectType(dir);
- metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
- // Trigger for the document itself
- TikaInputStream embedded = TikaInputStream.get(tmpFile);
+ TikaInputStream embedded;
+
+ if (type==POIFSDocumentType.OLE10_NATIVE) {
+ Entry entry = dir.getEntry(Ole10Native.OLE10_NATIVE);
+ ByteArrayOutputStream bos = new ByteArrayOutputStream();
+ IOUtils.copy(new DocumentInputStream((DocumentEntry) entry), bos);
+ byte[] data = bos.toByteArray();
+
+ try {
+ Ole10Native ole = new Ole10Native(data, 0);
+ byte[] dataBuffer = ole.getDataBuffer();
+
+ metadata.set("resourceName", dir.getName() + '/' + ole.getLabel());
+
+ embedded = TikaInputStream.get(dataBuffer);
+ } catch (Ole10NativeException ex) {
+ embedded = TikaInputStream.get(data);
+ }
+ } else {
+ metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
+
+ embedded = TikaInputStream.get(tmpFile);
+ }
+
try {
if (extractor.shouldParseEmbedded(metadata)) {
extractor.parseEmbedded(embedded, xhtml, metadata, true);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1033938&r1=1033937&r2=1033938&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Thu Nov 11 13:32:21 2010
@@ -53,7 +53,21 @@ public class WordParserTest extends Test
input.close();
}
}
-
+
+ public void testWordWithWAV() throws Exception {
+ InputStream input = WordParserTest.class.getResourceAsStream(
+ "/test-documents/Doc1_ole.doc");
+ try {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ new OfficeParser().parse(input, handler, metadata, new ParseContext());
+
+ assertTrue(handler.toString().contains("MSj00974840000[1].wav"));
+ } finally {
+ input.close();
+ }
+ }
+
/**
* Test that the word converter is able to generate the
* correct HTML for the document
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/Doc1_ole.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/Doc1_ole.doc?rev=1033938&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/Doc1_ole.doc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream