You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/09/14 22:16:28 UTC
svn commit: r997072 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/microsoft/WordExtractor.java
test/java/org/apache/tika/parser/microsoft/WordParserTest.java
test/resources/test-documents/testWORD6.doc
Author: nick
Date: Tue Sep 14 20:16:27 2010
New Revision: 997072
URL: http://svn.apache.org/viewvc?rev=997072&view=rev
Log:
Enable Word 6 / Word 95 support via the new POI Word6Extractor class (TIKA-408)
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD6.doc (with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=997072&r1=997071&r2=997072&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Tue Sep 14 20:16:27 2010
@@ -21,6 +21,9 @@ import java.io.IOException;
import java.util.List;
import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFOldDocument;
+import org.apache.poi.hwpf.OldWordFileFormatException;
+import org.apache.poi.hwpf.extractor.Word6Extractor;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
@@ -41,7 +44,13 @@ public class WordExtractor extends Abstr
protected void parse(
POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
- HWPFDocument document = new HWPFDocument(filesystem);
+ HWPFDocument document;
+ try {
+ document = new HWPFDocument(filesystem);
+ } catch(OldWordFileFormatException e) {
+ parseWord6(filesystem, xhtml);
+ return;
+ }
org.apache.poi.hwpf.extractor.WordExtractor wordExtractor =
new org.apache.poi.hwpf.extractor.WordExtractor(document);
@@ -134,5 +143,15 @@ public class WordExtractor extends Abstr
xhtml.endElement("div");
}
}
-
+
+ protected void parseWord6(
+ POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+ throws IOException, SAXException, TikaException {
+ HWPFOldDocument doc = new HWPFOldDocument(filesystem);
+ Word6Extractor extractor = new Word6Extractor(doc);
+
+ for(String p : extractor.getParagraphText()) {
+ xhtml.element("p", p);
+ }
+ }
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=997072&r1=997071&r2=997072&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Tue Sep 14 20:16:27 2010
@@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft
import java.io.InputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
@@ -32,7 +33,7 @@ public class WordParserTest extends Test
try {
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();
- new OfficeParser().parse(input, handler, metadata);
+ new OfficeParser().parse(input, handler, metadata, new ParseContext());
assertEquals(
"application/msword",
@@ -45,4 +46,23 @@ public class WordParserTest extends Test
}
}
+ public void testWord6Parser() throws Exception {
+ InputStream input = WordParserTest.class.getResourceAsStream(
+ "/test-documents/testWORD6.doc");
+ try {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ new OfficeParser().parse(input, handler, metadata, new ParseContext());
+
+ assertEquals(
+ "application/msword",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(Metadata.TITLE));
+ assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT));
+ assertEquals("Nevin Nollop", metadata.get(Metadata.AUTHOR));
+ assertTrue(handler.toString().contains("The quick brown fox jumps over the lazy dog"));
+ } finally {
+ input.close();
+ }
+ }
}
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD6.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD6.doc?rev=997072&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD6.doc
------------------------------------------------------------------------------
svn:mime-type = application/msword