You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2010/09/14 22:16:28 UTC

svn commit: r997072 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/WordExtractor.java test/java/org/apache/tika/parser/microsoft/WordParserTest.java test/resources/test-documents/testWORD6.doc

Author: nick
Date: Tue Sep 14 20:16:27 2010
New Revision: 997072

URL: http://svn.apache.org/viewvc?rev=997072&view=rev
Log:
Enable Word 6 / Word 95 support via the new POI Word6Extractor class (TIKA-408)

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD6.doc   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=997072&r1=997071&r2=997072&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Tue Sep 14 20:16:27 2010
@@ -21,6 +21,9 @@ import java.io.IOException;
 import java.util.List;
 
 import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFOldDocument;
+import org.apache.poi.hwpf.OldWordFileFormatException;
+import org.apache.poi.hwpf.extractor.Word6Extractor;
 import org.apache.poi.hwpf.model.PicturesTable;
 import org.apache.poi.hwpf.usermodel.Picture;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
@@ -41,7 +44,13 @@ public class WordExtractor extends Abstr
     protected void parse(
             POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
             throws IOException, SAXException, TikaException {
-        HWPFDocument document = new HWPFDocument(filesystem);
+        HWPFDocument document;
+        try {
+            document = new HWPFDocument(filesystem);
+        } catch(OldWordFileFormatException e) {
+            parseWord6(filesystem, xhtml);
+            return;
+        }
         org.apache.poi.hwpf.extractor.WordExtractor wordExtractor =
             new org.apache.poi.hwpf.extractor.WordExtractor(document);
 
@@ -134,5 +143,15 @@ public class WordExtractor extends Abstr
             xhtml.endElement("div");
         }
     }
-
+    
+    protected void parseWord6(
+            POIFSFileSystem filesystem, XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaException { // fallback: parse() calls this when HWPFDocument throws OldWordFileFormatException
+        HWPFOldDocument doc = new HWPFOldDocument(filesystem); // re-open the same filesystem with the old-format document class
+        Word6Extractor extractor = new Word6Extractor(doc);
+        // Write every paragraph string returned by the extractor as its own <p> element.
+        for(String p : extractor.getParagraphText()) {
+            xhtml.element("p", p);
+        }
+    }
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=997072&r1=997071&r2=997072&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Tue Sep 14 20:16:27 2010
@@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft
 import java.io.InputStream;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.BodyContentHandler;
 import org.xml.sax.ContentHandler;
 
@@ -32,7 +33,7 @@ public class WordParserTest extends Test
         try {
             ContentHandler handler = new BodyContentHandler();
             Metadata metadata = new Metadata();
-            new OfficeParser().parse(input, handler, metadata);
+            new OfficeParser().parse(input, handler, metadata, new ParseContext());
 
             assertEquals(
                     "application/msword",
@@ -45,4 +46,23 @@ public class WordParserTest extends Test
         }
     }
 
+    public void testWord6Parser() throws Exception { // runs OfficeParser over the testWORD6.doc resource
+        InputStream input = WordParserTest.class.getResourceAsStream(
+                "/test-documents/testWORD6.doc");
+        try {
+            ContentHandler handler = new BodyContentHandler();
+            Metadata metadata = new Metadata();
+            new OfficeParser().parse(input, handler, metadata, new ParseContext());
+            // Check content type, title, subject and author metadata, then that the body text contains the expected sentence.
+            assertEquals(
+                    "application/msword",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("The quick brown fox jumps over the lazy dog", metadata.get(Metadata.TITLE));
+            assertEquals("Gym class featuring a brown fox and lazy dog", metadata.get(Metadata.SUBJECT));
+            assertEquals("Nevin Nollop", metadata.get(Metadata.AUTHOR));
+            assertTrue(handler.toString().contains("The quick brown fox jumps over the lazy dog"));
+        } finally {
+            input.close();
+        }
+    }
 }

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD6.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD6.doc?rev=997072&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD6.doc
------------------------------------------------------------------------------
    svn:mime-type = application/msword