You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2010/11/05 14:01:28 UTC

svn commit: r1031545 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java test/resources/test-documents/footnotes.docx

Author: maxcom
Date: Fri Nov  5 13:01:28 2010
New Revision: 1031545

URL: http://svn.apache.org/viewvc?rev=1031545&view=rev
Log:
XWPFWordExtractorDecorator: extract text from footnotes

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/footnotes.docx   (with props)
Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1031545&r1=1031544&r2=1031545&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Fri Nov  5 13:01:28 2010
@@ -175,6 +175,11 @@ public class XWPFWordExtractorDecorator 
           xhtml.characters(commentText);
        }
 
+       String footnameText = paragraph.getFootnoteText();
+       if(footnameText != null && footnameText.length() > 0) {
+          xhtml.characters(footnameText + "\n");
+       }
+
        // Finish this paragraph
        xhtml.endElement(tag);
 

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1031545&r1=1031544&r2=1031545&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Fri Nov  5 13:01:28 2010
@@ -245,6 +245,29 @@ public class OOXMLParserTest extends Tes
     }
 
     /**
+     * Test that footnote text is extracted from a Word (.docx) document
+     * @throws Exception
+     */
+    public void testWordFootnote() throws Exception {
+        InputStream input = OOXMLParserTest.class
+                .getResourceAsStream("/test-documents/footnotes.docx");
+
+        Metadata metadata = new Metadata();
+        ContentHandler handler = new BodyContentHandler();
+        ParseContext context = new ParseContext();
+
+        try {
+            parser.parse(TikaInputStream.get(input), handler, metadata, context);
+            assertEquals(
+                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertTrue(handler.toString().contains("snoska"));
+        } finally {
+            input.close();
+        }
+    }
+
+    /**
      * Test that the word converter is able to generate the
      *  correct HTML for the document
      */

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/footnotes.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/footnotes.docx?rev=1031545&view=auto
==============================================================================
Binary file - no diff available.

Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/footnotes.docx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream