You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2010/11/05 14:01:28 UTC
svn commit: r1031545 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
test/resources/test-documents/footnotes.docx
Author: maxcom
Date: Fri Nov 5 13:01:28 2010
New Revision: 1031545
URL: http://svn.apache.org/viewvc?rev=1031545&view=rev
Log:
XWPFWordExtractorDecorator: extract text from footnotes
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/footnotes.docx (with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1031545&r1=1031544&r2=1031545&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Fri Nov 5 13:01:28 2010
@@ -175,6 +175,11 @@ public class XWPFWordExtractorDecorator
xhtml.characters(commentText);
}
+ String footnameText = paragraph.getFootnoteText();
+ if(footnameText != null && footnameText.length() > 0) {
+ xhtml.characters(footnameText + "\n");
+ }
+
// Finish this paragraph
xhtml.endElement(tag);
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1031545&r1=1031544&r2=1031545&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Fri Nov 5 13:01:28 2010
@@ -245,6 +245,29 @@ public class OOXMLParserTest extends Tes
}
/**
+ * Test that footnote text is extracted from a Word (.docx) document
+ * @throws Exception
+ */
+ public void testWordFootnote() throws Exception {
+ InputStream input = OOXMLParserTest.class
+ .getResourceAsStream("/test-documents/footnotes.docx");
+
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ ParseContext context = new ParseContext();
+
+ try {
+ parser.parse(TikaInputStream.get(input), handler, metadata, context);
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertTrue(handler.toString().contains("snoska"));
+ } finally {
+ input.close();
+ }
+ }
+
+ /**
* Test that the word converter is able to generate the
* correct HTML for the document
*/
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/footnotes.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/footnotes.docx?rev=1031545&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/footnotes.docx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream