You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ma...@apache.org on 2011/03/21 14:02:02 UTC
svn commit: r1083766 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
test/resources/test-documents/headerPic.docx
Author: maxcom
Date: Mon Mar 21 13:02:02 2011
New Revision: 1083766
URL: http://svn.apache.org/viewvc?rev=1083766&view=rev
Log:
DOCX: rich text parsing for DOCX headers / footers
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/headerPic.docx (with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java?rev=1083766&r1=1083765&r2=1083766&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java Mon Mar 21 13:02:02 2011
@@ -24,21 +24,7 @@ import org.apache.poi.openxml4j.opc.Pack
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
-import org.apache.poi.xwpf.usermodel.BodyType;
-import org.apache.poi.xwpf.usermodel.IBody;
-import org.apache.poi.xwpf.usermodel.IBodyElement;
-import org.apache.poi.xwpf.usermodel.XWPFDocument;
-import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
-import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
-import org.apache.poi.xwpf.usermodel.XWPFParagraph;
-import org.apache.poi.xwpf.usermodel.XWPFPicture;
-import org.apache.poi.xwpf.usermodel.XWPFPictureData;
-import org.apache.poi.xwpf.usermodel.XWPFRun;
-import org.apache.poi.xwpf.usermodel.XWPFStyle;
-import org.apache.poi.xwpf.usermodel.XWPFStyles;
-import org.apache.poi.xwpf.usermodel.XWPFTable;
-import org.apache.poi.xwpf.usermodel.XWPFTableCell;
-import org.apache.poi.xwpf.usermodel.XWPFTableRow;
+import org.apache.poi.xwpf.usermodel.*;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.WordExtractor;
import org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle;
@@ -215,30 +201,42 @@ public class XWPFWordExtractorDecorator
private void extractFooters(
XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy)
- throws SAXException {
+ throws SAXException, XmlException, IOException {
// footers
if (hfPolicy.getFirstPageFooter() != null) {
- xhtml.element("p", hfPolicy.getFirstPageFooter().getText());
+ extractHeaderText(xhtml, hfPolicy.getFirstPageFooter());
}
if (hfPolicy.getEvenPageFooter() != null) {
- xhtml.element("p", hfPolicy.getEvenPageFooter().getText());
+ extractHeaderText(xhtml, hfPolicy.getEvenPageFooter());
}
if (hfPolicy.getDefaultFooter() != null) {
- xhtml.element("p", hfPolicy.getDefaultFooter().getText());
+ extractHeaderText(xhtml, hfPolicy.getDefaultFooter());
}
}
private void extractHeaders(
XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy)
- throws SAXException {
+ throws SAXException, XmlException, IOException {
if (hfPolicy.getFirstPageHeader() != null) {
- xhtml.element("p", hfPolicy.getFirstPageHeader().getText());
+ extractHeaderText(xhtml, hfPolicy.getFirstPageHeader());
}
+
if (hfPolicy.getEvenPageHeader() != null) {
- xhtml.element("p", hfPolicy.getEvenPageHeader().getText());
+ extractHeaderText(xhtml, hfPolicy.getEvenPageHeader());
}
+
if (hfPolicy.getDefaultHeader() != null) {
- xhtml.element("p", hfPolicy.getDefaultHeader().getText());
+ extractHeaderText(xhtml, hfPolicy.getDefaultHeader());
+ }
+ }
+
+ private void extractHeaderText(XHTMLContentHandler xhtml, XWPFHeaderFooter header) throws SAXException, XmlException, IOException {
+ for(XWPFParagraph p : header.getParagraphs()) {
+ extractParagraph(p, xhtml);
+ }
+
+ for(XWPFTable table : header.getTables()) {
+ extractTable(table, xhtml);
}
}
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1083766&r1=1083765&r2=1083766&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Mon Mar 21 13:02:02 2011
@@ -341,6 +341,37 @@ public class OOXMLParserTest extends Tes
}
/**
+ * Test that we can extract an image from a DOCX header
+ */
+ public void testWordPicturesInHeader() throws Exception {
+ InputStream input = null;
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+
+ StringWriter sw = new StringWriter();
+ SAXTransformerFactory factory = (SAXTransformerFactory)
+ SAXTransformerFactory.newInstance();
+ TransformerHandler handler = factory.newTransformerHandler();
+ handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+ handler.setResult(new StreamResult(sw));
+
+ // Try with a document that has a picture in its header
+ input = OOXMLParserTest.class.getResourceAsStream("/test-documents/headerPic.docx");
+ try {
+ parser.parse(TikaInputStream.get(input), handler, metadata, context);
+ String xml = sw.toString();
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ metadata.get(Metadata.CONTENT_TYPE));
+ // Check that the header picture came through as an <img> element
+ assertTrue(xml.contains("<img"));
+ } finally {
+ input.close();
+ }
+ }
+
+ /**
* Documents with some sheets are protected, but not all.
* See TIKA-364.
*/
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/headerPic.docx
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/headerPic.docx?rev=1083766&view=auto
==============================================================================
Binary file - no diff available.
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/headerPic.docx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream