You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/08/21 15:26:34 UTC
svn commit: r1159980 - in /tika/trunk/tika-parsers/src:
main/java/org/apache/tika/parser/microsoft/
test/java/org/apache/tika/parser/microsoft/ test/resources/test-documents/
Author: jukka
Date: Sun Aug 21 13:26:34 2011
New Revision: 1159980
URL: http://svn.apache.org/viewvc?rev=1159980&view=rev
Log:
TIKA-692: TikaCLI -x or -h on a Word doc sometimes adds newline after </b> tag
Patch by Michael McCandless
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.doc (with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.doc (with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java?rev=1159980&r1=1159979&r2=1159980&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java Sun Aug 21 13:26:34 2011
@@ -54,6 +54,11 @@ public class WordExtractor extends Abstr
super(context);
}
+ // True if we are currently in the named style tag:
+ private boolean curStrikeThrough;
+ private boolean curBold;
+ private boolean curItalic;
+
protected void parse(
NPOIFSFileSystem filesystem, XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
@@ -187,6 +192,20 @@ public class WordExtractor extends Abstr
}
}
+ // Close any still open style tags
+ if (curStrikeThrough) {
+ xhtml.endElement("s");
+ curStrikeThrough = false;
+ }
+ if (curItalic) {
+ xhtml.endElement("i");
+ curItalic = false;
+ }
+ if (curBold) {
+ xhtml.endElement("b");
+ curBold = false;
+ }
+
xhtml.endElement(tas.getTag());
return 0;
@@ -198,14 +217,47 @@ public class WordExtractor extends Abstr
if(cr.text().equals("\r"))
return;
- List<String> tags = new ArrayList<String>();
if(!skipStyling) {
- if(cr.isBold()) tags.add("b");
- if(cr.isItalic()) tags.add("i");
- if(cr.isStrikeThrough()) tags.add("s");
- for(String tag : tags) {
- xhtml.startElement(tag);
- }
+ if (cr.isBold() != curBold) {
+ // Enforce nesting -- must close s and i tags
+ if (curStrikeThrough) {
+ xhtml.endElement("s");
+ curStrikeThrough = false;
+ }
+ if (curItalic) {
+ xhtml.endElement("i");
+ curItalic = false;
+ }
+ if (cr.isBold()) {
+ xhtml.startElement("b");
+ } else {
+ xhtml.endElement("b");
+ }
+ curBold = cr.isBold();
+ }
+
+ if (cr.isItalic() != curItalic) {
+ // Enforce nesting -- must close s tag
+ if (curStrikeThrough) {
+ xhtml.endElement("s");
+ curStrikeThrough = false;
+ }
+ if (cr.isItalic()) {
+ xhtml.startElement("i");
+ } else {
+ xhtml.endElement("i");
+ }
+ curItalic = cr.isItalic();
+ }
+
+ if (cr.isStrikeThrough() != curStrikeThrough) {
+ if (cr.isStrikeThrough()) {
+ xhtml.startElement("s");
+ } else {
+ xhtml.endElement("s");
+ }
+ curStrikeThrough = cr.isStrikeThrough();
+ }
}
// Clean up the text
@@ -217,10 +269,6 @@ public class WordExtractor extends Abstr
}
xhtml.characters(text);
-
- for(int tn=tags.size()-1; tn>=0; tn--) {
- xhtml.endElement(tags.get(tn));
- }
}
/**
* Can be \13..text..\15 or \13..control..\14..text..\15 .
Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java?rev=1159980&r1=1159979&r2=1159980&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java Sun Aug 21 13:26:34 2011
@@ -68,11 +68,17 @@ public class WordParserTest extends Test
}
}
- /**
- * Test that the word converter is able to generate the
- * correct HTML for the document
- */
- public void testWordHTML() throws Exception {
+ private static class XMLResult {
+ public final String xml;
+ public final Metadata metadata;
+
+ public XMLResult(String xml, Metadata metadata) {
+ this.xml = xml;
+ this.metadata = metadata;
+ }
+ }
+
+ private XMLResult getXML(String filePath) throws Exception {
InputStream input = null;
Metadata metadata = new Metadata();
@@ -81,59 +87,83 @@ public class WordParserTest extends Test
SAXTransformerFactory.newInstance();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
- handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
+ handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
handler.setResult(new StreamResult(sw));
// Try with a document containing various tables and formattings
- input = OOXMLParserTest.class.getResourceAsStream("/test-documents/testWORD.doc");
+ input = OOXMLParserTest.class.getResourceAsStream(filePath);
try {
new OfficeParser().parse(input, handler, metadata, new ParseContext());
- String xml = sw.toString();
- assertEquals(
- "application/msword",
- metadata.get(Metadata.CONTENT_TYPE));
- assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
- assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
- assertTrue(xml.contains("Sample Word Document"));
-
- // Check that custom headings came through
- assertTrue(xml.contains("<h1 class=\"title\">"));
- // Regular headings
- assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
- assertTrue(xml.contains("<h3>Heading Level 3</h3>"));
- // Bold and italic
- assertTrue(xml.contains("<b>BOLD</b>"));
- assertTrue(xml.contains("<i>ITALIC</i>"));
- // Table
- assertTrue(xml.contains("<table>"));
- assertTrue(xml.contains("<td>"));
- // TODO - Check for the nested table
- // Links
- assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
- // Paragraphs with other styles
- assertTrue(xml.contains("<p class=\"signature\">This one"));
+ return new XMLResult(sw.toString(), metadata);
} finally {
input.close();
}
+ }
+
+ /**
+ * Test that the word converter is able to generate the
+ * correct HTML for the document
+ */
+ public void testWordHTML() throws Exception {
+
+ // Try with a document containing various tables and
+ // formattings
+ XMLResult result = getXML("/test-documents/testWORD.doc");
+ String xml = result.xml;
+ Metadata metadata = result.metadata;
+
+ assertEquals(
+ "application/msword",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Sample Word Document", metadata.get(Metadata.TITLE));
+ assertEquals("Keith Bennett", metadata.get(Metadata.AUTHOR));
+ assertTrue(xml.contains("Sample Word Document"));
+
+ // Check that custom headings came through
+ assertTrue(xml.contains("<h1 class=\"title\">"));
+ // Regular headings
+ assertTrue(xml.contains("<h1>Heading Level 1</h1>"));
+ assertTrue(xml.contains("<h3>Heading Level 3</h3>"));
+ // Bold and italic
+ assertTrue(xml.contains("<b>BOLD</b>"));
+ assertTrue(xml.contains("<i>ITALIC</i>"));
+ // Table
+ assertTrue(xml.contains("<table>"));
+ assertTrue(xml.contains("<td>"));
+ // TODO - Check for the nested table
+ // Links
+ assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
+ // Paragraphs with other styles
+ assertTrue(xml.contains("<p class=\"signature\">This one"));
// Try with a document that contains images
- sw = new StringWriter();
- handler.setResult(new StreamResult(sw));
- input = OOXMLParserTest.class.getResourceAsStream("/test-documents/testWORD_3imgs.doc");
- try {
- new OfficeParser().parse(TikaInputStream.get(input), handler, metadata, new ParseContext());
- String xml = sw.toString();
+ xml = getXML("/test-documents/testWORD_3imgs.doc").xml;
+
+ // Images 1-3
+ assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image1.png\""));
+ assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image2.jpg\""));
+ assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image3.png\""));
- // Images 1-3
- assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image1.png\""));
- assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image2.jpg\""));
- assertTrue("Image not found in:\n"+xml, xml.contains("src=\"embedded:image3.png\""));
+ // Text too
+ assertTrue(xml.contains("<p>The end!"));
+
+ // TIKA-692: test document containing multiple
+ // character runs within a bold tag:
+ xml = getXML("/test-documents/testWORD_bold_character_runs.doc").xml;
+
+ // Make sure the bold text arrived as a single
+ // contiguous string even though the Word parser
+ // handled this as 3 character runs
+ assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>"));
+
+ // TIKA-692: a second test document containing
+ // multiple character runs within a bold tag:
+ xml = getXML("/test-documents/testWORD_bold_character_runs2.doc").xml;
- // Text too
- assertTrue(xml.contains("<p>The end!"));
- } finally {
- input.close();
- }
+ // Make sure the bold text arrived as a single
+ // contiguous string even though the Word parser
+ // handled this as 3 character runs
+ assertTrue("Bold text wasn't contiguous: "+xml, xml.contains("F<b>oob</b>a<b>r</b>"));
}
public void testWord6Parser() throws Exception {
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.doc?rev=1159980&view=auto
==============================================================================
Files tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.doc (added) and tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.doc Sun Aug 21 13:26:34 2011 differ
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs.doc
------------------------------------------------------------------------------
svn:executable = *
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.doc
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.doc?rev=1159980&view=auto
==============================================================================
Files tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.doc (added) and tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.doc Sun Aug 21 13:26:34 2011 differ
Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testWORD_bold_character_runs2.doc
------------------------------------------------------------------------------
svn:executable = *