You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2017/11/24 01:08:48 UTC
[tika] 01/03: Fix for TIKA-2347 Adds underline extraction from word
documents
This is an automated email from the ASF dual-hosted git repository.
dmeikle pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit d64a32c63f376b9e003a4512adfec05414d4dfe6
Author: Stuart Hendren <stuart@committed.software>
AuthorDate: Fri Apr 28 16:29:19 2017 +0100
Fix for TIKA-2347 Adds underline extraction from word documents
Extracts underline for both doc and docx and assigns tag <u>.
The <u> tag is nested innermost among the style tags.
Adds tests using testWORD_various.doc and testWORD_various.docx
Updates affected output in other WordParserTests.
---
.../tika/parser/microsoft/WordExtractor.java | 48 ++++++++++++++--------
.../ooxml/XWPFWordExtractorDecorator.java | 38 ++++++++++++++++-
.../tika/parser/microsoft/WordParserTest.java | 26 ++++++++----
.../parser/microsoft/ooxml/OOXMLParserTest.java | 11 +++++
4 files changed, 98 insertions(+), 25 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 569c881..4a80420 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -83,6 +83,7 @@ public class WordExtractor extends AbstractPOIFSExtractor {
private boolean curStrikeThrough;
private boolean curBold;
private boolean curItalic;
+ private boolean curUnderline;
private final Metadata metadata;
@@ -374,20 +375,8 @@ public class WordExtractor extends AbstractPOIFSExtractor {
}
}
- // Close any still open style tags
- if (curStrikeThrough) {
- xhtml.endElement("s");
- curStrikeThrough = false;
- }
- if (curItalic) {
- xhtml.endElement("i");
- curItalic = false;
- }
- if (curBold) {
- xhtml.endElement("b");
- curBold = false;
- }
-
+ closeStyleElements(false, xhtml);
+
xhtml.endElement(tas.getTag());
return 0;
@@ -401,7 +390,11 @@ public class WordExtractor extends AbstractPOIFSExtractor {
if (!skipStyling) {
if (cr.isBold() != curBold) {
- // Enforce nesting -- must close s and i tags
+ // Enforce nesting -- must close u, s and i tags
+ if (curUnderline) {
+ xhtml.endElement("u");
+ curUnderline = false;
+ }
if (curStrikeThrough) {
xhtml.endElement("s");
curStrikeThrough = false;
@@ -419,7 +412,11 @@ public class WordExtractor extends AbstractPOIFSExtractor {
}
if (cr.isItalic() != curItalic) {
- // Enforce nesting -- must close s tag
+ // Enforce nesting -- must close u and s tag
+ if (curUnderline) {
+ xhtml.endElement("u");
+ curUnderline = false;
+ }
if (curStrikeThrough) {
xhtml.endElement("s");
curStrikeThrough = false;
@@ -433,6 +430,11 @@ public class WordExtractor extends AbstractPOIFSExtractor {
}
if (cr.isStrikeThrough() != curStrikeThrough) {
+ // Enforce nesting -- must close u tag
+ if (curUnderline) {
+ xhtml.endElement("u");
+ curUnderline = false;
+ }
if (cr.isStrikeThrough()) {
xhtml.startElement("s");
} else {
@@ -440,6 +442,16 @@ public class WordExtractor extends AbstractPOIFSExtractor {
}
curStrikeThrough = cr.isStrikeThrough();
}
+
+ boolean isUnderline = cr.getUnderlineCode() != 0;
+ if (isUnderline != curUnderline) {
+ if (isUnderline) {
+ xhtml.startElement("u");
+ } else {
+ xhtml.endElement("u");
+ }
+ curUnderline = isUnderline;
+ }
}
// Clean up the text
@@ -550,6 +562,10 @@ public class WordExtractor extends AbstractPOIFSExtractor {
if (skipStyling) {
return;
}
+ if (curUnderline) {
+ xhtml.endElement("u");
+ curUnderline = false;
+ }
if (curStrikeThrough) {
xhtml.endElement("s");
curStrikeThrough = false;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index 55a38fd..3c700a7 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -35,6 +35,7 @@ import org.apache.poi.xwpf.usermodel.IBodyElement;
import org.apache.poi.xwpf.usermodel.ICell;
import org.apache.poi.xwpf.usermodel.IRunElement;
import org.apache.poi.xwpf.usermodel.ISDTContent;
+import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFHeaderFooter;
import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
@@ -262,7 +263,7 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
xhtml.endElement("a");
}
- TmpFormatting fmtg = new TmpFormatting(false, false);
+ TmpFormatting fmtg = new TmpFormatting(false, false, false);
//hyperlinks may or may not have hyperlink ids
String lastHyperlinkId = null;
@@ -366,6 +367,10 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
xhtml.endElement("b");
fmtg.setBold(false);
}
+ if (fmtg.isUnderline()) {
+ xhtml.endElement("u");
+ fmtg.setUnderline(false);
+ }
return fmtg;
}
@@ -374,6 +379,10 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
throws SAXException, XmlException, IOException {
// True if we are currently in the named style tag:
if (run.isBold() != tfmtg.isBold()) {
+ if (tfmtg.isUnderline()) {
+ xhtml.endElement("u");
+ tfmtg.setUnderline(false);
+ }
if (tfmtg.isItalic()) {
xhtml.endElement("i");
tfmtg.setItalic(false);
@@ -387,6 +396,10 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
if (run.isItalic() != tfmtg.isItalic()) {
+ if (tfmtg.isUnderline()) {
+ xhtml.endElement("u");
+ tfmtg.setUnderline(false);
+ }
if (run.isItalic()) {
xhtml.startElement("i");
} else {
@@ -394,6 +407,16 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
tfmtg.setItalic(run.isItalic());
}
+
+ boolean isUnderline = run.getUnderline() != UnderlinePatterns.NONE;
+ if (isUnderline != tfmtg.isUnderline()) {
+ if (isUnderline) {
+ xhtml.startElement("u");
+ } else {
+ xhtml.endElement("u");
+ }
+ tfmtg.setUnderline(isUnderline);
+ }
if (config.getConcatenatePhoneticRuns()) {
xhtml.characters(run.toString());
@@ -526,10 +549,12 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
private class TmpFormatting {
private boolean bold = false;
private boolean italic = false;
+ private boolean underline = false;
- private TmpFormatting(boolean bold, boolean italic) {
+ private TmpFormatting(boolean bold, boolean italic, boolean underline) {
this.bold = bold;
this.italic = italic;
+ this.underline = underline;
}
public boolean isBold() {
@@ -547,6 +572,15 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
public void setItalic(boolean italic) {
this.italic = italic;
}
+
+
+ public boolean isUnderline() {
+ return underline;
+ }
+
+ public void setUnderline(boolean underline) {
+ this.underline = underline;
+ }
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index b70ba72..01b81d7 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -29,6 +29,7 @@ import java.util.Locale;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.tika.TikaTest;
+
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
@@ -109,7 +110,7 @@ public class WordParserTest extends TikaTest {
assertTrue(xml.contains("<td>"));
// TODO - Check for the nested table
// Links
- assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
+ assertTrue(xml.contains("<a href=\"http://tika.apache.org/\"><u>Tika</u></a>"));
// Paragraphs with other styles
assertTrue(xml.contains("<p class=\"signature\">This one"));
@@ -195,6 +196,17 @@ public class WordParserTest extends TikaTest {
assertContains("The quick brown fox jumps over the lazy dog", handler.toString());
}
}
+
+ @Test
+ public void testTextDecoration() throws Exception {
+ XMLResult result = getXML("testWORD_various.doc");
+ String xml = result.xml;
+
+ assertTrue(xml.contains("<b>Bold</b>"));
+ assertTrue(xml.contains("<i>italic</i>"));
+ assertTrue(xml.contains("<u>underline</u>"));
+
+ }
//TIKA-2346
@Test
@@ -383,15 +395,15 @@ public class WordParserTest extends TikaTest {
assertFalse(xml.contains("HYPERLINK"));
// Check we do have the link
- assertContains("<a href=\"http://tw-systemhaus.de\">http:", xml);
+ assertContains("<a href=\"http://tw-systemhaus.de\"><u>http:", xml);
// Check we do have the email
- assertContains("<a href=\"mailto:ab@example.com\">ab@", xml);
+ assertContains("<a href=\"mailto:ab@example.com\"><u>ab@", xml);
}
@Test
public void testControlCharacter() throws Exception {
- assertContains("1. Introduzione<b> </b></a> </p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " "));
+ assertContains("<u>1.</u> <u>Introduzione</u><b> </b></a><u> </u></p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " "));
}
@Test
@@ -405,7 +417,7 @@ public class WordParserTest extends TikaTest {
"application/msword",
metadata.get(Metadata.CONTENT_TYPE));
- assertContains("<p>1. Organisering av vakten:</p>", xml);
+ assertContains("<p><u>1. Organisering av vakten:</u></p>", xml);
}
@@ -543,8 +555,8 @@ public class WordParserTest extends TikaTest {
//TIKA-1255
String xml = getXML("testWORD_boldHyperlink.doc").xml;
xml = xml.replaceAll("\\s+", " ");
- assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", xml);
- assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold" , xml);
+ assertContains("<a href=\"http://tika.apache.org/\"><u>hyper </u><b><u>link</u></b></a>", xml);
+ assertContains("<a href=\"http://tika.apache.org/\"><b><u>hyper</u></b><u> link</u></a>; bold" , xml);
}
@Test
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index b8b3dd1..7732d40 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -600,6 +600,17 @@ public class OOXMLParserTest extends TikaTest {
assertEquals("Should have found some text", false, handler.toString().isEmpty());
}
}
+
+ @Test
+ public void testTextDecoration() throws Exception {
+ XMLResult result = getXML("testWORD_various.docx");
+ String xml = result.xml;
+
+ assertTrue(xml.contains("<b>Bold</b>"));
+ assertTrue(xml.contains("<i>italic</i>"));
+ assertTrue(xml.contains("<u>underline</u>"));
+
+ }
@Test
public void testVarious() throws Exception {
--
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.