You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2017/11/24 01:08:49 UTC
[tika] 02/03: TIKA-2347 - Added extraction of <strike> element in
DOCX files
This is an automated email from the ASF dual-hosted git repository.
dmeikle pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 93cbed6df993ef01e59c55b86449b664e9052cae
Author: David Meikle <da...@meikle.io>
AuthorDate: Fri Nov 24 01:01:37 2017 +0000
TIKA-2347 - Added extraction of <strike> element in DOCX files
---
.../ooxml/XWPFWordExtractorDecorator.java | 42 +++++++++++++++++++--
.../tika/parser/microsoft/WordParserTest.java | 11 ++++++
.../parser/microsoft/ooxml/OOXMLParserTest.java | 10 +++++
.../resources/test-documents/testWORD_various.doc | Bin 35328 -> 17408 bytes
.../resources/test-documents/testWORD_various.docx | Bin 19169 -> 14470 bytes
5 files changed, 60 insertions(+), 3 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index 3c700a7..25c5a7c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -263,7 +263,7 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
xhtml.endElement("a");
}
- TmpFormatting fmtg = new TmpFormatting(false, false, false);
+ TmpFormatting fmtg = new TmpFormatting(false, false, false, false);
//hyperlinks may or may not have hyperlink ids
String lastHyperlinkId = null;
@@ -371,6 +371,10 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
xhtml.endElement("u");
fmtg.setUnderline(false);
}
+ if (fmtg.isStrikeThrough()) {
+ xhtml.endElement("strike");
+ fmtg.setStrikeThrough(false);
+ }
return fmtg;
}
@@ -379,6 +383,10 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
throws SAXException, XmlException, IOException {
// True if we are currently in the named style tag:
if (run.isBold() != tfmtg.isBold()) {
+ if (tfmtg.isStrikeThrough()) {
+ xhtml.endElement("strike");
+ tfmtg.setStrikeThrough(false);
+ }
if (tfmtg.isUnderline()) {
xhtml.endElement("u");
tfmtg.setUnderline(false);
@@ -396,6 +404,10 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
if (run.isItalic() != tfmtg.isItalic()) {
+ if (tfmtg.isStrikeThrough()) {
+ xhtml.endElement("strike");
+ tfmtg.setStrikeThrough(false);
+ }
if (tfmtg.isUnderline()) {
xhtml.endElement("u");
tfmtg.setUnderline(false);
@@ -407,7 +419,20 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
tfmtg.setItalic(run.isItalic());
}
-
+
+ if (run.isStrikeThrough() != tfmtg.isStrikeThrough()) {
+ if (tfmtg.isUnderline()) {
+ xhtml.endElement("u");
+ tfmtg.setUnderline(false);
+ }
+ if (run.isStrikeThrough()) {
+ xhtml.startElement("strike");
+ } else {
+ xhtml.endElement("strike");
+ }
+ tfmtg.setStrikeThrough(run.isStrikeThrough());
+ }
+
boolean isUnderline = run.getUnderline() != UnderlinePatterns.NONE;
if (isUnderline != tfmtg.isUnderline()) {
if (isUnderline) {
@@ -550,11 +575,15 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
private boolean bold = false;
private boolean italic = false;
private boolean underline = false;
+ private boolean strikeThrough = false;
- private TmpFormatting(boolean bold, boolean italic, boolean underline) {
+
+ private TmpFormatting(boolean bold, boolean italic, boolean underline,
+ boolean strikeThrough) {
this.bold = bold;
this.italic = italic;
this.underline = underline;
+ this.strikeThrough = strikeThrough;
}
public boolean isBold() {
@@ -582,6 +611,13 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
this.underline = underline;
}
+ public boolean isStrikeThrough() {
+ return strikeThrough;
+ }
+
+ public void setStrikeThrough(boolean strikeThrough) {
+ this.strikeThrough = strikeThrough;
+ }
}
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index 01b81d7..31bd8ba 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -208,6 +208,17 @@ public class WordParserTest extends TikaTest {
}
+ @Test
+ public void testTextDecorationNested() throws Exception {
+ XMLResult result = getXML("testWORD_various.doc");
+ String xml = result.xml;
+
+ assertTrue(xml.contains("<i>ita<s>li</s>c</i>"));
+ assertTrue(xml.contains("<i>ita<s>l<u>i</u></s>c</i>"));
+ assertTrue(xml.contains("<i><u>unde</u><s><u>r</u></s><u>line</u></i>"));
+ }
+
+
//TIKA-2346
@Test
public void testTurningOffTextBox() throws Exception {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 7732d40..7dc455f 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -609,7 +609,17 @@ public class OOXMLParserTest extends TikaTest {
assertTrue(xml.contains("<b>Bold</b>"));
assertTrue(xml.contains("<i>italic</i>"));
assertTrue(xml.contains("<u>underline</u>"));
+ assertTrue(xml.contains("<strike>strikethrough</strike>"));
+ }
+
+ @Test
+ public void testTextDecorationNested() throws Exception {
+ XMLResult result = getXML("testWORD_various.docx");
+ String xml = result.xml;
+ assertTrue(xml.contains("<i>ita<strike>li</strike>c</i>"));
+ assertTrue(xml.contains("<i>ita<strike>l<u>i</u></strike>c</i>"));
+ assertTrue(xml.contains("<i><u>unde</u><strike><u>r</u></strike><u>line</i></u>"));
}
@Test
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_various.doc b/tika-parsers/src/test/resources/test-documents/testWORD_various.doc
index a2ad236..4341c22 100644
Binary files a/tika-parsers/src/test/resources/test-documents/testWORD_various.doc and b/tika-parsers/src/test/resources/test-documents/testWORD_various.doc differ
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_various.docx b/tika-parsers/src/test/resources/test-documents/testWORD_various.docx
index 24d9e63..e4122a0 100644
Binary files a/tika-parsers/src/test/resources/test-documents/testWORD_various.docx and b/tika-parsers/src/test/resources/test-documents/testWORD_various.docx differ
--
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.