You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by dm...@apache.org on 2017/11/24 01:08:49 UTC

[tika] 02/03: TIKA-2347 - Added extraction of <strike> element in DOCX files

This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 93cbed6df993ef01e59c55b86449b664e9052cae
Author: David Meikle <da...@meikle.io>
AuthorDate: Fri Nov 24 01:01:37 2017 +0000

    TIKA-2347 - Added extraction of <strike> element in DOCX files
---
 .../ooxml/XWPFWordExtractorDecorator.java          |  42 +++++++++++++++++++--
 .../tika/parser/microsoft/WordParserTest.java      |  11 ++++++
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  10 +++++
 .../resources/test-documents/testWORD_various.doc  | Bin 35328 -> 17408 bytes
 .../resources/test-documents/testWORD_various.docx | Bin 19169 -> 14470 bytes
 5 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index 3c700a7..25c5a7c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -263,7 +263,7 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
             xhtml.endElement("a");
         }
 
-        TmpFormatting fmtg = new TmpFormatting(false, false, false);
+        TmpFormatting fmtg = new TmpFormatting(false, false, false, false);
 
         //hyperlinks may or may not have hyperlink ids
         String lastHyperlinkId = null;
@@ -371,6 +371,10 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
         	xhtml.endElement("u");
         	fmtg.setUnderline(false);
         }
+        if (fmtg.isStrikeThrough()) {
+            xhtml.endElement("strike");
+            fmtg.setStrikeThrough(false);
+        }
         return fmtg;
     }
 
@@ -379,6 +383,10 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
             throws SAXException, XmlException, IOException {
         // True if we are currently in the named style tag:
         if (run.isBold() != tfmtg.isBold()) {
+            if (tfmtg.isStrikeThrough()) {
+                xhtml.endElement("strike");
+                tfmtg.setStrikeThrough(false);
+            }
             if (tfmtg.isUnderline()) {
                 xhtml.endElement("u");
                 tfmtg.setUnderline(false);
@@ -396,6 +404,10 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
         }
 
         if (run.isItalic() != tfmtg.isItalic()) {
+            if (tfmtg.isStrikeThrough()) {
+                xhtml.endElement("strike");
+                tfmtg.setStrikeThrough(false);
+            }
             if (tfmtg.isUnderline()) {
                 xhtml.endElement("u");
                 tfmtg.setUnderline(false);
@@ -407,7 +419,20 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
             }
             tfmtg.setItalic(run.isItalic());
         }
-        
+
+        if (run.isStrikeThrough() != tfmtg.isStrikeThrough()) {
+            if (tfmtg.isUnderline()) {
+                xhtml.endElement("u");
+                tfmtg.setUnderline(false);
+            }
+            if (run.isStrikeThrough()) {
+                xhtml.startElement("strike");
+            } else {
+                xhtml.endElement("strike");
+            }
+            tfmtg.setStrikeThrough(run.isStrikeThrough());
+        }
+
         boolean isUnderline = run.getUnderline() != UnderlinePatterns.NONE;
         if (isUnderline != tfmtg.isUnderline()) {
             if (isUnderline) {
@@ -550,11 +575,15 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
         private boolean bold = false;
         private boolean italic = false;
         private boolean underline = false;
+        private boolean strikeThrough = false;
 
-        private TmpFormatting(boolean bold, boolean italic, boolean underline) {
+
+        private TmpFormatting(boolean bold, boolean italic, boolean underline,
+                              boolean strikeThrough) {
             this.bold = bold;
             this.italic = italic;
             this.underline = underline;
+            this.strikeThrough = strikeThrough;
         }
 
         public boolean isBold() {
@@ -582,6 +611,13 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
             this.underline = underline;
         }
 
+        public boolean isStrikeThrough() {
+            return strikeThrough;
+        }
+
+        public void setStrikeThrough(boolean strikeThrough) {
+            this.strikeThrough = strikeThrough;
+        }
     }
 
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index 01b81d7..31bd8ba 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -208,6 +208,17 @@ public class WordParserTest extends TikaTest {
 
     }
 
+    @Test
+    public void testTextDecorationNested() throws Exception {
+        XMLResult result = getXML("testWORD_various.doc");
+        String xml = result.xml;
+
+        assertTrue(xml.contains("<i>ita<s>li</s>c</i>"));
+        assertTrue(xml.contains("<i>ita<s>l<u>i</u></s>c</i>"));
+        assertTrue(xml.contains("<i><u>unde</u><s><u>r</u></s><u>line</u></i>"));
+    }
+
+
     //TIKA-2346
     @Test
     public void testTurningOffTextBox() throws Exception {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 7732d40..7dc455f 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -609,7 +609,17 @@ public class OOXMLParserTest extends TikaTest {
       assertTrue(xml.contains("<b>Bold</b>"));
       assertTrue(xml.contains("<i>italic</i>"));
       assertTrue(xml.contains("<u>underline</u>"));
+      assertTrue(xml.contains("<strike>strikethrough</strike>"));
+    }
+
+    @Test
+    public void testTextDecorationNested() throws Exception {
+        XMLResult result = getXML("testWORD_various.docx");
+        String xml = result.xml;
 
+        assertTrue(xml.contains("<i>ita<strike>li</strike>c</i>"));
+        assertTrue(xml.contains("<i>ita<strike>l<u>i</u></strike>c</i>"));
+        assertTrue(xml.contains("<i><u>unde</u><strike><u>r</u></strike><u>line</i></u>"));
     }
 
     @Test
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_various.doc b/tika-parsers/src/test/resources/test-documents/testWORD_various.doc
index a2ad236..4341c22 100644
Binary files a/tika-parsers/src/test/resources/test-documents/testWORD_various.doc and b/tika-parsers/src/test/resources/test-documents/testWORD_various.doc differ
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_various.docx b/tika-parsers/src/test/resources/test-documents/testWORD_various.docx
index 24d9e63..e4122a0 100644
Binary files a/tika-parsers/src/test/resources/test-documents/testWORD_various.docx and b/tika-parsers/src/test/resources/test-documents/testWORD_various.docx differ

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.