You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/11/28 13:17:50 UTC

[tika] branch master updated: TIKA-2512 add underline/strikethrough extraction for docx and pptx in SAX-based parsers

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new ef3fc7b  TIKA-2512 add underline/strikethrough extraction for docx and pptx in SAX-based parsers
ef3fc7b is described below

commit ef3fc7bd4de8856038bf31eb0a3dddb343f52d28
Author: tballison <ta...@mitre.org>
AuthorDate: Tue Nov 28 08:17:41 2017 -0500

    TIKA-2512 add underline/strikethrough extraction for docx and pptx in SAX-based parsers
---
 CHANGES.txt                                        |  2 +
 .../microsoft/ooxml/OOXMLTikaBodyPartHandler.java  | 69 +++++++++++++++++++---
 .../ooxml/OOXMLWordAndPowerPointTextHandler.java   | 24 +++++++-
 .../tika/parser/microsoft/ooxml/RunProperties.java | 33 ++++++++++-
 .../parser/microsoft/ooxml/SXWPFExtractorTest.java | 32 ++++++++++
 5 files changed, 149 insertions(+), 11 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 38ec856..4069759 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.17 - ???
 
+  * Extract underline and strikethrough in docx (TIKA-2347 and TIKA-2512).
+
   * Cache TikaConfig in EmbeddedDocumentUtil for better performance
     in documents with large number of attachments (TIKA-2511).
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
index ef3b3dc..95cbc5b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
@@ -21,6 +21,7 @@ package org.apache.tika.parser.microsoft.ooxml;
 import java.math.BigInteger;
 import java.util.Date;
 
+import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.parser.microsoft.WordExtractor;
 import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFStylesShim;
@@ -45,6 +46,8 @@ public class OOXMLTikaBodyPartHandler implements OOXMLWordAndPowerPointTextHandl
     private int sdtDepth = 0;//
     private boolean isItalics = false;
     private boolean isBold = false;
+    private boolean isUnderline = false;
+    private boolean isStrikeThrough = false;
     private boolean wroteHyperlinkStart = false;
 
     //TODO: fix this
@@ -78,29 +81,67 @@ public class OOXMLTikaBodyPartHandler implements OOXMLWordAndPowerPointTextHandl
     @Override
     public void run(RunProperties runProperties, String contents) {
         try {
+
             // True if we are currently in the named style tag:
-            if (runProperties.getBold() != isBold) {
+            if (runProperties.isBold() != isBold) {
+                if (isStrikeThrough) {
+                    xhtml.endElement("strike");
+                    isStrikeThrough = false;
+                }
+                if (isUnderline) {
+                    xhtml.endElement("u");
+                    isUnderline = false;;
+                }
                 if (isItalics) {
                     xhtml.endElement("i");
                     isItalics = false;
                 }
-                if (runProperties.getBold()) {
+                if (runProperties.isBold()) {
                     xhtml.startElement("b");
-                    isBold = true;
                 } else {
                     xhtml.endElement("b");
-                    isBold = false;
                 }
+                isBold = runProperties.isBold();
             }
 
-            if (runProperties.getItalics() != isItalics) {
-                if (runProperties.getItalics()) {
+            if (runProperties.isItalics() != isItalics) {
+                if (isStrikeThrough) {
+                    xhtml.endElement("strike");
+                    isStrikeThrough = false;
+                }
+                if (isUnderline) {
+                    xhtml.endElement("u");
+                    isUnderline = false;
+                }
+                if (runProperties.isItalics()) {
                     xhtml.startElement("i");
-                    isItalics = true;
                 } else {
                     xhtml.endElement("i");
-                    isItalics = false;
                 }
+                isItalics = runProperties.isItalics();
+            }
+
+            if (runProperties.isStrikeThrough() != isStrikeThrough) {
+                if (isUnderline) {
+                    xhtml.endElement("u");
+                    isUnderline = false;
+                }
+                if (runProperties.isStrikeThrough()) {
+                    xhtml.startElement("strike");
+                } else {
+                    xhtml.endElement("strike");
+                }
+                isStrikeThrough = runProperties.isStrikeThrough();
+            }
+
+            boolean runIsUnderlined = runProperties.getUnderline() != UnderlinePatterns.NONE;
+            if (runIsUnderlined != isUnderline) {
+                if (runIsUnderlined) {
+                    xhtml.startElement("u");
+                } else {
+                    xhtml.endElement("u");
+                }
+                isUnderline = runIsUnderlined;
             }
 
             xhtml.characters(contents);
@@ -371,10 +412,22 @@ public class OOXMLTikaBodyPartHandler implements OOXMLWordAndPowerPointTextHandl
     }
 
     private void closeStyleTags() throws SAXException {
+
+        if (isStrikeThrough) {
+            xhtml.endElement("strike");
+            isStrikeThrough = false;
+        }
+
+        if (isUnderline) {
+            xhtml.endElement("u");
+            isUnderline = false;
+        }
+
         if (isItalics) {
             xhtml.endElement("i");
             isItalics = false;
         }
+
         if (isBold) {
             xhtml.endElement("b");
             isBold = false;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
index f12da58..f0ba265 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
@@ -21,6 +21,7 @@ package org.apache.tika.parser.microsoft.ooxml;
 import java.util.Date;
 import java.util.Map;
 
+import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.utils.DateUtils;
 import org.xml.sax.Attributes;
@@ -72,6 +73,8 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
     private final static String TC = "tc";
     private final static String TR = "tr";
     private final static String I = "i";
+    private final static String U = "u";
+    private final static String STRIKE = "strike";
     private final static String NUM_PR = "numPr";
     private final static String BR = "br";
     private final static String HYPERLINK = "hyperlink";
@@ -88,6 +91,7 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
     private final static String V = "v";
     private final static String RUBY = "ruby"; //phonetic section
     private final static String RT = "rt"; //phonetic run
+    private static final String VAL = "val";
 
 
     public final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
@@ -247,6 +251,14 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
             if (inR && inRPr) {
                 currRunProperties.setItalics(true);
             }
+        } else if (STRIKE.equals(localName)) {
+            if (inR && inRPr) {
+                currRunProperties.setStrike(true);
+            }
+        } else if (U.equals(localName)) {
+            if (inR && inRPr) {
+                currRunProperties.setUnderline(getStringVal(atts));
+            }
         } else if (TR.equals(localName)) {
             bodyContentsHandler.startTableRow();
         } else if (NUM_PR.equals(localName)) {
@@ -354,8 +366,16 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
         this.editType = editType;
     }
 
+    private String getStringVal(Attributes atts) {
+        String valString = atts.getValue(W_NS, VAL);
+        if (valString != null) {
+            return valString;
+        }
+        return "";
+    }
+
     private int getIntVal(Attributes atts) {
-        String valString = atts.getValue(W_NS, "val");
+        String valString = atts.getValue(W_NS, VAL);
         if (valString != null) {
             try {
                 return Integer.parseInt(valString);
@@ -454,6 +474,8 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
         runBuffer.setLength(0);
         currRunProperties.setBold(false);
         currRunProperties.setItalics(false);
+        currRunProperties.setStrike(false);
+        currRunProperties.setUnderline(UnderlinePatterns.NONE.name());
     }
 
     private void handlePict() {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
index 9fbfcd8..aa126af 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
@@ -17,6 +17,9 @@
 
 package org.apache.tika.parser.microsoft.ooxml;
 
+import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTUnderline;
+
 /**
  * WARNING: This class is mutable.  Make a copy of it
  * if you want persistence!
@@ -25,12 +28,15 @@ package org.apache.tika.parser.microsoft.ooxml;
 public class RunProperties {
     boolean italics = false;
     boolean bold = false;
+    boolean strikeThrough = false;
+
+    UnderlinePatterns underline = UnderlinePatterns.NONE;
 
-    public boolean getItalics() {
+    public boolean isItalics() {
         return italics;
     }
 
-    public boolean getBold() {
+    public boolean isBold() {
         return bold;
     }
 
@@ -41,4 +47,27 @@ public class RunProperties {
     public void setBold(boolean bold) {
         this.bold = bold;
     }
+
+    public boolean isStrikeThrough() {
+        return strikeThrough;
+    }
+
+    public void setStrike(boolean strikeThrough) {
+        this.strikeThrough = strikeThrough;
+    }
+
+    public UnderlinePatterns getUnderline() {
+        return underline;
+    }
+
+    public void setUnderline(String underlineString) {
+        if (underlineString == null || underlineString.equals("")) {
+            underline = UnderlinePatterns.SINGLE;
+        } else if (UnderlinePatterns.NONE.name().equals(underlineString)) {
+            underline = UnderlinePatterns.NONE;
+        } else {
+            //TODO -- map the remaining UnderlinePatterns values; until then any other value is treated as SINGLE
+            underline = UnderlinePatterns.SINGLE;
+        }
+    }
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 89bd754..063a062 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -838,4 +838,36 @@ public class SXWPFExtractorTest extends TikaTest {
         assertNotContained("\u3068", xml);
     }
 
+    @Test
+    public void testTextDecoration() throws Exception {
+        String xml = getXML("testWORD_various.docx", parseContext).xml;
+
+        assertContains("<b>Bold</b>", xml);
+        assertContains("<i>italic</i>", xml);
+        assertContains("<u>underline</u>", xml);
+        assertContains("<strike>strikethrough</strike>", xml);
+    }
+
+    @Test
+    public void testTextDecorationNested() throws Exception {
+        String xml = getXML("testWORD_various.docx", parseContext).xml;
+
+        assertContains("<i>ita<strike>li</strike>c</i>", xml);
+        assertContains("<i>ita<strike>l<u>i</u></strike>c</i>", xml);
+        assertContains("<i><u>unde</u><strike><u>r</u></strike><u>line</u></i>", xml);
+
+        //confirm that spaces aren't added for <strike/> and <u/>
+        ContentHandler contentHandler = new BodyContentHandler();
+        try (InputStream is = getResourceAsStream("/test-documents/testWORD_various.docx")){
+            new AutoDetectParser().parse(is, contentHandler, new Metadata(), parseContext);
+        }
+        String txt = contentHandler.toString();
+        assertContainsCount("italic", txt, 3);
+        assertNotContained("ita ", txt);
+
+        assertContainsCount("underline", txt, 2);
+        assertNotContained("unde ", txt);
+    }
+
+
 }

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.