You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/11/28 13:17:50 UTC
[tika] branch master updated: TIKA-2512 add underline/strikethrough extraction for docx and pptx in SAX-based parsers
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new ef3fc7b TIKA-2512 add underline/strikethrough extraction for docx and pptx in SAX-based parsers
ef3fc7b is described below
commit ef3fc7bd4de8856038bf31eb0a3dddb343f52d28
Author: tballison <ta...@mitre.org>
AuthorDate: Tue Nov 28 08:17:41 2017 -0500
TIKA-2512 add underline/strikethrough extraction for docx and pptx in SAX-based parsers
---
CHANGES.txt | 2 +
.../microsoft/ooxml/OOXMLTikaBodyPartHandler.java | 69 +++++++++++++++++++---
.../ooxml/OOXMLWordAndPowerPointTextHandler.java | 24 +++++++-
.../tika/parser/microsoft/ooxml/RunProperties.java | 33 ++++++++++-
.../parser/microsoft/ooxml/SXWPFExtractorTest.java | 32 ++++++++++
5 files changed, 149 insertions(+), 11 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 38ec856..4069759 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 1.17 - ???
+ * Extract underline and strikethrough in docx (TIKA-2347 and TIKA-2512).
+
* Cache TikaConfig in EmbeddedDocumentUtil for better performance
in documents with large number of attachments (TIKA-2511).
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
index ef3b3dc..95cbc5b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
@@ -21,6 +21,7 @@ package org.apache.tika.parser.microsoft.ooxml;
import java.math.BigInteger;
import java.util.Date;
+import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.WordExtractor;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFStylesShim;
@@ -45,6 +46,8 @@ public class OOXMLTikaBodyPartHandler implements OOXMLWordAndPowerPointTextHandl
private int sdtDepth = 0;//
private boolean isItalics = false;
private boolean isBold = false;
+ private boolean isUnderline = false;
+ private boolean isStrikeThrough = false;
private boolean wroteHyperlinkStart = false;
//TODO: fix this
@@ -78,29 +81,67 @@ public class OOXMLTikaBodyPartHandler implements OOXMLWordAndPowerPointTextHandl
@Override
public void run(RunProperties runProperties, String contents) {
try {
+
// True if we are currently in the named style tag:
- if (runProperties.getBold() != isBold) {
+ if (runProperties.isBold() != isBold) {
+ if (isStrikeThrough) {
+ xhtml.endElement("strike");
+ isStrikeThrough = false;
+ }
+ if (isUnderline) {
+ xhtml.endElement("u");
+ isUnderline = false;;
+ }
if (isItalics) {
xhtml.endElement("i");
isItalics = false;
}
- if (runProperties.getBold()) {
+ if (runProperties.isBold()) {
xhtml.startElement("b");
- isBold = true;
} else {
xhtml.endElement("b");
- isBold = false;
}
+ isBold = runProperties.isBold();
}
- if (runProperties.getItalics() != isItalics) {
- if (runProperties.getItalics()) {
+ if (runProperties.isItalics() != isItalics) {
+ if (isStrikeThrough) {
+ xhtml.endElement("strike");
+ isStrikeThrough = false;
+ }
+ if (isUnderline) {
+ xhtml.endElement("u");
+ isUnderline = false;
+ }
+ if (runProperties.isItalics()) {
xhtml.startElement("i");
- isItalics = true;
} else {
xhtml.endElement("i");
- isItalics = false;
}
+ isItalics = runProperties.isItalics();
+ }
+
+ if (runProperties.isStrikeThrough() != isStrikeThrough) {
+ if (isUnderline) {
+ xhtml.endElement("u");
+ isUnderline = false;
+ }
+ if (runProperties.isStrikeThrough()) {
+ xhtml.startElement("strike");
+ } else {
+ xhtml.endElement("strike");
+ }
+ isStrikeThrough = runProperties.isStrikeThrough();
+ }
+
+ boolean runIsUnderlined = runProperties.getUnderline() != UnderlinePatterns.NONE;
+ if (runIsUnderlined != isUnderline) {
+ if (runIsUnderlined) {
+ xhtml.startElement("u");
+ } else {
+ xhtml.endElement("u");
+ }
+ isUnderline = runIsUnderlined;
}
xhtml.characters(contents);
@@ -371,10 +412,22 @@ public class OOXMLTikaBodyPartHandler implements OOXMLWordAndPowerPointTextHandl
}
private void closeStyleTags() throws SAXException {
+
+ if (isStrikeThrough) {
+ xhtml.endElement("strike");
+ isStrikeThrough = false;
+ }
+
+ if (isUnderline) {
+ xhtml.endElement("u");
+ isUnderline = false;
+ }
+
if (isItalics) {
xhtml.endElement("i");
isItalics = false;
}
+
if (isBold) {
xhtml.endElement("b");
isBold = false;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
index f12da58..f0ba265 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
@@ -21,6 +21,7 @@ package org.apache.tika.parser.microsoft.ooxml;
import java.util.Date;
import java.util.Map;
+import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.utils.DateUtils;
import org.xml.sax.Attributes;
@@ -72,6 +73,8 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
private final static String TC = "tc";
private final static String TR = "tr";
private final static String I = "i";
+ private final static String U = "u";
+ private final static String STRIKE = "strike";
private final static String NUM_PR = "numPr";
private final static String BR = "br";
private final static String HYPERLINK = "hyperlink";
@@ -88,6 +91,7 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
private final static String V = "v";
private final static String RUBY = "ruby"; //phonetic section
private final static String RT = "rt"; //phonetic run
+ private static final String VAL = "val";
public final static String W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
@@ -247,6 +251,14 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
if (inR && inRPr) {
currRunProperties.setItalics(true);
}
+ } else if (STRIKE.equals(localName)) {
+ if (inR && inRPr) {
+ currRunProperties.setStrike(true);
+ }
+ } else if (U.equals(localName)) {
+ if (inR && inRPr) {
+ currRunProperties.setUnderline(getStringVal(atts));
+ }
} else if (TR.equals(localName)) {
bodyContentsHandler.startTableRow();
} else if (NUM_PR.equals(localName)) {
@@ -354,8 +366,16 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
this.editType = editType;
}
+ private String getStringVal(Attributes atts) {
+ String valString = atts.getValue(W_NS, VAL);
+ if (valString != null) {
+ return valString;
+ }
+ return "";
+ }
+
private int getIntVal(Attributes atts) {
- String valString = atts.getValue(W_NS, "val");
+ String valString = atts.getValue(W_NS, VAL);
if (valString != null) {
try {
return Integer.parseInt(valString);
@@ -454,6 +474,8 @@ public class OOXMLWordAndPowerPointTextHandler extends DefaultHandler {
runBuffer.setLength(0);
currRunProperties.setBold(false);
currRunProperties.setItalics(false);
+ currRunProperties.setStrike(false);
+ currRunProperties.setUnderline(UnderlinePatterns.NONE.name());
}
private void handlePict() {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
index 9fbfcd8..aa126af 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
@@ -17,6 +17,9 @@
package org.apache.tika.parser.microsoft.ooxml;
+import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTUnderline;
+
/**
* WARNING: This class is mutable. Make a copy of it
* if you want persistence!
@@ -25,12 +28,15 @@ package org.apache.tika.parser.microsoft.ooxml;
public class RunProperties {
boolean italics = false;
boolean bold = false;
+ boolean strikeThrough = false;
+
+ UnderlinePatterns underline = UnderlinePatterns.NONE;
- public boolean getItalics() {
+ public boolean isItalics() {
return italics;
}
- public boolean getBold() {
+ public boolean isBold() {
return bold;
}
@@ -41,4 +47,27 @@ public class RunProperties {
public void setBold(boolean bold) {
this.bold = bold;
}
+
+ public boolean isStrikeThrough() {
+ return strikeThrough;
+ }
+
+ public void setStrike(boolean strikeThrough) {
+ this.strikeThrough = strikeThrough;
+ }
+
+ public UnderlinePatterns getUnderline() {
+ return underline;
+ }
+
+ public void setUnderline(String underlineString) {
+ if (underlineString == null || underlineString.equals("")) {
+ underline = UnderlinePatterns.SINGLE;
+ } else if (UnderlinePatterns.NONE.name().equals(underlineString)) {
+ underline = UnderlinePatterns.NONE;
+ } else {
+ //TODO -- fill out rest
+ underline = UnderlinePatterns.SINGLE;
+ }
+ }
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
index 89bd754..063a062 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/SXWPFExtractorTest.java
@@ -838,4 +838,36 @@ public class SXWPFExtractorTest extends TikaTest {
assertNotContained("\u3068", xml);
}
+ @Test
+ public void testTextDecoration() throws Exception {
+ String xml = getXML("testWORD_various.docx", parseContext).xml;
+
+ assertContains("<b>Bold</b>", xml);
+ assertContains("<i>italic</i>", xml);
+ assertContains("<u>underline</u>", xml);
+ assertContains("<strike>strikethrough</strike>", xml);
+ }
+
+ @Test
+ public void testTextDecorationNested() throws Exception {
+ String xml = getXML("testWORD_various.docx", parseContext).xml;
+
+ assertContains("<i>ita<strike>li</strike>c</i>", xml);
+ assertContains("<i>ita<strike>l<u>i</u></strike>c</i>", xml);
+ assertContains("<i><u>unde</u><strike><u>r</u></strike><u>line</u></i>", xml);
+
+ //confirm that spaces aren't added for <strike/> and <u/>
+ ContentHandler contentHandler = new BodyContentHandler();
+ try (InputStream is = getResourceAsStream("/test-documents/testWORD_various.docx")){
+ new AutoDetectParser().parse(is, contentHandler, new Metadata(), parseContext);
+ }
+ String txt = contentHandler.toString();
+ assertContainsCount("italic", txt, 3);
+ assertNotContained("ita ", txt);
+
+ assertContainsCount("underline", txt, 2);
+ assertNotContained("unde ", txt);
+ }
+
+
}
--
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <co...@tika.apache.org>.