You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/09/12 21:12:58 UTC
tika git commit: TIKA-1255 -- fix hyperlinks in doc/docx if there is formatting;
TIKA-2078 -- handle multiple runs within a hyperlink (docx)
Repository: tika
Updated Branches:
refs/heads/master 52be68263 -> 80efc84b6
TIKA-1255 -- fix hyperlinks in doc/docx if there is formatting
TIKA-2078 -- handle multiple runs within a hyperlink (docx)
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/80efc84b
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/80efc84b
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/80efc84b
Branch: refs/heads/master
Commit: 80efc84b675c8defa5e86b01b85e1dabc84d32f5
Parents: 52be682
Author: tballison <ta...@mitre.org>
Authored: Mon Sep 12 17:12:46 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Mon Sep 12 17:12:46 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 5 +-
.../tika/parser/microsoft/WordExtractor.java | 20 +++++++
.../ooxml/XWPFWordExtractorDecorator.java | 52 +++++++++++++------
.../tika/parser/microsoft/WordParserTest.java | 11 +++-
.../parser/microsoft/ooxml/OOXMLParserTest.java | 11 +++-
.../test-documents/testWORD_boldHyperlink.doc | Bin 0 -> 27136 bytes
.../test-documents/testWORD_boldHyperlink.docx | Bin 0 -> 12382 bytes
7 files changed, 79 insertions(+), 20 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/80efc84b/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 47f916f..20a10a7 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 1.14 - ???
-
+
+ * Fix hyperlinks with formatting in DOC and DOCX (TIKA-1255
+ and TIKA-2078)
+
* Tika now is integrated with the Tensorflow library from Google
and it can use its Inception v3 image classification model to
identify objects in images (TIKA-1993).
http://git-wip-us.apache.org/repos/asf/tika/blob/80efc84b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 8d36115..54ba55b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -503,9 +503,11 @@ public class WordExtractor extends AbstractPOIFSExtractor {
}
xhtml.startElement("a", "href", url);
+ closeStyleElements(skipStyling, xhtml);
for (CharacterRun cr : texts) {
handleCharacterRun(cr, skipStyling, xhtml);
}
+ closeStyleElements(skipStyling, xhtml);
xhtml.endElement("a");
} else {
// Just output the text ones
@@ -530,6 +532,24 @@ public class WordExtractor extends AbstractPOIFSExtractor {
return i - index;
}
+ private void closeStyleElements(boolean skipStyling, XHTMLContentHandler xhtml) throws SAXException {
+ if (skipStyling) {
+ return;
+ }
+ if (curStrikeThrough) {
+ xhtml.endElement("s");
+ curStrikeThrough = false;
+ }
+ if (curItalic) {
+ xhtml.endElement("i");
+ curItalic = false;
+ }
+ if (curBold) {
+ xhtml.endElement("b");
+ curBold = false;
+ }
+ }
+
//temporary work around for TIKA-1512
private int findHyperlinkEnd(String text, int start) {
int end = text.lastIndexOf('"');
http://git-wip-us.apache.org/repos/asf/tika/blob/80efc84b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index 6caf803..143c438 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -212,8 +212,40 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
TmpFormatting fmtg = new TmpFormatting(false, false);
+ //hyperlinks may or may not have hyperlink ids
+ String lastHyperlinkId = null;
+ boolean inHyperlink = false;
// Do the iruns
for (IRunElement run : paragraph.getIRuns()) {
+
+ if (run instanceof XWPFHyperlinkRun) {
+ XWPFHyperlinkRun hyperlinkRun = (XWPFHyperlinkRun) run;
+ if (hyperlinkRun.getHyperlinkId() == null ||
+ !hyperlinkRun.getHyperlinkId().equals(lastHyperlinkId)) {
+ if (inHyperlink) {
+ //close out the old one
+ xhtml.endElement("a");
+ inHyperlink = false;
+ }
+ lastHyperlinkId = hyperlinkRun.getHyperlinkId();
+ fmtg = closeStyleTags(xhtml, fmtg);
+ XWPFHyperlink link = hyperlinkRun.getHyperlink(document);
+ if (link != null && link.getURL() != null) {
+ xhtml.startElement("a", "href", link.getURL());
+ inHyperlink = true;
+ } else if (hyperlinkRun.getAnchor() != null && hyperlinkRun.getAnchor().length() > 0) {
+ xhtml.startElement("a", "href", "#" + hyperlinkRun.getAnchor());
+ inHyperlink = true;
+ }
+ }
+ } else if (inHyperlink) {
+ //if this isn't a hyperlink, but the last one was
+ closeStyleTags(xhtml, fmtg);
+ xhtml.endElement("a");
+ lastHyperlinkId = null;
+ inHyperlink = false;
+ }
+
if (run instanceof XWPFSDT) {
fmtg = closeStyleTags(xhtml, fmtg);
processSDTRun((XWPFSDT) run, xhtml);
@@ -226,6 +258,9 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
}
closeStyleTags(xhtml, fmtg);
+ if (inHyperlink) {
+ xhtml.endElement("a");
+ }
// Now do any comments for the paragraph
@@ -306,19 +341,6 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
tfmtg.setItalic(run.isItalic());
}
- boolean addedHREF = false;
- if (run instanceof XWPFHyperlinkRun) {
- XWPFHyperlinkRun linkRun = (XWPFHyperlinkRun) run;
- XWPFHyperlink link = linkRun.getHyperlink(document);
- if (link != null && link.getURL() != null) {
- xhtml.startElement("a", "href", link.getURL());
- addedHREF = true;
- } else if (linkRun.getAnchor() != null && linkRun.getAnchor().length() > 0) {
- xhtml.startElement("a", "href", "#" + linkRun.getAnchor());
- addedHREF = true;
- }
- }
-
xhtml.characters(run.toString());
// If we have any pictures, output them
@@ -337,10 +359,6 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
}
}
- if (addedHREF) {
- xhtml.endElement("a");
- }
-
return tfmtg;
}
http://git-wip-us.apache.org/repos/asf/tika/blob/80efc84b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index c1e3909..8b42ff1 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -365,7 +365,7 @@ public class WordParserTest extends TikaTest {
@Test
public void testControlCharacter() throws Exception {
- assertContains("1. Introduzione<b> </a></b> </p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " "));
+ assertContains("1. Introduzione<b> </b></a> </p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " "));
}
@Test
@@ -511,5 +511,14 @@ public class WordParserTest extends TikaTest {
assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder (2)\\embed1.zip",
Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
}
+
+ @Test
+ public void testBoldHyperlink() throws Exception {
+ //TIKA-1255
+ String xml = getXML("testWORD_boldHyperlink.doc").xml;
+ xml = xml.replaceAll("\\s+", " ");
+ assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", xml);
+ assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold" , xml);
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/80efc84b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 8625fa3..3e984de 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -424,7 +424,7 @@ public class OOXMLParserTest extends TikaTest {
// Links
assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
// Anchor links
- assertTrue(xml.contains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>"));
+ assertContains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>", xml);
// Paragraphs with other styles
assertTrue(xml.contains("<p class=\"signature\">This one"));
@@ -1247,6 +1247,15 @@ public class OOXMLParserTest extends TikaTest {
assertContains("1.23456789012345E+15", xml);//16 digit number is treated as scientific notation
assertContains("1.23456789012345E+15", xml);//16 digit formula, ditto
}
+
+ @Test
+ public void testBoldHyperlink() throws Exception {
+ //TIKA-1255
+ String xml = getXML("testWORD_boldHyperlink.docx").xml;
+ xml = xml.replaceAll("\\s+", " ");
+ assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", xml);
+ assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold" , xml);
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/80efc84b/tika-parsers/src/test/resources/test-documents/testWORD_boldHyperlink.doc
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_boldHyperlink.doc b/tika-parsers/src/test/resources/test-documents/testWORD_boldHyperlink.doc
new file mode 100644
index 0000000..293d00a
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_boldHyperlink.doc differ
http://git-wip-us.apache.org/repos/asf/tika/blob/80efc84b/tika-parsers/src/test/resources/test-documents/testWORD_boldHyperlink.docx
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_boldHyperlink.docx b/tika-parsers/src/test/resources/test-documents/testWORD_boldHyperlink.docx
new file mode 100644
index 0000000..fb23d10
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_boldHyperlink.docx differ