You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/09/12 21:12:58 UTC

tika git commit: TIKA-1255 -- fix hyperlinks in doc/docx if there is formatting; TIKA-2078 -- handle multiple runs within a hyperlink (docx)

Repository: tika
Updated Branches:
  refs/heads/master 52be68263 -> 80efc84b6


TIKA-1255 -- fix hyperlinks in doc/docx if there is formatting
TIKA-2078 -- handle multiple runs within a hyperlink (docx)


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/80efc84b
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/80efc84b
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/80efc84b

Branch: refs/heads/master
Commit: 80efc84b675c8defa5e86b01b85e1dabc84d32f5
Parents: 52be682
Author: tballison <ta...@mitre.org>
Authored: Mon Sep 12 17:12:46 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Mon Sep 12 17:12:46 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |   5 +-
 .../tika/parser/microsoft/WordExtractor.java    |  20 +++++++
 .../ooxml/XWPFWordExtractorDecorator.java       |  52 +++++++++++++------
 .../tika/parser/microsoft/WordParserTest.java   |  11 +++-
 .../parser/microsoft/ooxml/OOXMLParserTest.java |  11 +++-
 .../test-documents/testWORD_boldHyperlink.doc   | Bin 0 -> 27136 bytes
 .../test-documents/testWORD_boldHyperlink.docx  | Bin 0 -> 12382 bytes
 7 files changed, 79 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/80efc84b/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 47f916f..20a10a7 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 1.14 - ???
- 
+
+  * Fix hyperlinks with formatting in DOC and DOCX (TIKA-1255
+    and TIKA-2078)
+
   * Tika now is integrated with the Tensorflow library from Google 
     and it can use its Inception v3 image classification model to 
     identify objects in images (TIKA-1993).

http://git-wip-us.apache.org/repos/asf/tika/blob/80efc84b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 8d36115..54ba55b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -503,9 +503,11 @@ public class WordExtractor extends AbstractPOIFSExtractor {
                 }
 
                 xhtml.startElement("a", "href", url);
+                closeStyleElements(skipStyling, xhtml);
                 for (CharacterRun cr : texts) {
                     handleCharacterRun(cr, skipStyling, xhtml);
                 }
+                closeStyleElements(skipStyling, xhtml);
                 xhtml.endElement("a");
             } else {
                 // Just output the text ones
@@ -530,6 +532,24 @@ public class WordExtractor extends AbstractPOIFSExtractor {
         return i - index;
     }
 
+    private void closeStyleElements(boolean skipStyling, XHTMLContentHandler xhtml) throws SAXException {
+        if (skipStyling) {
+            return;
+        }
+        if (curStrikeThrough) {
+            xhtml.endElement("s");
+            curStrikeThrough = false;
+        }
+        if (curItalic) {
+            xhtml.endElement("i");
+            curItalic = false;
+        }
+        if (curBold) {
+            xhtml.endElement("b");
+            curBold = false;
+        }
+    }
+
     //temporary work around for TIKA-1512
     private int findHyperlinkEnd(String text, int start) {
         int end = text.lastIndexOf('"');

http://git-wip-us.apache.org/repos/asf/tika/blob/80efc84b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index 6caf803..143c438 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -212,8 +212,40 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
 
         TmpFormatting fmtg = new TmpFormatting(false, false);
 
+        //hyperlinks may or may not have hyperlink ids
+        String lastHyperlinkId = null;
+        boolean inHyperlink = false;
         // Do the iruns
         for (IRunElement run : paragraph.getIRuns()) {
+
+            if (run instanceof XWPFHyperlinkRun) {
+                XWPFHyperlinkRun hyperlinkRun = (XWPFHyperlinkRun) run;
+                if (hyperlinkRun.getHyperlinkId() == null ||
+                        !hyperlinkRun.getHyperlinkId().equals(lastHyperlinkId)) {
+                    if (inHyperlink) {
+                        //close out the old one
+                        xhtml.endElement("a");
+                        inHyperlink = false;
+                    }
+                    lastHyperlinkId = hyperlinkRun.getHyperlinkId();
+                    fmtg = closeStyleTags(xhtml, fmtg);
+                    XWPFHyperlink link = hyperlinkRun.getHyperlink(document);
+                    if (link != null && link.getURL() != null) {
+                        xhtml.startElement("a", "href", link.getURL());
+                        inHyperlink = true;
+                    } else if (hyperlinkRun.getAnchor() != null && hyperlinkRun.getAnchor().length() > 0) {
+                        xhtml.startElement("a", "href", "#" + hyperlinkRun.getAnchor());
+                        inHyperlink = true;
+                    }
+                }
+            } else if (inHyperlink) {
+                //if this isn't a hyperlink, but the last one was
+                closeStyleTags(xhtml, fmtg);
+                xhtml.endElement("a");
+                lastHyperlinkId = null;
+                inHyperlink = false;
+            }
+
             if (run instanceof XWPFSDT) {
                 fmtg = closeStyleTags(xhtml, fmtg);
                 processSDTRun((XWPFSDT) run, xhtml);
@@ -226,6 +258,9 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
             }
         }
         closeStyleTags(xhtml, fmtg);
+        if (inHyperlink) {
+            xhtml.endElement("a");
+        }
 
 
         // Now do any comments for the paragraph
@@ -306,19 +341,6 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
             tfmtg.setItalic(run.isItalic());
         }
 
-        boolean addedHREF = false;
-        if (run instanceof XWPFHyperlinkRun) {
-            XWPFHyperlinkRun linkRun = (XWPFHyperlinkRun) run;
-            XWPFHyperlink link = linkRun.getHyperlink(document);
-            if (link != null && link.getURL() != null) {
-                xhtml.startElement("a", "href", link.getURL());
-                addedHREF = true;
-            } else if (linkRun.getAnchor() != null && linkRun.getAnchor().length() > 0) {
-                xhtml.startElement("a", "href", "#" + linkRun.getAnchor());
-                addedHREF = true;
-            }
-        }
-
         xhtml.characters(run.toString());
 
         // If we have any pictures, output them
@@ -337,10 +359,6 @@ public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {
             }
         }
 
-        if (addedHREF) {
-            xhtml.endElement("a");
-        }
-
         return tfmtg;
     }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/80efc84b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index c1e3909..8b42ff1 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -365,7 +365,7 @@ public class WordParserTest extends TikaTest {
 
     @Test
     public void testControlCharacter() throws Exception {
-        assertContains("1. Introduzione<b> </a></b> </p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " "));
+        assertContains("1. Introduzione<b> </b></a> </p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " "));
     }
 
     @Test
@@ -511,5 +511,14 @@ public class WordParserTest extends TikaTest {
         assertContains("C:\\Users\\tallison\\Desktop\\tmp\\New folder (2)\\embed1.zip",
                 Arrays.asList(embed1_zip_metadata.getValues(TikaCoreProperties.ORIGINAL_RESOURCE_NAME)));
     }
+
+    @Test
+    public void testBoldHyperlink() throws Exception {
+        //TIKA-1255
+        String xml = getXML("testWORD_boldHyperlink.doc").xml;
+        xml = xml.replaceAll("\\s+", " ");
+        assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", xml);
+        assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold" , xml);
+    }
 }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/80efc84b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 8625fa3..3e984de 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -424,7 +424,7 @@ public class OOXMLParserTest extends TikaTest {
         // Links
         assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
         // Anchor links
-        assertTrue(xml.contains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>"));
+        assertContains("<a href=\"#OnMainHeading\">The Main Heading Bookmark</a>", xml);
         // Paragraphs with other styles
         assertTrue(xml.contains("<p class=\"signature\">This one"));
 
@@ -1247,6 +1247,15 @@ public class OOXMLParserTest extends TikaTest {
         assertContains("1.23456789012345E+15", xml);//16 digit number is treated as scientific notation
         assertContains("1.23456789012345E+15", xml);//16 digit formula, ditto
     }
+
+    @Test
+    public void testBoldHyperlink() throws Exception {
+        //TIKA-1255
+        String xml = getXML("testWORD_boldHyperlink.docx").xml;
+        xml = xml.replaceAll("\\s+", " ");
+        assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", xml);
+        assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold" , xml);
+    }
 }
 
 

http://git-wip-us.apache.org/repos/asf/tika/blob/80efc84b/tika-parsers/src/test/resources/test-documents/testWORD_boldHyperlink.doc
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_boldHyperlink.doc b/tika-parsers/src/test/resources/test-documents/testWORD_boldHyperlink.doc
new file mode 100644
index 0000000..293d00a
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_boldHyperlink.doc differ

http://git-wip-us.apache.org/repos/asf/tika/blob/80efc84b/tika-parsers/src/test/resources/test-documents/testWORD_boldHyperlink.docx
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_boldHyperlink.docx b/tika-parsers/src/test/resources/test-documents/testWORD_boldHyperlink.docx
new file mode 100644
index 0000000..fb23d10
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_boldHyperlink.docx differ