You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@tika.apache.org by "ASF GitHub Bot (JIRA)" <ji...@apache.org> on 2017/11/24 01:16:01 UTC

[jira] [Commented] (TIKA-2347) Underlined text is not decorated as such when extracting from word documents

    [ https://issues.apache.org/jira/browse/TIKA-2347?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16264860#comment-16264860 ] 

ASF GitHub Bot commented on TIKA-2347:
--------------------------------------

dameikle closed pull request #173: Fix for TIKA-2347 Adds underline extraction from word documents
URL: https://github.com/apache/tika/pull/173
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index 31809250d..90fbd6c37 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -84,6 +84,7 @@
     private boolean curStrikeThrough;
     private boolean curBold;
     private boolean curItalic;
+    private boolean curUnderline;
 
     private final Metadata metadata;
 
@@ -372,20 +373,8 @@ private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocu
             }
         }
 
-        // Close any still open style tags
-        if (curStrikeThrough) {
-            xhtml.endElement("s");
-            curStrikeThrough = false;
-        }
-        if (curItalic) {
-            xhtml.endElement("i");
-            curItalic = false;
-        }
-        if (curBold) {
-            xhtml.endElement("b");
-            curBold = false;
-        }
-
+        closeStyleElements(false, xhtml);
+        
         xhtml.endElement(tas.getTag());
 
         return 0;
@@ -399,7 +388,11 @@ private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLConte
 
         if (!skipStyling) {
             if (cr.isBold() != curBold) {
-                // Enforce nesting -- must close s and i tags
+                // Enforce nesting -- must close u, s and i tags
+                if (curUnderline) {
+                    xhtml.endElement("u");
+                    curUnderline = false;
+                }
                 if (curStrikeThrough) {
                     xhtml.endElement("s");
                     curStrikeThrough = false;
@@ -417,7 +410,11 @@ private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLConte
             }
 
             if (cr.isItalic() != curItalic) {
-                // Enforce nesting -- must close s tag
+                // Enforce nesting -- must close u and s tag
+            	if (curUnderline) {
+            		xhtml.endElement("u");
+            		curUnderline = false;
+            	}
                 if (curStrikeThrough) {
                     xhtml.endElement("s");
                     curStrikeThrough = false;
@@ -431,6 +428,11 @@ private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLConte
             }
 
             if (cr.isStrikeThrough() != curStrikeThrough) {
+                // Enforce nesting -- must close u tag
+                if (curUnderline) {
+                    xhtml.endElement("u");
+                    curUnderline = false;
+                }
                 if (cr.isStrikeThrough()) {
                     xhtml.startElement("s");
                 } else {
@@ -438,6 +440,16 @@ private void handleCharacterRun(CharacterRun cr, boolean skipStyling, XHTMLConte
                 }
                 curStrikeThrough = cr.isStrikeThrough();
             }
+            
+            boolean isUnderline = cr.getUnderlineCode() != 0;
+            if (isUnderline != curUnderline) {
+                if (isUnderline) {
+                    xhtml.startElement("u");
+                } else {
+                    xhtml.endElement("u");
+                }
+                curUnderline = isUnderline;
+            }
         }
 
         // Clean up the text
@@ -546,6 +558,10 @@ private void closeStyleElements(boolean skipStyling, XHTMLContentHandler xhtml)
         if (skipStyling) {
             return;
         }
+        if (curUnderline) {
+        	xhtml.endElement("u");
+        	curUnderline = false;
+        }
         if (curStrikeThrough) {
             xhtml.endElement("s");
             curStrikeThrough = false;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
index 39a72c6a2..23a1aedac 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java
@@ -33,6 +33,7 @@
 import org.apache.poi.xwpf.usermodel.ICell;
 import org.apache.poi.xwpf.usermodel.IRunElement;
 import org.apache.poi.xwpf.usermodel.ISDTContent;
+import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
 import org.apache.poi.xwpf.usermodel.XWPFHeaderFooter;
 import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
@@ -224,7 +225,7 @@ private void extractParagraph(XWPFParagraph paragraph, XWPFListManager listManag
             xhtml.endElement("a");
         }
 
-        TmpFormatting fmtg = new TmpFormatting(false, false);
+        TmpFormatting fmtg = new TmpFormatting(false, false, false);
 
         //hyperlinks may or may not have hyperlink ids
         String lastHyperlinkId = null;
@@ -328,6 +329,10 @@ private TmpFormatting closeStyleTags(XHTMLContentHandler xhtml,
             xhtml.endElement("b");
             fmtg.setBold(false);
         }
+        if (fmtg.isUnderline()) {
+        	xhtml.endElement("u");
+        	fmtg.setUnderline(false);
+        }
         return fmtg;
     }
 
@@ -336,6 +341,10 @@ private TmpFormatting processRun(XWPFRun run, XWPFParagraph paragraph,
             throws SAXException, XmlException, IOException {
         // True if we are currently in the named style tag:
         if (run.isBold() != tfmtg.isBold()) {
+            if (tfmtg.isUnderline()) {
+                xhtml.endElement("u");
+                tfmtg.setUnderline(false);
+            }
             if (tfmtg.isItalic()) {
                 xhtml.endElement("i");
                 tfmtg.setItalic(false);
@@ -349,6 +358,10 @@ private TmpFormatting processRun(XWPFRun run, XWPFParagraph paragraph,
         }
 
         if (run.isItalic() != tfmtg.isItalic()) {
+            if (tfmtg.isUnderline()) {
+                xhtml.endElement("u");
+                tfmtg.setUnderline(false);
+            }
             if (run.isItalic()) {
                 xhtml.startElement("i");
             } else {
@@ -356,6 +369,16 @@ private TmpFormatting processRun(XWPFRun run, XWPFParagraph paragraph,
             }
             tfmtg.setItalic(run.isItalic());
         }
+        
+        boolean isUnderline = run.getUnderline() != UnderlinePatterns.NONE;
+        if (isUnderline != tfmtg.isUnderline()) {
+            if (isUnderline) {
+                xhtml.startElement("u");
+            } else {
+                xhtml.endElement("u");
+            }
+            tfmtg.setUnderline(isUnderline);
+        }
 
         xhtml.characters(run.toString());
 
@@ -484,10 +507,12 @@ private void addRelatedParts(PackagePart documentPart, List<PackagePart> related
     private class TmpFormatting {
         private boolean bold = false;
         private boolean italic = false;
+        private boolean underline = false;
 
-        private TmpFormatting(boolean bold, boolean italic) {
+        private TmpFormatting(boolean bold, boolean italic, boolean underline) {
             this.bold = bold;
             this.italic = italic;
+            this.underline = underline;
         }
 
         public boolean isBold() {
@@ -505,6 +530,15 @@ public boolean isItalic() {
         public void setItalic(boolean italic) {
             this.italic = italic;
         }
+        
+
+        public boolean isUnderline() {
+            return underline;
+        }
+
+        public void setUnderline(boolean underline) {
+            this.underline = underline;
+        }
 
     }
 
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index df6d807fc..7938c3b1c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -29,6 +29,7 @@
 import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
 import org.apache.tika.TikaTest;
+
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
@@ -109,7 +110,7 @@ public void testWordHTML() throws Exception {
         assertTrue(xml.contains("<td>"));
         // TODO - Check for the nested table
         // Links
-        assertTrue(xml.contains("<a href=\"http://tika.apache.org/\">Tika</a>"));
+        assertTrue(xml.contains("<a href=\"http://tika.apache.org/\"><u>Tika</u></a>"));
         // Paragraphs with other styles
         assertTrue(xml.contains("<p class=\"signature\">This one"));
 
@@ -195,6 +196,17 @@ public void testWord6Parser() throws Exception {
             assertContains("The quick brown fox jumps over the lazy dog", handler.toString());
         }
     }
+    
+    @Test
+    public void testTextDecoration() throws Exception {
+      XMLResult result = getXML("testWORD_various.doc");
+      String xml = result.xml;
+
+      assertTrue(xml.contains("<b>Bold</b>"));
+      assertTrue(xml.contains("<i>italic</i>"));
+      assertTrue(xml.contains("<u>underline</u>"));
+
+    }
 
     @Test
     public void testVarious() throws Exception {
@@ -361,15 +373,15 @@ public void testHeaderHyperlinks() throws Exception {
         assertFalse(xml.contains("HYPERLINK"));
 
         // Check we do have the link
-        assertContains("<a href=\"http://tw-systemhaus.de\">http:", xml);
+        assertContains("<a href=\"http://tw-systemhaus.de\"><u>http:", xml);
 
         // Check we do have the email
-        assertContains("<a href=\"mailto:ab@example.com\">ab@", xml);
+        assertContains("<a href=\"mailto:ab@example.com\"><u>ab@", xml);
     }
 
     @Test
     public void testControlCharacter() throws Exception {
-        assertContains("1. Introduzione<b> </b></a> </p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " "));
+        assertContains("<u>1.</u> <u>Introduzione</u><b> </b></a><u> </u></p>", getXML("testControlCharacters.doc").xml.replaceAll("\\s+", " "));
     }
 
     @Test
@@ -383,7 +395,7 @@ public void testParagraphsAfterTables() throws Exception {
                 "application/msword",
                 metadata.get(Metadata.CONTENT_TYPE));
 
-        assertContains("<p>1. Organisering av vakten:</p>", xml);
+        assertContains("<p><u>1. Organisering av vakten:</u></p>", xml);
 
     }
 
@@ -521,8 +533,8 @@ public void testBoldHyperlink() throws Exception {
         //TIKA-1255
         String xml = getXML("testWORD_boldHyperlink.doc").xml;
         xml = xml.replaceAll("\\s+", " ");
-        assertContains("<a href=\"http://tika.apache.org/\">hyper <b>link</b></a>", xml);
-        assertContains("<a href=\"http://tika.apache.org/\"><b>hyper</b> link</a>; bold" , xml);
+        assertContains("<a href=\"http://tika.apache.org/\"><u>hyper </u><b><u>link</u></b></a>", xml);
+        assertContains("<a href=\"http://tika.apache.org/\"><b><u>hyper</u></b><u> link</u></a>; bold" , xml);
     }
 
     @Test
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index 642054536..06c5a1e0b 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -579,6 +579,17 @@ public void testNullHeaders() throws Exception {
             assertEquals("Should have found some text", false, handler.toString().isEmpty());
         }
     }
+    
+    @Test
+    public void testTextDecoration() throws Exception {
+      XMLResult result = getXML("testWORD_various.docx");
+      String xml = result.xml;
+
+      assertTrue(xml.contains("<b>Bold</b>"));
+      assertTrue(xml.contains("<i>italic</i>"));
+      assertTrue(xml.contains("<u>underline</u>"));
+
+    }
 
     @Test
     public void testVarious() throws Exception {


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


> Underlined text is not decorated as such when extracting from word documents
> ----------------------------------------------------------------------------
>
>                 Key: TIKA-2347
>                 URL: https://issues.apache.org/jira/browse/TIKA-2347
>             Project: Tika
>          Issue Type: Bug
>          Components: parser
>    Affects Versions: 2.0, 1.14
>            Reporter: Stuart Hendren
>            Assignee: Dave Meikle
>             Fix For: 1.17
>
>
> When extracting from doc and docx files, bold and italic text decoration is extracted; however, underlining is not.  This can be demonstrated in WordParserTest or OOXMLParserTest (change the file to docx) with the following test case.
> {code:title=WordParserTest.java|borderStyle=solid}
>     @Test
>     public void testTextDecoration() throws Exception {
>       XMLResult result = getXML("testWORD_various.doc");
>       String xml = result.xml;
>       assertTrue(xml.contains("<b>Bold</b>"));
>       assertTrue(xml.contains("<i>italic</i>"));
>       assertTrue(xml.contains("<u>underline</u>"));
>     }
> {code}



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)