You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/02/15 22:56:10 UTC

[tika] branch main updated: TIKA-3972 -- fix closing <a> elements when there are also style elements

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 4f599dfa3 TIKA-3972 -- fix closing <a> elements when there are also style elements
4f599dfa3 is described below

commit 4f599dfa3d72c724a846356bf867db45f221170a
Author: tballison <ta...@apache.org>
AuthorDate: Wed Feb 15 17:56:02 2023 -0500

    TIKA-3972 -- fix closing <a> elements when there are also style elements
---
 CHANGES.txt                                        |  3 +
 .../tika/parser/microsoft/rtf/TextExtractor.java   | 77 ++++++++++------------
 .../tika/parser/microsoft/rtf/RTFParserTest.java   |  8 +++
 .../test-documents/testRTFHyperlinkAndStyles.rtf   |  1 +
 4 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index af11a2430..5c48a2efd 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 2.7.1 - ???
 
+   * Fix bug in closing <a> elements in the presence of <b> elements
+     in RTF files (TIKA-3972).
+
    * Improve extraction of embedded file names in .docx (TIKA-3968).
 
    * Normalize author, title, subject and description to their Dublin Core
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
index 9388b8461..28ca76299 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
@@ -605,13 +605,7 @@ final class TextExtractor {
             localInParagraph = true;
         }
         if (!localInParagraph) {
-            // Ensure </i></b> order
-            if (groupState.italic) {
-                end("i");
-            }
-            if (groupState.bold) {
-                end("b");
-            }
+            endStyles(groupState);
             if (pendingListEnd != 0 && groupState.list != pendingListEnd) {
                 endList(pendingListEnd);
                 pendingListEnd = 0;
@@ -626,14 +620,7 @@ final class TextExtractor {
                 start(P);
                 pushParagraphTag(P);
             }
-
-            // Ensure <b><i> order
-            if (groupState.bold) {
-                start("b");
-            }
-            if (groupState.italic) {
-                start("i");
-            }
+            startStyles(groupState);
             inParagraph = true;
         }
     }
@@ -696,12 +683,7 @@ final class TextExtractor {
             if (preserveStyles && (groupState.bold || groupState.italic)) {
                 start(P);
                 pushParagraphTag(P);
-                if (groupState.bold) {
-                    start("b");
-                }
-                if (groupState.italic) {
-                    start("i");
-                }
+                startStyles(groupState);
                 inParagraph = true;
             } else {
                 inParagraph = false;
@@ -1199,10 +1181,7 @@ final class TextExtractor {
                         end("i");
                     }
                     groupState.bold = true;
-                    start("b");
-                    if (groupState.italic) {
-                        start("i");
-                    }
+                    startStyles(groupState);
                 }
             } else if (equals("i")) {
                 if (!groupState.italic) {
@@ -1218,14 +1197,7 @@ final class TextExtractor {
         if (equals("pard")) {
             // Reset styles
             pushText();
-            if (groupState.italic) {
-                end("i");
-                groupState.italic = false;
-            }
-            if (groupState.bold) {
-                end("b");
-                groupState.bold = false;
-            }
+            endStyles(groupState);
             if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0))
                 pendingListEnd();
             }
@@ -1233,14 +1205,7 @@ final class TextExtractor {
             if (groupState.italic || groupState.bold) {
                 // Reset styles
                 pushText();
-                if (groupState.italic) {
-                    end("i");
-                    groupState.italic = false;
-                }
-                if (groupState.bold) {
-                    end("b");
-                    groupState.bold = false;
-                }
+                endStyles(groupState);
             }
         } else if (equals("par")) {
             if (!ignored) {
@@ -1399,6 +1364,36 @@ final class TextExtractor {
         }
     }
 
+    private void startStyles(GroupState groupState)
+            throws TikaException, IOException, SAXException {
+        //don't change styles within a <a > element
+        if (fieldState != 0) {
+            return;
+        }
+        // Ensure <b><i> order
+        if (groupState.bold) {
+            start("b");
+        }
+        if (groupState.italic) {
+            start("i");
+        }
+    }
+    private void endStyles(GroupState groupState) throws TikaException, IOException, SAXException {
+        //don't change styles within a <a > element
+        if (fieldState != 0) {
+            return;
+        }
+        // Ensure </i></b> order
+        if (groupState.italic) {
+            end("i");
+            groupState.italic = false;
+        }
+        if (groupState.bold) {
+            end("b");
+            groupState.bold = false;
+        }
+    }
+
     // Push new GroupState
     private void processGroupStart(PushbackInputStream in) throws IOException {
         ansiSkip = 0;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
index e9b36fdc5..1fd53fe34 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
@@ -311,6 +311,14 @@ public class RTFParserTest extends TikaTest {
         assertEquals(-1, content.indexOf("<p>\t\t</p>"));
     }
 
+    @Test
+    public void testHyperLinkAndStyles() throws Exception {
+        String content = getXML("testRTFHyperlinkAndStyles.rtf").xml;
+        String needle = "<b><i>DIP</i>: " +
+                "<a href=\"..\\\\..\\\\SAUCES\\\\Dips\\\\Dip, Caesar.doc\">Dip, Caesar.doc</a></b>";
+        assertContains(needle, content);
+    }
+
     @Test
     public void testIgnoredControlWord() throws Exception {
         assertContains("<p>The quick brown fox jumps over the lazy dog</p>",
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRTFHyperlinkAndStyles.rtf b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRTFHyperlinkAndStyles.rtf
new file mode 100644
index 000000000..c7a82a4c8
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRTFHyperlinkAndStyles.rtf
@@ -0,0 +1 @@
+{\rtf1\ansi\deff0\uc1\ansicpg1252\deftab720{\fonttbl{\f0\fnil\fcharset1 Arial;}{\f1\fnil\fcharset1 Times New Roman;}{\f2\fnil\fcharset1 WingDings;}}{\colortbl\red0\green0\blue0;\red255\green0\blue0;\red0\green128\blue0;\red0\green0\blue255;\red255\green255\blue0;\red255\green0\blue255;\red128\green0\blue128;\red128\green0\blue0;\red0\green255\blue0;\red0\green255\blue255;\red0\green128\blue128;\red0\green0\blue128;\red255\green255\blue255;\red192\green192\blue192;\red128\green128\blue128 [...]