You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/07/15 19:33:51 UTC

[tika] branch master updated: TIKA-2899 -- prevent non-aligned tags in xhtml output, take 2; see 658656.rtf

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 03fa48a  TIKA-2899 -- prevent non-aligned tags in xhtml output, take 2; see 658656.rtf
03fa48a is described below

commit 03fa48a887262a72146e6227da4b5eff8c6f6024
Author: TALLISON <ta...@apache.org>
AuthorDate: Mon Jul 15 15:33:37 2019 -0400

    TIKA-2899 -- prevent non-aligned tags in xhtml output, take 2; see 658656.rtf
---
 .../org/apache/tika/parser/rtf/TextExtractor.java  | 24 +++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
index afbe273..e2733b2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
@@ -601,7 +601,12 @@ final class TextExtractor {
     }
 
     private void lazyStartParagraph() throws IOException, SAXException, TikaException {
-        if (!inParagraph) {
+
+        boolean localInParagraph = inParagraph;
+        if (paragraphStack.size() > 0 && paragraphStack.contains(P)) {
+            localInParagraph = true;
+        }
+        if (!localInParagraph) {
             // Ensure </i></b> order
             if (groupState.italic) {
                 end("i");
@@ -617,10 +622,10 @@ final class TextExtractor {
                 startList(groupState.list);
             }
             if (inList()) {
-                start("li");
+                start(LI);
                 pushParagraphTag(LI);
             } else {
-                start("p");
+                start(P);
                 pushParagraphTag(P);
             }
 
@@ -649,7 +654,7 @@ final class TextExtractor {
         if (!inParagraph) {
             lazyStartParagraph();
         }
-        if (inParagraph) {
+        if (inParagraph || paragraphStack.size() > 0) {
             if (groupState.italic) {
                 end("i");
                 groupState.italic = preserveStyles;
@@ -658,6 +663,7 @@ final class TextExtractor {
                 end("b");
                 groupState.bold = preserveStyles;
             }
+            boolean badTagAlignment = false;
             if (inList()) {
                 if (paragraphStack.size() > 0) {
                     String lastP = paragraphStack.pop();
@@ -665,6 +671,7 @@ final class TextExtractor {
                         end(LI);
                     } else {
                         pushParagraphTag(lastP);
+                        badTagAlignment = true;
                     }
                 } else {
                     //there should have been a starting li
@@ -676,10 +683,17 @@ final class TextExtractor {
                         end(P);
                     } else {
                         pushParagraphTag(lastP);
+                        badTagAlignment = true;
                     }
                 }
             }
-
+            //if there was a failure in tag alignment,
+            //dump all tags and start fresh.
+            if (badTagAlignment) {
+                while (paragraphStack.size() > 0) {
+                    end(paragraphStack.pop());
+                }
+            }
             if (preserveStyles && (groupState.bold || groupState.italic)) {
                 start(P);
                 pushParagraphTag(P);