You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/07/15 19:33:51 UTC
[tika] branch master updated: TIKA-2899 -- prevent non-aligned tags in xhtml output, take 2; see 658656.rtf
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 03fa48a TIKA-2899 -- prevent non-aligned tags in xhtml output, take 2; see 658656.rtf
03fa48a is described below
commit 03fa48a887262a72146e6227da4b5eff8c6f6024
Author: TALLISON <ta...@apache.org>
AuthorDate: Mon Jul 15 15:33:37 2019 -0400
TIKA-2899 -- prevent non-aligned tags in xhtml output, take 2; see 658656.rtf
---
.../org/apache/tika/parser/rtf/TextExtractor.java | 24 +++++++++++++++++-----
1 file changed, 19 insertions(+), 5 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
index afbe273..e2733b2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
@@ -601,7 +601,12 @@ final class TextExtractor {
}
private void lazyStartParagraph() throws IOException, SAXException, TikaException {
- if (!inParagraph) {
+
+ boolean localInParagraph = inParagraph;
+ if (paragraphStack.size() > 0 && paragraphStack.contains(P)) {
+ localInParagraph = true;
+ }
+ if (!localInParagraph) {
// Ensure </i></b> order
if (groupState.italic) {
end("i");
@@ -617,10 +622,10 @@ final class TextExtractor {
startList(groupState.list);
}
if (inList()) {
- start("li");
+ start(LI);
pushParagraphTag(LI);
} else {
- start("p");
+ start(P);
pushParagraphTag(P);
}
@@ -649,7 +654,7 @@ final class TextExtractor {
if (!inParagraph) {
lazyStartParagraph();
}
- if (inParagraph) {
+ if (inParagraph || paragraphStack.size() > 0) {
if (groupState.italic) {
end("i");
groupState.italic = preserveStyles;
@@ -658,6 +663,7 @@ final class TextExtractor {
end("b");
groupState.bold = preserveStyles;
}
+ boolean badTagAlignment = false;
if (inList()) {
if (paragraphStack.size() > 0) {
String lastP = paragraphStack.pop();
@@ -665,6 +671,7 @@ final class TextExtractor {
end(LI);
} else {
pushParagraphTag(lastP);
+ badTagAlignment = true;
}
} else {
//there should have been a starting li
@@ -676,10 +683,17 @@ final class TextExtractor {
end(P);
} else {
pushParagraphTag(lastP);
+ badTagAlignment = true;
}
}
}
-
+ //if there was a failure in tag alignment,
+ //dump all tags and start fresh.
+ if (badTagAlignment) {
+ while (paragraphStack.size() > 0) {
+ end(paragraphStack.pop());
+ }
+ }
if (preserveStyles && (groupState.bold || groupState.italic)) {
start(P);
pushParagraphTag(P);