You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/02/15 22:56:10 UTC
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 4f599dfa3 TIKA-3972 -- fix closing <a> elements when there are also style elements
4f599dfa3 is described below
commit 4f599dfa3d72c724a846356bf867db45f221170a
Author: tballison <ta...@apache.org>
AuthorDate: Wed Feb 15 17:56:02 2023 -0500
TIKA-3972 -- fix closing <a> elements when there are also style elements
---
CHANGES.txt | 3 +
.../tika/parser/microsoft/rtf/TextExtractor.java | 77 ++++++++++------------
.../tika/parser/microsoft/rtf/RTFParserTest.java | 8 +++
.../test-documents/testRTFHyperlinkAndStyles.rtf | 1 +
4 files changed, 48 insertions(+), 41 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index af11a2430..5c48a2efd 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 2.7.1 - ???
+ * Fix bug in closing <a> elements in the presence of style elements
+ (<b>, <i>) in RTF files (TIKA-3972).
+
* Improve extraction of embedded file names in .docx (TIKA-3968).
* Normalize author, title, subject and description to their Dublin Core
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
index 9388b8461..28ca76299 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
@@ -605,13 +605,7 @@ final class TextExtractor {
localInParagraph = true;
}
if (!localInParagraph) {
- // Ensure </i></b> order
- if (groupState.italic) {
- end("i");
- }
- if (groupState.bold) {
- end("b");
- }
+ endStyles(groupState);
if (pendingListEnd != 0 && groupState.list != pendingListEnd) {
endList(pendingListEnd);
pendingListEnd = 0;
@@ -626,14 +620,7 @@ final class TextExtractor {
start(P);
pushParagraphTag(P);
}
-
- // Ensure <b><i> order
- if (groupState.bold) {
- start("b");
- }
- if (groupState.italic) {
- start("i");
- }
+ startStyles(groupState);
inParagraph = true;
}
}
@@ -696,12 +683,7 @@ final class TextExtractor {
if (preserveStyles && (groupState.bold || groupState.italic)) {
start(P);
pushParagraphTag(P);
- if (groupState.bold) {
- start("b");
- }
- if (groupState.italic) {
- start("i");
- }
+ startStyles(groupState);
inParagraph = true;
} else {
inParagraph = false;
@@ -1199,10 +1181,7 @@ final class TextExtractor {
end("i");
}
groupState.bold = true;
- start("b");
- if (groupState.italic) {
- start("i");
- }
+ startStyles(groupState);
}
} else if (equals("i")) {
if (!groupState.italic) {
@@ -1218,14 +1197,7 @@ final class TextExtractor {
if (equals("pard")) {
// Reset styles
pushText();
- if (groupState.italic) {
- end("i");
- groupState.italic = false;
- }
- if (groupState.bold) {
- end("b");
- groupState.bold = false;
- }
+ endStyles(groupState);
if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0))
pendingListEnd();
}
@@ -1233,14 +1205,7 @@ final class TextExtractor {
if (groupState.italic || groupState.bold) {
// Reset styles
pushText();
- if (groupState.italic) {
- end("i");
- groupState.italic = false;
- }
- if (groupState.bold) {
- end("b");
- groupState.bold = false;
- }
+ endStyles(groupState);
}
} else if (equals("par")) {
if (!ignored) {
@@ -1399,6 +1364,36 @@ final class TextExtractor {
}
}
+ private void startStyles(GroupState groupState)
+ throws TikaException, IOException, SAXException {
+ // don't change styles within an <a> element
+ if (fieldState != 0) {
+ return;
+ }
+ // Ensure <b><i> order
+ if (groupState.bold) {
+ start("b");
+ }
+ if (groupState.italic) {
+ start("i");
+ }
+ }
+ private void endStyles(GroupState groupState) throws TikaException, IOException, SAXException {
+ // don't change styles within an <a> element
+ if (fieldState != 0) {
+ return;
+ }
+ // Ensure </i></b> order; the flag on groupState is also cleared for each style that is closed
+ if (groupState.italic) {
+ end("i");
+ groupState.italic = false;
+ }
+ if (groupState.bold) {
+ end("b");
+ groupState.bold = false;
+ }
+ }
+
// Push new GroupState
private void processGroupStart(PushbackInputStream in) throws IOException {
ansiSkip = 0;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
index e9b36fdc5..1fd53fe34 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
@@ -311,6 +311,14 @@ public class RTFParserTest extends TikaTest {
assertEquals(-1, content.indexOf("<p>\t\t</p>"));
}
+ @Test
+ public void testHyperLinkAndStyles() throws Exception {
+ String content = getXML("testRTFHyperlinkAndStyles.rtf").xml;
+ String needle = "<b><i>DIP</i>: " +
+ "<a href=\"..\\\\..\\\\SAUCES\\\\Dips\\\\Dip, Caesar.doc\">Dip, Caesar.doc</a></b>";
+ assertContains(needle, content);
+ }
+
@Test
public void testIgnoredControlWord() throws Exception {
assertContains("<p>The quick brown fox jumps over the lazy dog</p>",
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRTFHyperlinkAndStyles.rtf b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRTFHyperlinkAndStyles.rtf
new file mode 100644
index 000000000..c7a82a4c8
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testRTFHyperlinkAndStyles.rtf
@@ -0,0 +1 @@
+{\rtf1\ansi\deff0\uc1\ansicpg1252\deftab720{\fonttbl{\f0\fnil\fcharset1 Arial;}{\f1\fnil\fcharset1 Times New Roman;}{\f2\fnil\fcharset1 WingDings;}}{\colortbl\red0\green0\blue0;\red255\green0\blue0;\red0\green128\blue0;\red0\green0\blue255;\red255\green255\blue0;\red255\green0\blue255;\red128\green0\blue128;\red128\green0\blue0;\red0\green255\blue0;\red0\green255\blue255;\red0\green128\blue128;\red0\green0\blue128;\red255\green255\blue255;\red192\green192\blue192;\red128\green128\blue128 [...]