You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/07/18 18:37:01 UTC
[tika] 02/02: TIKA-2899 -- improve robustness of list handling in the RTFParser
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 620134b90b72632fee486ba3aa5b25ff6b271d22
Author: TALLISON <ta...@apache.org>
AuthorDate: Thu Jul 18 14:35:13 2019 -0400
TIKA-2899 -- improve robustness of list handling in the RTFParser
---
tika-core/src/test/java/org/apache/tika/TikaTest.java | 7 +++++++
.../src/main/java/org/apache/tika/parser/rtf/TextExtractor.java | 4 +++-
2 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 0aaaf35..91e6dc7 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -29,6 +29,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.net.URL;
+import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
@@ -241,6 +242,12 @@ public abstract class TikaTest {
}
}
+ protected List<Metadata> getRecursiveMetadata(Path p, boolean suppressException) throws Exception {
+ try (TikaInputStream tis = TikaInputStream.get(p)) {
+ return getRecursiveMetadata(tis, new ParseContext(), new Metadata(), suppressException);
+ }
+ }
+
protected List<Metadata> getRecursiveMetadata(InputStream is, boolean suppressException) throws Exception {
return getRecursiveMetadata(is, new ParseContext(), new Metadata(), suppressException);
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
index e2733b2..4758f2d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
@@ -865,7 +865,6 @@ final class TextExtractor {
// Handle control word that takes a parameter:
private void processControlWord(int param, PushbackInputStream in) throws IOException, SAXException, TikaException {
-
// TODO: afN? (associated font number)
// TODO: do these alter text output...?
@@ -1245,6 +1244,9 @@ final class TextExtractor {
if (!ignored) {
endParagraph(true);
}
+ if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0))
+ pendingListEnd();
+ }
} else if (equals("shptxt")) {
pushText();
// Text inside a shape