You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/07/18 18:37:01 UTC

[tika] 02/02: TIKA-2899 -- improve robustness of list handling in the RTFParser

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 620134b90b72632fee486ba3aa5b25ff6b271d22
Author: TALLISON <ta...@apache.org>
AuthorDate: Thu Jul 18 14:35:13 2019 -0400

    TIKA-2899 -- improve robustness of list handling in the RTFParser
---
 tika-core/src/test/java/org/apache/tika/TikaTest.java              | 7 +++++++
 .../src/main/java/org/apache/tika/parser/rtf/TextExtractor.java    | 4 +++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 0aaaf35..91e6dc7 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -29,6 +29,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.net.URISyntaxException;
 import java.net.URL;
+import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashSet;
@@ -241,6 +242,12 @@ public abstract class TikaTest {
         }
     }
 
+    protected List<Metadata> getRecursiveMetadata(Path p, boolean suppressException) throws Exception {
+        try (TikaInputStream tis = TikaInputStream.get(p)) {
+            return getRecursiveMetadata(tis, new ParseContext(), new Metadata(), suppressException);
+        }
+    }
+
     protected List<Metadata> getRecursiveMetadata(InputStream is, boolean suppressException) throws Exception {
         return getRecursiveMetadata(is, new ParseContext(), new Metadata(), suppressException);
     }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
index e2733b2..4758f2d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
@@ -865,7 +865,6 @@ final class TextExtractor {
 
     // Handle control word that takes a parameter:
     private void processControlWord(int param, PushbackInputStream in) throws IOException, SAXException, TikaException {
-
         // TODO: afN?  (associated font number)
 
         // TODO: do these alter text output...?
@@ -1245,6 +1244,9 @@ final class TextExtractor {
             if (!ignored) {
                 endParagraph(true);
             }
+            if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0))
+                pendingListEnd();
+            }
         } else if (equals("shptxt")) {
             pushText();
             // Text inside a shape