You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/07/18 19:31:19 UTC

[tika] branch master updated: TIKA-2905 -- allow users to ignore list markup

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new d83b3fa  TIKA-2905 -- allow users to ignore list markup
d83b3fa is described below

commit d83b3fafc4ebc3e8337d4a77a3aa9632985831a8
Author: TALLISON <ta...@apache.org>
AuthorDate: Thu Jul 18 15:31:08 2019 -0400

    TIKA-2905 -- allow users to ignore list markup
---
 .../java/org/apache/tika/parser/rtf/RTFParser.java |  4 ++++
 .../org/apache/tika/parser/rtf/TextExtractor.java  | 20 ++++++++---------
 .../org/apache/tika/parser/rtf/RTFParserTest.java  | 14 +++++++++++-
 .../parser/rtf/ignoreListMarkup-tika-config.xml    | 26 ++++++++++++++++++++++
 4 files changed, 53 insertions(+), 11 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
index a553dc0..580f48d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
@@ -85,6 +85,9 @@ public class RTFParser extends AbstractParser {
     @Field
     private int memoryLimitInKb = EMB_OBJ_MAX_BYTES/1024;
 
+    @Field
+    private boolean ignoreListMarkup = false;
+
     public void parse(
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
@@ -95,6 +98,7 @@ public class RTFParser extends AbstractParser {
             XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(handler, metadata);
             RTFEmbObjHandler embObjHandler = new RTFEmbObjHandler(xhtmlHandler, metadata, context, getMemoryLimitInKb());
             final TextExtractor ert = new TextExtractor(xhtmlHandler, metadata, embObjHandler);
+            ert.setIgnoreListMarkup(ignoreListMarkup);
             ert.extract(stream);
         } catch (IOException e) {
             tagged.throwIfCauseOf(e);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
index 4758f2d..dfc0956 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
@@ -305,7 +305,7 @@ final class TextExtractor {
     private Map<Integer, ListDescriptor> currentListTable;
     private ListDescriptor currentList;
     private int listTableLevel = -1;
-    private boolean ignoreLists;
+    private boolean ignoreListMarkup;
     // Non-null if we've seen the url for a HYPERLINK but not yet
     // its text:
     private String pendingURL;
@@ -373,11 +373,11 @@ final class TextExtractor {
     }
 
     public boolean isIgnoringLists() {
-        return ignoreLists;
+        return ignoreListMarkup;
     }
 
-    public void setIgnoreLists(boolean ignore) {
-        this.ignoreLists = ignore;
+    public void setIgnoreListMarkup(boolean ignore) {
+        this.ignoreListMarkup = ignore;
     }
 
     // Push pending bytes or pending chars:
@@ -1038,7 +1038,7 @@ final class TextExtractor {
     }
 
     private boolean inList() {
-        return !ignoreLists && groupState.list != 0;
+        return !ignoreListMarkup && groupState.list != 0;
     }
 
     /**
@@ -1061,7 +1061,7 @@ final class TextExtractor {
      * @throws TikaException
      */
     private void endList(int listID) throws IOException, SAXException, TikaException {
-        if (!ignoreLists) {
+        if (!ignoreListMarkup) {
             String xl = isUnorderedList(listID) ? UL : OL;
             if (paragraphStack.size() > 0) {
                 String p = paragraphStack.pop();
@@ -1084,7 +1084,7 @@ final class TextExtractor {
      * @throws TikaException
      */
     private void startList(int listID) throws IOException, SAXException, TikaException {
-        if (!ignoreLists) {
+        if (!ignoreListMarkup) {
             String xl = isUnorderedList(listID) ? UL : OL;
             start(xl);
             pushParagraphTag(xl);
@@ -1243,9 +1243,9 @@ final class TextExtractor {
         } else if (equals("par")) {
             if (!ignored) {
                 endParagraph(true);
-            }
-            if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0))
-                pendingListEnd();
+                if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0))
+                    pendingListEnd();
+                }
             }
         } else if (equals("shptxt")) {
             pushText();
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
index 34c2208..9c48d56 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
@@ -51,7 +51,6 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
 import org.apache.tika.sax.BasicContentHandlerFactory;
-import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.RecursiveParserWrapperHandler;
 import org.apache.tika.sax.WriteOutContentHandler;
 import org.junit.Test;
@@ -353,6 +352,19 @@ public class RTFParserTest extends TikaTest {
     }
 
     @Test
+    public void testTurningOffList() throws Exception {
+        InputStream is = getClass().getResourceAsStream(
+                "/org/apache/tika/parser/rtf/ignoreListMarkup-tika-config.xml");
+        assertNotNull(is);
+        TikaConfig tikaConfig = new TikaConfig(is);
+        Parser p = new AutoDetectParser(tikaConfig);
+        String content = getXML("testRTFListMicrosoftWord.rtf", p).xml;
+        assertNotContained("<ol>", content);
+        assertNotContained("<ul>", content);
+        assertNotContained("<li>", content);
+    }
+
+    @Test
     public void testListLibreOffice() throws Exception {
         String content = getXML("testRTFListLibreOffice.rtf").xml;
         assertContains("<ol>\t<li>one</li>", content);
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/rtf/ignoreListMarkup-tika-config.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/rtf/ignoreListMarkup-tika-config.xml
new file mode 100644
index 0000000..528dbde
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/rtf/ignoreListMarkup-tika-config.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.rtf.RTFParser">
+            <params>
+                <param name="ignoreListMarkup" type="bool">true</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>