You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/07/18 19:31:19 UTC
[tika] branch master updated: TIKA-2905 -- allow users to ignore list markup
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new d83b3fa TIKA-2905 -- allow users to ignore list markup
d83b3fa is described below
commit d83b3fafc4ebc3e8337d4a77a3aa9632985831a8
Author: TALLISON <ta...@apache.org>
AuthorDate: Thu Jul 18 15:31:08 2019 -0400
TIKA-2905 -- allow users to ignore list markup
---
.../java/org/apache/tika/parser/rtf/RTFParser.java | 4 ++++
.../org/apache/tika/parser/rtf/TextExtractor.java | 20 ++++++++---------
.../org/apache/tika/parser/rtf/RTFParserTest.java | 14 +++++++++++-
.../parser/rtf/ignoreListMarkup-tika-config.xml | 26 ++++++++++++++++++++++
4 files changed, 53 insertions(+), 11 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
index a553dc0..580f48d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
@@ -85,6 +85,9 @@ public class RTFParser extends AbstractParser {
@Field
private int memoryLimitInKb = EMB_OBJ_MAX_BYTES/1024;
+ @Field
+ private boolean ignoreListMarkup = false;
+
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
@@ -95,6 +98,7 @@ public class RTFParser extends AbstractParser {
XHTMLContentHandler xhtmlHandler = new XHTMLContentHandler(handler, metadata);
RTFEmbObjHandler embObjHandler = new RTFEmbObjHandler(xhtmlHandler, metadata, context, getMemoryLimitInKb());
final TextExtractor ert = new TextExtractor(xhtmlHandler, metadata, embObjHandler);
+ ert.setIgnoreListMarkup(ignoreListMarkup);
ert.extract(stream);
} catch (IOException e) {
tagged.throwIfCauseOf(e);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
index 4758f2d..dfc0956 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
@@ -305,7 +305,7 @@ final class TextExtractor {
private Map<Integer, ListDescriptor> currentListTable;
private ListDescriptor currentList;
private int listTableLevel = -1;
- private boolean ignoreLists;
+ private boolean ignoreListMarkup;
// Non-null if we've seen the url for a HYPERLINK but not yet
// its text:
private String pendingURL;
@@ -373,11 +373,11 @@ final class TextExtractor {
}
public boolean isIgnoringLists() {
- return ignoreLists;
+ return ignoreListMarkup;
}
- public void setIgnoreLists(boolean ignore) {
- this.ignoreLists = ignore;
+ public void setIgnoreListMarkup(boolean ignore) {
+ this.ignoreListMarkup = ignore;
}
// Push pending bytes or pending chars:
@@ -1038,7 +1038,7 @@ final class TextExtractor {
}
private boolean inList() {
- return !ignoreLists && groupState.list != 0;
+ return !ignoreListMarkup && groupState.list != 0;
}
/**
@@ -1061,7 +1061,7 @@ final class TextExtractor {
* @throws TikaException
*/
private void endList(int listID) throws IOException, SAXException, TikaException {
- if (!ignoreLists) {
+ if (!ignoreListMarkup) {
String xl = isUnorderedList(listID) ? UL : OL;
if (paragraphStack.size() > 0) {
String p = paragraphStack.pop();
@@ -1084,7 +1084,7 @@ final class TextExtractor {
* @throws TikaException
*/
private void startList(int listID) throws IOException, SAXException, TikaException {
- if (!ignoreLists) {
+ if (!ignoreListMarkup) {
String xl = isUnorderedList(listID) ? UL : OL;
start(xl);
pushParagraphTag(xl);
@@ -1243,9 +1243,9 @@ final class TextExtractor {
} else if (equals("par")) {
if (!ignored) {
endParagraph(true);
- }
- if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0))
- pendingListEnd();
+ if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0))
+ pendingListEnd();
+ }
}
} else if (equals("shptxt")) {
pushText();
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
index 34c2208..9c48d56 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
@@ -51,7 +51,6 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.apache.tika.sax.BasicContentHandlerFactory;
-import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.junit.Test;
@@ -353,6 +352,19 @@ public class RTFParserTest extends TikaTest {
}
@Test
+ public void testTurningOffList() throws Exception {
+ InputStream is = getClass().getResourceAsStream(
+ "/org/apache/tika/parser/rtf/ignoreListMarkup-tika-config.xml");
+ assertNotNull(is);
+ TikaConfig tikaConfig = new TikaConfig(is);
+ Parser p = new AutoDetectParser(tikaConfig);
+ String content = getXML("testRTFListMicrosoftWord.rtf", p).xml;
+ assertNotContained("<ol>", content);
+ assertNotContained("<ul>", content);
+ assertNotContained("<li>", content);
+ }
+
+ @Test
public void testListLibreOffice() throws Exception {
String content = getXML("testRTFListLibreOffice.rtf").xml;
assertContains("<ol>\t<li>one</li>", content);
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/rtf/ignoreListMarkup-tika-config.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/rtf/ignoreListMarkup-tika-config.xml
new file mode 100644
index 0000000..528dbde
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/rtf/ignoreListMarkup-tika-config.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.rtf.RTFParser">
+ <params>
+ <param name="ignoreListMarkup" type="bool">true</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>