You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/05/29 15:13:37 UTC
[tika] branch master updated (0dc4451 -> f6644c5)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git.
from 0dc4451 convert miredot repo to https://
new 2d6870b TIKA-2878 -- update vulnerable jackson version
new c240c56 swap http -> https
new f6644c5 TIKA-2883 -- improve recognition of leaving the RTF header
The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
Summary of changes:
pom.xml | 4 ++--
tika-parent/pom.xml | 2 +-
.../org/apache/tika/parser/rtf/TextExtractor.java | 15 +++++++++++++--
.../org/apache/tika/parser/rtf/RTFParserTest.java | 21 +++++++++++++++++++++
.../tika/parser/ner/opennlp/ModelGetter.groovy | 2 +-
.../resources/test-documents/testRTFTIKA_1713.rtf | 1 +
.../resources/test-documents/testRTFTIKA_2150.rtf | 6 ++++++
.../resources/test-documents/testRTFTIKA_2500.rtf | 10 ++++++++++
.../resources/test-documents/testRTFTIKA_2883.rtf | Bin 0 -> 1526 bytes
9 files changed, 55 insertions(+), 6 deletions(-)
create mode 100644 tika-parsers/src/test/resources/test-documents/testRTFTIKA_1713.rtf
create mode 100644 tika-parsers/src/test/resources/test-documents/testRTFTIKA_2150.rtf
create mode 100644 tika-parsers/src/test/resources/test-documents/testRTFTIKA_2500.rtf
create mode 100644 tika-parsers/src/test/resources/test-documents/testRTFTIKA_2883.rtf
[tika] 02/03: swap http -> https
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit c240c56afe3b168aa9e56634671592fecfa79960
Author: TALLISON <ta...@apache.org>
AuthorDate: Wed May 29 11:11:05 2019 -0400
swap http -> https
---
pom.xml | 4 ++--
.../resources/org/apache/tika/parser/ner/opennlp/ModelGetter.groovy | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/pom.xml b/pom.xml
index f1fa8c1..5afe327 100644
--- a/pom.xml
+++ b/pom.xml
@@ -32,7 +32,7 @@
<artifactId>tika</artifactId>
<packaging>pom</packaging>
<name>Apache Tika</name>
- <url>http://tika.apache.org</url>
+ <url>https://tika.apache.org</url>
<modules>
<module>tika-parent</module>
@@ -192,7 +192,7 @@ least three +1 Tika PMC votes are cast.
</description>
<organization>
<name>The Apache Software Foundation</name>
- <url>http://www.apache.org</url>
+ <url>https://www.apache.org</url>
</organization>
<issueManagement>
<system>JIRA</system>
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/ModelGetter.groovy b/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/ModelGetter.groovy
index 7560ee7..dcdfb1f 100644
--- a/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/ModelGetter.groovy
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/ner/opennlp/ModelGetter.groovy
@@ -95,7 +95,7 @@ def downloadFile(String urlStr, File file) {
}
}
-def urlPrefix = "http://opennlp.sourceforge.net/models-1.5"
+def urlPrefix = "https://opennlp.sourceforge.net/models-1.5"
def prefixPath = "src/test/resources/org/apache/tika/parser/ner/opennlp/"
def ageUrlPrefix = "https://raw.githubusercontent.com/USCDataScience/AgePredictor/master/model"
def agePrefixPath = "src/test/resources/org/apache/tika/parser/recognition/"
[tika] 01/03: TIKA-2878 -- update vulnerable jackson version
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 2d6870bb56ca03a699e64a96310ce8558130a906
Author: TALLISON <ta...@apache.org>
AuthorDate: Wed May 29 11:10:08 2019 -0400
TIKA-2878 -- update vulnerable jackson version
---
tika-parent/pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 198062a..4dbc851 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -344,7 +344,7 @@
<cxf.version>3.3.2</cxf.version>
<slf4j.version>1.7.26</slf4j.version>
- <jackson.version>2.9.8</jackson.version>
+ <jackson.version>2.9.9</jackson.version>
<!-- when this is next upgraded, see if we can get rid of
javax.activation dependency in tika-server -->
<jaxb.version>2.3.2</jaxb.version>
[tika] 03/03: TIKA-2883 -- improve recognition of leaving the RTF header
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit f6644c577416c20ab8827ac857d75cf6874e093d
Author: TALLISON <ta...@apache.org>
AuthorDate: Wed May 29 11:13:23 2019 -0400
TIKA-2883 -- improve recognition of leaving the RTF header
---
.../org/apache/tika/parser/rtf/TextExtractor.java | 15 +++++++++++++--
.../org/apache/tika/parser/rtf/RTFParserTest.java | 21 +++++++++++++++++++++
.../resources/test-documents/testRTFTIKA_1713.rtf | 1 +
.../resources/test-documents/testRTFTIKA_2150.rtf | 6 ++++++
.../resources/test-documents/testRTFTIKA_2500.rtf | 10 ++++++++++
.../resources/test-documents/testRTFTIKA_2883.rtf | Bin 0 -> 1526 bytes
6 files changed, 51 insertions(+), 2 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
index bc194dd..06c89b8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
@@ -282,7 +282,9 @@ final class TextExtractor {
// immediately open the top group (start with {):
private GroupState groupState = new GroupState();
private boolean inHeader = true;
- private int fontTableState;
+ //0 not yet in font table, 1 in font table, 2 have processed font table
+ private int fontTableState = 0;
+ //depth at which the font table started
private int fontTableDepth;
// Non null if we are processing metadata (title,
// keywords, etc.) inside the info group:
@@ -865,6 +867,12 @@ final class TextExtractor {
}
}
}
+ //if you've already seen the font table,
+ //you aren't in another header item (e.g. styles)
+ //and you see an fX, you're out of the header
+ if (fontTableState == 2 && ! groupState.ignore && equals("f")) {
+ inHeader = false;
+ }
if (currentList != null) {
if (equals("listid")) {
@@ -1096,7 +1104,10 @@ final class TextExtractor {
}
}
- if (!groupState.ignore && (equals("par") || equals("pard") || equals("sect") || equals("sectd") || equals("plain") || equals("ltrch") || equals("rtlch"))) {
+ if (!groupState.ignore && (equals("par") ||
+ equals("pard") || equals("sect") || equals("sectd") || equals("plain") ||
+ equals("ltrch") || equals("rtlch")
+ || equals("htmlrtf") || equals("line"))) {
inHeader = false;
}
} else {
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
index edc20e5..6654245 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
@@ -552,6 +552,27 @@ public class RTFParserTest extends TikaTest {
assertContains("supercali ATB Allison, Timothy B. This is a comment fragilistic",
getXML("testRTF_annotation_spacing.rtf").xml);
}
+
+ @Test
+ public void testTIKA1713() throws Exception {
+ assertContains("For discussion", getXML("testRTFTIKA_1713.rtf").xml);
+ }
+
+ @Test
+ public void testTIKA2150() throws Exception {
+ assertContains("TO\tFROM", getXML("testRTFTIKA_2150.rtf").xml);
+ }
+ @Test
+ public void testTIKA2500() throws Exception {
+ assertContains("Level1", getXML("testRTFTIKA_2500.rtf").xml);
+ }
+
+ @Test
+ public void testTIKA2883() throws Exception {
+ assertContains("This message has been archived.",
+ getXML("testRTFTIKA_2883.rtf").xml);
+ }
+
private Result getResult(String filename) throws Exception {
File file = getResourceAsFile("/test-documents/" + filename);
diff --git a/tika-parsers/src/test/resources/test-documents/testRTFTIKA_1713.rtf b/tika-parsers/src/test/resources/test-documents/testRTFTIKA_1713.rtf
new file mode 100644
index 0000000..d404d46
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testRTFTIKA_1713.rtf
@@ -0,0 +1 @@
+{\rtf1\ansi\ansicpg65001\fromhtml1 \deff0{\fonttbl{\f0\fswiss Arial;}}{\*\htmltag0 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3c.org/TR/1999/REC-html401-19991224/loose.dtd"><HTML><HEAD><STYLE type=text/css>DIV.EVShortcutBanner \{PADDING:10px;FONT-WEIGHT:bold;FONT-SIZE:10pt;FONT-FAMILY:verdana;BACKGROUND-COLOR:lightblue\}DIV.EVAttachBanner \{PADDING:3px;FONT-WEIGHT:bold;FONT-SIZE:10pt;FONT-FAMILY:verdana;BACKGROUND-COLOR:lightblue\}DIV.EVShortcutMsgBody \{ [...]
\ No newline at end of file
diff --git a/tika-parsers/src/test/resources/test-documents/testRTFTIKA_2150.rtf b/tika-parsers/src/test/resources/test-documents/testRTFTIKA_2150.rtf
new file mode 100644
index 0000000..124c828
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testRTFTIKA_2150.rtf
@@ -0,0 +1,6 @@
+{\rtf1\ansi
+{\fonttbl\f0\fnil Monospaced;\f1\fnil Tahoma;\f2\fnil Times New Roman;}
+
+\li0\ri0\fi0\f2\fs36\i0\b0\ul0\cf0 TO\tab FROM\tab TEXT\par
+\tab\tab aa bb cc dd \par
+}
diff --git a/tika-parsers/src/test/resources/test-documents/testRTFTIKA_2500.rtf b/tika-parsers/src/test/resources/test-documents/testRTFTIKA_2500.rtf
new file mode 100644
index 0000000..105c537
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testRTFTIKA_2500.rtf
@@ -0,0 +1,10 @@
+{\rtf1\ansi\ansicpg1252\uc1\deff0 {\rtf1\ansi\deff0 {\fonttbl{\f0 Arial;}}\f0 {\line
+
+{\b Level1}
+: \par}
+
+{\line This is level1 paragraph\line This is level1 paragraph\line This is level1 paragraph\line This is level1 paragraph\line This is level1 paragraph\line This is level1 paragraph\par}
+{\line
+
+{\b Level2}
+: \par} This is level2 paragraph.\line This is level2 paragraph.\line This is level2 paragraph.\par} } \line \par}
\ No newline at end of file
diff --git a/tika-parsers/src/test/resources/test-documents/testRTFTIKA_2883.rtf b/tika-parsers/src/test/resources/test-documents/testRTFTIKA_2883.rtf
new file mode 100644
index 0000000..e20c4e1
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testRTFTIKA_2883.rtf differ