You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/09/08 16:47:48 UTC
[tika] branch master updated: TIKA-2459 -- fix special character handling
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new d1a8bff TIKA-2459 -- fix special character handling
d1a8bff is described below
commit d1a8bff9faacb828a1039f7cc2c7f9e1f1d5e3fd
Author: tballison <ta...@mitre.org>
AuthorDate: Fri Sep 8 12:47:40 2017 -0400
TIKA-2459 -- fix special character handling
---
.../org/apache/tika/parser/microsoft/WordExtractor.java | 2 ++
.../apache/tika/parser/microsoft/WordParserTest.java | 6 ++++++
.../testWORD_specialControlCharacter1415.doc | Bin 0 -> 25600 bytes
3 files changed, 8 insertions(+)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index ff07fef..569c881 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -489,6 +489,8 @@ public class WordExtractor extends AbstractPOIFSExtractor {
controls = new ArrayList<CharacterRun>();
}
break;
+ } else if (cr.text().equals("\u0014\u0015")) {
+ has14 = true;
} else {
if (has14) {
texts.add(cr);
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index b399d09..b70ba72 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -621,5 +621,11 @@ public class WordParserTest extends TikaTest {
assertContains("\\s\\up 10(\u3068\u3046\u304D\u3087\u3046),\u6771\u4EAC",
getXML("testWORD_phonetic.doc").xml);
}
+
+ @Test
+ public void testSpecialControlCharacter() throws Exception {
+ //TIKA-2459
+ assertContains("Paragraph one", getXML("testWORD_specialControlCharacter1415.doc").xml);
+ }
}
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_specialControlCharacter1415.doc b/tika-parsers/src/test/resources/test-documents/testWORD_specialControlCharacter1415.doc
new file mode 100644
index 0000000..919126c
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_specialControlCharacter1415.doc differ
--
To stop receiving notification emails like this one, please contact
commits@tika.apache.org.