You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/09/08 16:47:48 UTC

[tika] branch master updated: TIKA-2459 -- fix special character handling

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new d1a8bff  TIKA-2459 -- fix special character handling
d1a8bff is described below

commit d1a8bff9faacb828a1039f7cc2c7f9e1f1d5e3fd
Author: tballison <ta...@mitre.org>
AuthorDate: Fri Sep 8 12:47:40 2017 -0400

    TIKA-2459 -- fix special character handling
---
 .../org/apache/tika/parser/microsoft/WordExtractor.java |   2 ++
 .../apache/tika/parser/microsoft/WordParserTest.java    |   6 ++++++
 .../testWORD_specialControlCharacter1415.doc            | Bin 0 -> 25600 bytes
 3 files changed, 8 insertions(+)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
index ff07fef..569c881 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WordExtractor.java
@@ -489,6 +489,8 @@ public class WordExtractor extends AbstractPOIFSExtractor {
                     controls = new ArrayList<CharacterRun>();
                 }
                 break;
+            } else if (cr.text().equals("\u0014\u0015")) {
+                has14 = true;
             } else {
                 if (has14) {
                     texts.add(cr);
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
index b399d09..b70ba72 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WordParserTest.java
@@ -621,5 +621,11 @@ public class WordParserTest extends TikaTest {
         assertContains("\\s\\up 10(\u3068\u3046\u304D\u3087\u3046),\u6771\u4EAC",
                 getXML("testWORD_phonetic.doc").xml);
     }
+
+    @Test
+    public void testSpecialControlCharacter() throws Exception {
+        //TIKA-2459
+        assertContains("Paragraph one", getXML("testWORD_specialControlCharacter1415.doc").xml);
+    }
 }
 
diff --git a/tika-parsers/src/test/resources/test-documents/testWORD_specialControlCharacter1415.doc b/tika-parsers/src/test/resources/test-documents/testWORD_specialControlCharacter1415.doc
new file mode 100644
index 0000000..919126c
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWORD_specialControlCharacter1415.doc differ

-- 
To stop receiving notification emails like this one, please contact
commits@tika.apache.org.