You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/08/03 14:58:09 UTC

[tika] branch branch_1x updated: TIKA-2701 -- via Grigoriy Alekseev

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new d66dcbb  TIKA-2701 -- via Grigoriy Alekseev
d66dcbb is described below

commit d66dcbbbcf59f5b4034a47fed3346ad513b1fcc9
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Aug 3 10:57:55 2018 -0400

    TIKA-2701 -- via Grigoriy Alekseev
---
 .../org/apache/tika/parser/microsoft/WMFParser.java |   2 +-
 .../apache/tika/parser/microsoft/WMFParserTest.java |  20 +++++++++++++-------
 .../resources/test-documents/testWMF_charset.wmf    | Bin 0 -> 9316 bytes
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java
index e0a2507..5343751 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java
@@ -64,8 +64,8 @@ public class WMFParser extends AbstractParser {
             HwmfPicture picture = new HwmfPicture(stream);
             //TODO: make x/y info public in POI so that we can use it here
             //to determine when to keep two text parts on the same line
+            Charset charset = LocaleUtil.CHARSET_1252;
             for (HwmfRecord record : picture.getRecords()) {
-                Charset charset = LocaleUtil.CHARSET_1252;
                 //this is pure hackery for specifying the font
                 //TODO: do what Graphics does by maintaining the stack, etc.!
                 //This fix should be done within POI
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java
index 42fb220..fb2d631 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java
@@ -29,14 +29,20 @@ public class WMFParserTest extends TikaTest {
 
     @Test
     public void testTextExtractionWindows() throws Exception {
-        List<Metadata> metadataList = getRecursiveMetadata("testXLSX_Thumbnail.xlsx");
-        Metadata wmfMetadata = metadataList.get(1);
-        assertEquals("image/wmf", wmfMetadata.get(Metadata.CONTENT_TYPE));
-        assertContains("This file contains an embedded thumbnail",
-                wmfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+        testTextExtraction("testXLSX_Thumbnail.xlsx", 1, "This file contains an embedded thumbnail");
+    }
+
+    @Test
+    public void testTextExtractionShiftJISencoding() throws Exception {
+        testTextExtraction("testWMF_charset.wmf", 0, "普林斯");
     }
 
-    //TODO fix wmf text extraction in "testRTFEmbeddedFiles.rtf"
-    //Chinese is garbled.
+    private void testTextExtraction(String fileName, int metaDataItemIndex, String expectedText) throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata(fileName);
+        Metadata wmfMetadata = metadataList.get(metaDataItemIndex);
+
+        assertEquals("image/wmf", wmfMetadata.get(Metadata.CONTENT_TYPE));
+        assertContains(expectedText, wmfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+    }
 }
 
diff --git a/tika-parsers/src/test/resources/test-documents/testWMF_charset.wmf b/tika-parsers/src/test/resources/test-documents/testWMF_charset.wmf
new file mode 100644
index 0000000..b860d18
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWMF_charset.wmf differ