You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/08/03 14:58:09 UTC
[tika] branch branch_1x updated: TIKA-2701 -- via Grigoriy Alekseev
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new d66dcbb TIKA-2701 -- via Grigoriy Alekseev
d66dcbb is described below
commit d66dcbbbcf59f5b4034a47fed3346ad513b1fcc9
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Aug 3 10:57:55 2018 -0400
TIKA-2701 -- via Grigoriy Alekseev
---
.../org/apache/tika/parser/microsoft/WMFParser.java | 2 +-
.../apache/tika/parser/microsoft/WMFParserTest.java | 20 +++++++++++++-------
.../resources/test-documents/testWMF_charset.wmf | Bin 0 -> 9316 bytes
3 files changed, 14 insertions(+), 8 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java
index e0a2507..5343751 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/WMFParser.java
@@ -64,8 +64,8 @@ public class WMFParser extends AbstractParser {
HwmfPicture picture = new HwmfPicture(stream);
//TODO: make x/y info public in POI so that we can use it here
//to determine when to keep two text parts on the same line
+ Charset charset = LocaleUtil.CHARSET_1252;
for (HwmfRecord record : picture.getRecords()) {
- Charset charset = LocaleUtil.CHARSET_1252;
//this is pure hackery for specifying the font
//TODO: do what Graphics does by maintaining the stack, etc.!
//This fix should be done within POI
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java
index 42fb220..fb2d631 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/WMFParserTest.java
@@ -29,14 +29,20 @@ public class WMFParserTest extends TikaTest {
@Test
public void testTextExtractionWindows() throws Exception {
- List<Metadata> metadataList = getRecursiveMetadata("testXLSX_Thumbnail.xlsx");
- Metadata wmfMetadata = metadataList.get(1);
- assertEquals("image/wmf", wmfMetadata.get(Metadata.CONTENT_TYPE));
- assertContains("This file contains an embedded thumbnail",
- wmfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+ testTextExtraction("testXLSX_Thumbnail.xlsx", 1, "This file contains an embedded thumbnail");
+ }
+
+ @Test
+ public void testTextExtractionShiftJISencoding() throws Exception {
+ testTextExtraction("testWMF_charset.wmf", 0, "普林斯");
}
- //TODO fix wmf text extraction in "testRTFEmbeddedFiles.rtf"
- //Chinese is garbled.
+ private void testTextExtraction(String fileName, int metaDataItemIndex, String expectedText) throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata(fileName);
+ Metadata wmfMetadata = metadataList.get(metaDataItemIndex);
+
+ assertEquals("image/wmf", wmfMetadata.get(Metadata.CONTENT_TYPE));
+ assertContains(expectedText, wmfMetadata.get(RecursiveParserWrapper.TIKA_CONTENT));
+ }
}
diff --git a/tika-parsers/src/test/resources/test-documents/testWMF_charset.wmf b/tika-parsers/src/test/resources/test-documents/testWMF_charset.wmf
new file mode 100644
index 0000000..b860d18
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testWMF_charset.wmf differ