You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/07/23 15:02:17 UTC
[tika] 02/02: TIKA-2909 -- trivial formatting cleanup,
and add JinSup Kim to CHANGES.txt
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 72fa7951edb1a0dc87daeb82f9b1db7a1e80a253
Author: TALLISON <ta...@apache.org>
AuthorDate: Tue Jul 23 11:02:00 2019 -0400
TIKA-2909 -- trivial formatting cleanup, and add JinSup Kim to CHANGES.txt
---
CHANGES.txt | 3 ++-
.../apache/tika/parser/hwp/HwpTextExtractorV5.java | 7 +++++--
.../org/apache/tika/parser/hwp/HwpV5ParserTest.java | 20 +++++++++++++++++---
...st-documents-v5-dist.hwp => testHWP-v5-dist.hwp} | Bin
.../{test-documents-v5.hwp => testHWP-v5b.hwp} | Bin
5 files changed, 24 insertions(+), 6 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 3072fd9..4127be7 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -10,7 +10,8 @@ Release 1.22 - ???
* NOTE: Known regression: PDFBOX-4587 -- PDF passwords with codepoints
between 0xF000 and 0XF0000 will cause an exception.
- * Add parser for HWP v5 files via SooMyung Lee (soomyung) (TIKA-2909).
+ * Add parser for HWP v5 files via SooMyung Lee (soomyung) and
+ JinSup Kim (ddoleye) (TIKA-2909).
* Fix order of closing streams to avoid "Failed to close temporary resource"
exception (TIKA-2908).
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
index 4eaedf4..8b8c7eb 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
@@ -250,8 +250,9 @@ public class HwpTextExtractorV5 implements Serializable {
XHTMLContentHandler xhtml) throws IOException, SAXException {
// read BodyText
Entry bodyText = root.getEntry("BodyText");
- if (bodyText == null || !bodyText.isDirectoryEntry())
+ if (bodyText == null || !bodyText.isDirectoryEntry()) {
throw new IOException("Invalid BodyText");
+ }
Iterator<Entry> iterator = ((DirectoryEntry) bodyText).getEntries();
while (iterator.hasNext()) {
@@ -262,8 +263,10 @@ public class HwpTextExtractorV5 implements Serializable {
InputStream input = new DocumentInputStream(
(DocumentEntry) entry);
- if (header.compressed)
+
+ if (header.compressed) {
input = new InflaterInputStream(input, new Inflater(true));
+ }
HwpStreamReader reader = new HwpStreamReader(input);
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
index 1902e49..df59287 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
@@ -31,7 +31,8 @@ public class HwpV5ParserTest extends TikaTest {
public void testHwpV5Parser() throws Exception {
for (Parser parser : new Parser[]{new HwpV5Parser(),
new AutoDetectParser()}) {
- XMLResult result = getXML("test-documents-v5.hwp", parser);
+ XMLResult result = getXML("testHWP-v5b.hwp", parser);
+ assertContains("<p>Apache Tika - \uCEE8\uD150\uCE20", result.xml);
Metadata metadata = result.metadata;
assertEquals(
"application/x-hwp-v5", metadata.get(Metadata.CONTENT_TYPE));
@@ -44,8 +45,9 @@ public class HwpV5ParserTest extends TikaTest {
@Test
public void testDistributedHwp() throws Exception {
- XMLResult result = getXML("test-documents-v5-dist.hwp");
- assertContains("Apache Tika", result.xml);
+ XMLResult result = getXML("testHWP-v5-dist.hwp");
+ String content = result.xml;
+ assertContains("<p>Apache Tika - \uCEE8\uD150\uCE20", content);
assertEquals(
"application/x-hwp-v5",
@@ -53,4 +55,16 @@ public class HwpV5ParserTest extends TikaTest {
assertEquals("Apache Tika", result.metadata.get(TikaCoreProperties.TITLE));
assertEquals("SooMyung Lee", result.metadata.get(TikaCoreProperties.CREATOR));
}
+
+ @Test
+ public void testExisting() throws Exception {
+ XMLResult result = getXML("testHWP_5.0.hwp");
+ System.out.println(result.xml);
+ String content = result.xml;
+ Metadata metadata = result.metadata;
+ assertContains("\uD14C\uC2A4\uD2B8", content);
+ assertContains("test", content);
+ assertEquals("next1009", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("\uD14C\uC2A4\uD2B8", metadata.get(TikaCoreProperties.TITLE));
+ }
}
diff --git a/tika-parsers/src/test/resources/test-documents/test-documents-v5-dist.hwp b/tika-parsers/src/test/resources/test-documents/testHWP-v5-dist.hwp
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/test-documents-v5-dist.hwp
rename to tika-parsers/src/test/resources/test-documents/testHWP-v5-dist.hwp
diff --git a/tika-parsers/src/test/resources/test-documents/test-documents-v5.hwp b/tika-parsers/src/test/resources/test-documents/testHWP-v5b.hwp
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/test-documents-v5.hwp
rename to tika-parsers/src/test/resources/test-documents/testHWP-v5b.hwp