You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/07/23 15:02:17 UTC

[tika] 02/02: TIKA-2909 -- trivial formatting clean up, and add JinSup Kim to CHANGES.txt

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 72fa7951edb1a0dc87daeb82f9b1db7a1e80a253
Author: TALLISON <ta...@apache.org>
AuthorDate: Tue Jul 23 11:02:00 2019 -0400

    TIKA-2909 -- trivial formatting clean up, and add JinSup Kim to CHANGES.txt
---
 CHANGES.txt                                         |   3 ++-
 .../apache/tika/parser/hwp/HwpTextExtractorV5.java  |   7 +++++--
 .../org/apache/tika/parser/hwp/HwpV5ParserTest.java |  20 +++++++++++++++++---
 ...st-documents-v5-dist.hwp => testHWP-v5-dist.hwp} | Bin
 .../{test-documents-v5.hwp => testHWP-v5b.hwp}      | Bin
 5 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 3072fd9..4127be7 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -10,7 +10,8 @@ Release 1.22 - ???
    * NOTE: Known regression: PDFBOX-4587 -- PDF passwords with codepoints
      between 0xF000 and 0XF0000 will cause an exception.
 
-   * Add parser for HWP v5 files via SooMyung Lee (soomyung) (TIKA-2909).
+   * Add parser for HWP v5 files via SooMyung Lee (soomyung) and
+     JinSup Kim (ddoleye) (TIKA-2909).
 
    * Fix order of closing streams to avoid "Failed to close temporary resource"
      exception (TIKA-2908).
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
index 4eaedf4..8b8c7eb 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
@@ -250,8 +250,9 @@ public class HwpTextExtractorV5 implements Serializable {
                                XHTMLContentHandler xhtml) throws IOException, SAXException {
         // read BodyText
         Entry bodyText = root.getEntry("BodyText");
-        if (bodyText == null || !bodyText.isDirectoryEntry())
+        if (bodyText == null || !bodyText.isDirectoryEntry()) {
             throw new IOException("Invalid BodyText");
+        }
 
         Iterator<Entry> iterator = ((DirectoryEntry) bodyText).getEntries();
         while (iterator.hasNext()) {
@@ -262,8 +263,10 @@ public class HwpTextExtractorV5 implements Serializable {
 
                 InputStream input = new DocumentInputStream(
                         (DocumentEntry) entry);
-                if (header.compressed)
+
+                if (header.compressed) {
                     input = new InflaterInputStream(input, new Inflater(true));
+                }
 
                 HwpStreamReader reader = new HwpStreamReader(input);
 
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
index 1902e49..df59287 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
@@ -31,7 +31,8 @@ public class HwpV5ParserTest extends TikaTest {
     public void testHwpV5Parser() throws Exception {
         for (Parser parser : new Parser[]{new HwpV5Parser(),
                 new AutoDetectParser()}) {
-            XMLResult result = getXML("test-documents-v5.hwp", parser);
+            XMLResult result = getXML("testHWP-v5b.hwp", parser);
+            assertContains("<p>Apache Tika - \uCEE8\uD150\uCE20", result.xml);
             Metadata metadata = result.metadata;
             assertEquals(
                     "application/x-hwp-v5", metadata.get(Metadata.CONTENT_TYPE));
@@ -44,8 +45,9 @@ public class HwpV5ParserTest extends TikaTest {
 
     @Test
     public void testDistributedHwp() throws Exception {
-        XMLResult result = getXML("test-documents-v5-dist.hwp");
-        assertContains("Apache Tika", result.xml);
+        XMLResult result = getXML("testHWP-v5-dist.hwp");
+        String content = result.xml;
+        assertContains("<p>Apache Tika - \uCEE8\uD150\uCE20", content);
 
         assertEquals(
                 "application/x-hwp-v5",
@@ -53,4 +55,16 @@ public class HwpV5ParserTest extends TikaTest {
         assertEquals("Apache Tika", result.metadata.get(TikaCoreProperties.TITLE));
         assertEquals("SooMyung Lee", result.metadata.get(TikaCoreProperties.CREATOR));
     }
+
+    @Test
+    public void testExisting() throws Exception {
+        XMLResult result = getXML("testHWP_5.0.hwp");
+        System.out.println(result.xml);
+        String content = result.xml;
+        Metadata metadata = result.metadata;
+        assertContains("\uD14C\uC2A4\uD2B8", content);
+        assertContains("test", content);
+        assertEquals("next1009", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("\uD14C\uC2A4\uD2B8", metadata.get(TikaCoreProperties.TITLE));
+    }
 }
diff --git a/tika-parsers/src/test/resources/test-documents/test-documents-v5-dist.hwp b/tika-parsers/src/test/resources/test-documents/testHWP-v5-dist.hwp
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/test-documents-v5-dist.hwp
rename to tika-parsers/src/test/resources/test-documents/testHWP-v5-dist.hwp
diff --git a/tika-parsers/src/test/resources/test-documents/test-documents-v5.hwp b/tika-parsers/src/test/resources/test-documents/testHWP-v5b.hwp
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/test-documents-v5.hwp
rename to tika-parsers/src/test/resources/test-documents/testHWP-v5b.hwp