You are viewing a plain text version of this content; the hyperlink to the canonical version is not preserved here.
Posted to commits@tika.apache.org by ta...@apache.org on 2019/07/23 15:02:15 UTC

[tika] branch master updated (1845c4c -> 72fa795)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 1845c4c  TIKA-2909 -- mea culpa...sorry.  Make HwpTextExtractorV5 serializable and fix the locale problems.
     new cdba490  Add more options to TikaTest
     new 72fa795  TIKA-2909 -- trivial formatting clean up, and add JinSup Kim to CHANGES.txt

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                         |   3 ++-
 .../src/test/java/org/apache/tika/TikaTest.java     |   6 ++++++
 .../apache/tika/parser/hwp/HwpTextExtractorV5.java  |   7 +++++--
 .../org/apache/tika/parser/hwp/HwpV5ParserTest.java |  20 +++++++++++++++++---
 ...st-documents-v5-dist.hwp => testHWP-v5-dist.hwp} | Bin
 .../{test-documents-v5.hwp => testHWP-v5b.hwp}      | Bin
 6 files changed, 30 insertions(+), 6 deletions(-)
 rename tika-parsers/src/test/resources/test-documents/{test-documents-v5-dist.hwp => testHWP-v5-dist.hwp} (100%)
 rename tika-parsers/src/test/resources/test-documents/{test-documents-v5.hwp => testHWP-v5b.hwp} (100%)


[tika] 02/02: TIKA-2909 -- trivial formatting clean up, and add JinSup Kim to CHANGES.txt

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 72fa7951edb1a0dc87daeb82f9b1db7a1e80a253
Author: TALLISON <ta...@apache.org>
AuthorDate: Tue Jul 23 11:02:00 2019 -0400

    TIKA-2909 -- trivial formatting clean up, and add JinSup Kim to CHANGES.txt
---
 CHANGES.txt                                         |   3 ++-
 .../apache/tika/parser/hwp/HwpTextExtractorV5.java  |   7 +++++--
 .../org/apache/tika/parser/hwp/HwpV5ParserTest.java |  20 +++++++++++++++++---
 ...st-documents-v5-dist.hwp => testHWP-v5-dist.hwp} | Bin
 .../{test-documents-v5.hwp => testHWP-v5b.hwp}      | Bin
 5 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 3072fd9..4127be7 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -10,7 +10,8 @@ Release 1.22 - ???
    * NOTE: Known regression: PDFBOX-4587 -- PDF passwords with codepoints
      between 0xF000 and 0XF0000 will cause an exception.
 
-   * Add parser for HWP v5 files via SooMyung Lee (soomyung) (TIKA-2909).
+   * Add parser for HWP v5 files via SooMyung Lee (soomyung) and
+     JinSup Kim (ddoleye) (TIKA-2909).
 
    * Fix order of closing streams to avoid "Failed to close temporary resource"
      exception (TIKA-2908).
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
index 4eaedf4..8b8c7eb 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
@@ -250,8 +250,9 @@ public class HwpTextExtractorV5 implements Serializable {
                                XHTMLContentHandler xhtml) throws IOException, SAXException {
         // read BodyText
         Entry bodyText = root.getEntry("BodyText");
-        if (bodyText == null || !bodyText.isDirectoryEntry())
+        if (bodyText == null || !bodyText.isDirectoryEntry()) {
             throw new IOException("Invalid BodyText");
+        }
 
         Iterator<Entry> iterator = ((DirectoryEntry) bodyText).getEntries();
         while (iterator.hasNext()) {
@@ -262,8 +263,10 @@ public class HwpTextExtractorV5 implements Serializable {
 
                 InputStream input = new DocumentInputStream(
                         (DocumentEntry) entry);
-                if (header.compressed)
+
+                if (header.compressed) {
                     input = new InflaterInputStream(input, new Inflater(true));
+                }
 
                 HwpStreamReader reader = new HwpStreamReader(input);
 
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
index 1902e49..df59287 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/hwp/HwpV5ParserTest.java
@@ -31,7 +31,8 @@ public class HwpV5ParserTest extends TikaTest {
     public void testHwpV5Parser() throws Exception {
         for (Parser parser : new Parser[]{new HwpV5Parser(),
                 new AutoDetectParser()}) {
-            XMLResult result = getXML("test-documents-v5.hwp", parser);
+            XMLResult result = getXML("testHWP-v5b.hwp", parser);
+            assertContains("<p>Apache Tika - \uCEE8\uD150\uCE20", result.xml);
             Metadata metadata = result.metadata;
             assertEquals(
                     "application/x-hwp-v5", metadata.get(Metadata.CONTENT_TYPE));
@@ -44,8 +45,9 @@ public class HwpV5ParserTest extends TikaTest {
 
     @Test
     public void testDistributedHwp() throws Exception {
-        XMLResult result = getXML("test-documents-v5-dist.hwp");
-        assertContains("Apache Tika", result.xml);
+        XMLResult result = getXML("testHWP-v5-dist.hwp");
+        String content = result.xml;
+        assertContains("<p>Apache Tika - \uCEE8\uD150\uCE20", content);
 
         assertEquals(
                 "application/x-hwp-v5",
@@ -53,4 +55,16 @@ public class HwpV5ParserTest extends TikaTest {
         assertEquals("Apache Tika", result.metadata.get(TikaCoreProperties.TITLE));
         assertEquals("SooMyung Lee", result.metadata.get(TikaCoreProperties.CREATOR));
     }
+
+    @Test
+    public void testExisting() throws Exception {
+        XMLResult result = getXML("testHWP_5.0.hwp");
+        System.out.println(result.xml);
+        String content = result.xml;
+        Metadata metadata = result.metadata;
+        assertContains("\uD14C\uC2A4\uD2B8", content);
+        assertContains("test", content);
+        assertEquals("next1009", metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("\uD14C\uC2A4\uD2B8", metadata.get(TikaCoreProperties.TITLE));
+    }
 }
diff --git a/tika-parsers/src/test/resources/test-documents/test-documents-v5-dist.hwp b/tika-parsers/src/test/resources/test-documents/testHWP-v5-dist.hwp
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/test-documents-v5-dist.hwp
rename to tika-parsers/src/test/resources/test-documents/testHWP-v5-dist.hwp
diff --git a/tika-parsers/src/test/resources/test-documents/test-documents-v5.hwp b/tika-parsers/src/test/resources/test-documents/testHWP-v5b.hwp
similarity index 100%
rename from tika-parsers/src/test/resources/test-documents/test-documents-v5.hwp
rename to tika-parsers/src/test/resources/test-documents/testHWP-v5b.hwp


[tika] 01/02: Add more options to TikaTest

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit cdba490681684385e5f994fcf86cea6ba8082880
Author: TALLISON <ta...@apache.org>
AuthorDate: Tue Jul 23 10:56:39 2019 -0400

    Add more options to TikaTest
---
 tika-core/src/test/java/org/apache/tika/TikaTest.java | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 91e6dc7..fe8606b 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -242,6 +242,12 @@ public abstract class TikaTest {
         }
     }
 
+    protected List<Metadata> getRecursiveMetadata(Path path, Parser parser, boolean suppressException) throws Exception {
+        try (TikaInputStream tis = TikaInputStream.get(path)) {
+            return getRecursiveMetadata(tis, parser, new ParseContext(), new Metadata(), suppressException);
+        }
+    }
+
     protected List<Metadata> getRecursiveMetadata(Path p, boolean suppressException) throws Exception {
         try (TikaInputStream tis = TikaInputStream.get(p)) {
             return getRecursiveMetadata(tis, new ParseContext(), new Metadata(), suppressException);