You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/07/27 01:33:20 UTC

tika git commit: TIKA-2040 - prevent permanent hang/oom on corrupt chm file

Repository: tika
Updated Branches:
  refs/heads/master f5b04b60c -> 71cb9363c


TIKA-2040 - prevent permanent hang/oom on corrupt chm file


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/71cb9363
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/71cb9363
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/71cb9363

Branch: refs/heads/master
Commit: 71cb9363c07839e68712edde4626d53aa928cc2a
Parents: f5b04b6
Author: tballison <ta...@mitre.org>
Authored: Tue Jul 26 21:33:10 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jul 26 21:33:10 2016 -0400

----------------------------------------------------------------------
 CHANGES.txt                                     |   2 ++
 .../chm/accessor/ChmDirectoryListingSet.java    |  11 +++++-----
 .../apache/tika/parser/chm/core/ChmCommons.java |   5 ++++-
 .../tika/parser/chm/core/ChmExtractor.java      |   4 ++--
 .../apache/tika/parser/chm/lzx/ChmLzxBlock.java |   4 ++--
 .../tika/parser/chm/TestChmExtractor.java       |  21 ++++++++++++++++---
 .../resources/test-documents/testChm_oom.chm    | Bin 0 -> 4315 bytes
 7 files changed, 34 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/71cb9363/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 9dd40b5..a994600 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.14 - ???
 
+  * Prevent OOM/permanent hang on some corrupt CHM files (TIKA-2040).
+
   * Upgrade ICU4J charset detection components to fix multithreading
     bug (TIKA-2041).
 

http://git-wip-us.apache.org/repos/asf/tika/blob/71cb9363/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java b/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
index 9d0a2f0..e96426f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
@@ -16,16 +16,17 @@
  */
 package org.apache.tika.parser.chm.accessor;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 import java.math.BigInteger;
 import java.util.ArrayList;
 import java.util.List;
+
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.parser.chm.core.ChmCommons;
 import org.apache.tika.parser.chm.core.ChmConstants;
 import org.apache.tika.parser.chm.exception.ChmParsingException;
 
-import static java.nio.charset.StandardCharsets.UTF_8;
-
 /**
  * Holds chm listing entries
  */
@@ -121,7 +122,7 @@ public class ChmDirectoryListingSet {
      *            chm itsp PMGLheader
      */
     private void enumerateChmDirectoryListingList(ChmItsfHeader chmItsHeader,
-            ChmItspHeader chmItspHeader) {
+            ChmItspHeader chmItspHeader) throws TikaException {
         try {
             int startPmgl = chmItspHeader.getIndex_head();
             int stopPmgl = chmItspHeader.getUnknown_0024();
@@ -145,7 +146,7 @@ public class ChmDirectoryListingSet {
                 i=PMGLheader.getBlockNext();
                 dir_chunk = null;
             }
-        } catch (Exception e) {
+        } catch (ChmParsingException e) {
             e.printStackTrace();
         } finally {
             setData(null);
@@ -196,7 +197,7 @@ public class ChmDirectoryListingSet {
      * 
      * @param dir_chunk
      */
-    private void enumerateOneSegment(byte[] dir_chunk) throws ChmParsingException {
+    private void enumerateOneSegment(byte[] dir_chunk) throws ChmParsingException, TikaException {
 //        try {
             if (dir_chunk != null) {
                 int header_len;

http://git-wip-us.apache.org/repos/asf/tika/blob/71cb9363/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java b/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
index cded7f2..a9d2454 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
@@ -332,11 +332,14 @@ public class ChmCommons {
     /*
      * This method is added because of supporting of Java 5
      */
-    public static byte[] copyOfRange(byte[] original, int from, int to) {
+    public static byte[] copyOfRange(byte[] original, int from, int to) throws TikaException {
         checkCopyOfRangeParams(original, from, to);
         int newLength = to - from;
         if (newLength < 0)
             throw new IllegalArgumentException(from + " > " + to);
+        if (to > original.length) {
+            throw new TikaException("can't copy beyond array length");
+        }
         byte[] copy = new byte[newLength];
         System.arraycopy(original, from, copy, 0, Math.min(original.length - from, newLength));
         return copy;

http://git-wip-us.apache.org/repos/asf/tika/blob/71cb9363/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
index 454c1c4..c1e4495 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
@@ -16,6 +16,8 @@
  */
 package org.apache.tika.parser.chm.core;
 
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
@@ -35,8 +37,6 @@ import org.apache.tika.parser.chm.core.ChmCommons.EntryType;
 import org.apache.tika.parser.chm.lzx.ChmBlockInfo;
 import org.apache.tika.parser.chm.lzx.ChmLzxBlock;
 
-import static java.nio.charset.StandardCharsets.UTF_8;
-
 /**
  * Extracts text from chm file. Enumerates chm entries.
  */

http://git-wip-us.apache.org/repos/asf/tika/blob/71cb9363/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java b/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
index 9ca3595..b5ea37a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
@@ -846,12 +846,12 @@ public class ChmLzxBlock {
         return content;
     }
 
-    public byte[] getContent(int startOffset, int endOffset) {
+    public byte[] getContent(int startOffset, int endOffset) throws TikaException {
         return (getContent() != null) ? ChmCommons.copyOfRange(getContent(),
                 startOffset, endOffset) : new byte[1];
     }
 
-    public byte[] getContent(int start) {
+    public byte[] getContent(int start) throws TikaException {
         return (getContent() != null) ? ChmCommons.copyOfRange(getContent(),
                 start, getContent().length) : new byte[1];
     }

http://git-wip-us.apache.org/repos/asf/tika/blob/71cb9363/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java b/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
index c072db0..60d3e31 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
@@ -16,18 +16,23 @@
  */
 package org.apache.tika.parser.chm;
 
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
 import java.io.ByteArrayInputStream;
 import java.util.List;
+
+import org.apache.tika.TikaTest;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
 import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
 import org.apache.tika.parser.chm.core.ChmExtractor;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
 import org.junit.Before;
 import org.junit.Test;
 
-public class TestChmExtractor {
+public class TestChmExtractor extends TikaTest {
     private ChmExtractor chmExtractor = null;
 
     @Before
@@ -60,4 +65,14 @@ public class TestChmExtractor {
         assertEquals(TestParameters.VP_CHM_ENTITIES_NUMBER, count);
     }
 
+    @Test
+    public void testOOMOnCorruptCHM() throws Exception {
+        try {
+            XMLResult r = getXML("testChm_oom.chm");
+            fail("should have thrown TikaException");
+        } catch (TikaException e) {
+            assertTrue("correct exception thrown", true);
+        }
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/71cb9363/tika-parsers/src/test/resources/test-documents/testChm_oom.chm
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testChm_oom.chm b/tika-parsers/src/test/resources/test-documents/testChm_oom.chm
new file mode 100644
index 0000000..675485b
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testChm_oom.chm differ