You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/07/27 01:33:20 UTC
tika git commit: TIKA-2040 - prevent permanent hang/oom on corrupt chm file
Repository: tika
Updated Branches:
refs/heads/master f5b04b60c -> 71cb9363c
TIKA-2040 - prevent permanent hang/oom on corrupt chm file
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/71cb9363
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/71cb9363
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/71cb9363
Branch: refs/heads/master
Commit: 71cb9363c07839e68712edde4626d53aa928cc2a
Parents: f5b04b6
Author: tballison <ta...@mitre.org>
Authored: Tue Jul 26 21:33:10 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Tue Jul 26 21:33:10 2016 -0400
----------------------------------------------------------------------
CHANGES.txt | 2 ++
.../chm/accessor/ChmDirectoryListingSet.java | 11 +++++-----
.../apache/tika/parser/chm/core/ChmCommons.java | 5 ++++-
.../tika/parser/chm/core/ChmExtractor.java | 4 ++--
.../apache/tika/parser/chm/lzx/ChmLzxBlock.java | 4 ++--
.../tika/parser/chm/TestChmExtractor.java | 21 ++++++++++++++++---
.../resources/test-documents/testChm_oom.chm | Bin 0 -> 4315 bytes
7 files changed, 34 insertions(+), 13 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/71cb9363/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index 9dd40b5..a994600 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
Release 1.14 - ???
+ * Prevent OOM/permanent hang on some corrupt CHM files (TIKA-2040).
+
* Upgrade ICU4J charset detection components to fix multithreading
bug (TIKA-2041).
http://git-wip-us.apache.org/repos/asf/tika/blob/71cb9363/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java b/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
index 9d0a2f0..e96426f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
@@ -16,16 +16,17 @@
*/
package org.apache.tika.parser.chm.accessor;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.List;
+
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.chm.core.ChmCommons;
import org.apache.tika.parser.chm.core.ChmConstants;
import org.apache.tika.parser.chm.exception.ChmParsingException;
-import static java.nio.charset.StandardCharsets.UTF_8;
-
/**
* Holds chm listing entries
*/
@@ -121,7 +122,7 @@ public class ChmDirectoryListingSet {
* chm itsp PMGLheader
*/
private void enumerateChmDirectoryListingList(ChmItsfHeader chmItsHeader,
- ChmItspHeader chmItspHeader) {
+ ChmItspHeader chmItspHeader) throws TikaException {
try {
int startPmgl = chmItspHeader.getIndex_head();
int stopPmgl = chmItspHeader.getUnknown_0024();
@@ -145,7 +146,7 @@ public class ChmDirectoryListingSet {
i=PMGLheader.getBlockNext();
dir_chunk = null;
}
- } catch (Exception e) {
+ } catch (ChmParsingException e) {
e.printStackTrace();
} finally {
setData(null);
@@ -196,7 +197,7 @@ public class ChmDirectoryListingSet {
*
* @param dir_chunk
*/
- private void enumerateOneSegment(byte[] dir_chunk) throws ChmParsingException {
+ private void enumerateOneSegment(byte[] dir_chunk) throws ChmParsingException, TikaException {
// try {
if (dir_chunk != null) {
int header_len;
http://git-wip-us.apache.org/repos/asf/tika/blob/71cb9363/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java b/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
index cded7f2..a9d2454 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmCommons.java
@@ -332,11 +332,14 @@ public class ChmCommons {
/*
* This method is added because of supporting of Java 5
*/
- public static byte[] copyOfRange(byte[] original, int from, int to) {
+ public static byte[] copyOfRange(byte[] original, int from, int to) throws TikaException {
checkCopyOfRangeParams(original, from, to);
int newLength = to - from;
if (newLength < 0)
throw new IllegalArgumentException(from + " > " + to);
+ if (to > original.length) {
+ throw new TikaException("can't copy beyond array length");
+ }
byte[] copy = new byte[newLength];
System.arraycopy(original, from, copy, 0, Math.min(original.length - from, newLength));
return copy;
http://git-wip-us.apache.org/repos/asf/tika/blob/71cb9363/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
index 454c1c4..c1e4495 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/chm/core/ChmExtractor.java
@@ -16,6 +16,8 @@
*/
package org.apache.tika.parser.chm.core;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
@@ -35,8 +37,6 @@ import org.apache.tika.parser.chm.core.ChmCommons.EntryType;
import org.apache.tika.parser.chm.lzx.ChmBlockInfo;
import org.apache.tika.parser.chm.lzx.ChmLzxBlock;
-import static java.nio.charset.StandardCharsets.UTF_8;
-
/**
* Extracts text from chm file. Enumerates chm entries.
*/
http://git-wip-us.apache.org/repos/asf/tika/blob/71cb9363/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java b/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
index 9ca3595..b5ea37a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
@@ -846,12 +846,12 @@ public class ChmLzxBlock {
return content;
}
- public byte[] getContent(int startOffset, int endOffset) {
+ public byte[] getContent(int startOffset, int endOffset) throws TikaException {
return (getContent() != null) ? ChmCommons.copyOfRange(getContent(),
startOffset, endOffset) : new byte[1];
}
- public byte[] getContent(int start) {
+ public byte[] getContent(int start) throws TikaException {
return (getContent() != null) ? ChmCommons.copyOfRange(getContent(),
start, getContent().length) : new byte[1];
}
http://git-wip-us.apache.org/repos/asf/tika/blob/71cb9363/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java b/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
index c072db0..60d3e31 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/chm/TestChmExtractor.java
@@ -16,18 +16,23 @@
*/
package org.apache.tika.parser.chm;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
import java.io.ByteArrayInputStream;
import java.util.List;
+
+import org.apache.tika.TikaTest;
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.chm.accessor.ChmDirectoryListingSet;
import org.apache.tika.parser.chm.accessor.DirectoryListingEntry;
import org.apache.tika.parser.chm.core.ChmExtractor;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
import org.junit.Before;
import org.junit.Test;
-public class TestChmExtractor {
+public class TestChmExtractor extends TikaTest {
private ChmExtractor chmExtractor = null;
@Before
@@ -60,4 +65,14 @@ public class TestChmExtractor {
assertEquals(TestParameters.VP_CHM_ENTITIES_NUMBER, count);
}
+ @Test
+ public void testOOMOnCorruptCHM() throws Exception {
+ try {
+ XMLResult r = getXML("testChm_oom.chm");
+ fail("should have thrown TikaException");
+ } catch (TikaException e) {
+ assertTrue("correct exception thrown", true);
+ }
+ }
+
}
http://git-wip-us.apache.org/repos/asf/tika/blob/71cb9363/tika-parsers/src/test/resources/test-documents/testChm_oom.chm
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testChm_oom.chm b/tika-parsers/src/test/resources/test-documents/testChm_oom.chm
new file mode 100644
index 0000000..675485b
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testChm_oom.chm differ