You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/10/21 14:34:27 UTC
tika git commit: TIKA-2130
Repository: tika
Updated Branches:
refs/heads/master bc7216ff7 -> 7ca105ef5
TIKA-2130
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7ca105ef
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7ca105ef
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7ca105ef
Branch: refs/heads/master
Commit: 7ca105ef5716a23300da0b8939fcfa249b798532
Parents: bc7216f
Author: tballison <ta...@mitre.org>
Authored: Fri Oct 21 10:34:10 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Oct 21 10:34:10 2016 -0400
----------------------------------------------------------------------
.../tika/parser/microsoft/HSLFExtractor.java | 9 +++++++++
.../parser/microsoft/PowerPointParserTest.java | 9 +++++++--
.../testPPT_skipBadCompressedObject.ppt | Bin 0 -> 228352 bytes
3 files changed, 16 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/7ca105ef/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index 63c9e3f..ed3bbeb 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -40,6 +40,7 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.CloseShieldInputStream;
+import org.apache.tika.io.TaggedIOException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -394,6 +395,14 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
stream, objID, objID,
mediaType, xhtml, false);
}
+ } catch (TaggedIOException e) {
+ if ("incorrect data check".equals(e.getMessage())) {
+ //TIKA-2130
+ //some embedded objects can't be uncompressed correctly
+ //swallow
+ } else {
+ throw e;
+ }
}
}
}
http://git-wip-us.apache.org/repos/asf/tika/blob/7ca105ef/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index 41c5077..b61b484 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -266,6 +266,11 @@ public class PowerPointParserTest extends TikaTest {
}
-
-
+ @Test
+ public void testSkippingBadCompressedObj() throws Exception {
+ //test file is from govdocs1: 258642.ppt
+ //TIKA-2130
+ XMLResult r = getXML("testPPT_skipBadCompressedObject.ppt");
+ assertContains("NASA Human", r.xml);
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/7ca105ef/tika-parsers/src/test/resources/test-documents/testPPT_skipBadCompressedObject.ppt
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_skipBadCompressedObject.ppt b/tika-parsers/src/test/resources/test-documents/testPPT_skipBadCompressedObject.ppt
new file mode 100644
index 0000000..cc82e66
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_skipBadCompressedObject.ppt differ