You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/10/21 14:34:27 UTC

tika git commit: TIKA-2130

Repository: tika
Updated Branches:
  refs/heads/master bc7216ff7 -> 7ca105ef5


TIKA-2130


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/7ca105ef
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/7ca105ef
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/7ca105ef

Branch: refs/heads/master
Commit: 7ca105ef5716a23300da0b8939fcfa249b798532
Parents: bc7216f
Author: tballison <ta...@mitre.org>
Authored: Fri Oct 21 10:34:10 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Fri Oct 21 10:34:10 2016 -0400

----------------------------------------------------------------------
 .../tika/parser/microsoft/HSLFExtractor.java      |   9 +++++++++
 .../parser/microsoft/PowerPointParserTest.java    |   9 +++++++--
 .../testPPT_skipBadCompressedObject.ppt           | Bin 0 -> 228352 bytes
 3 files changed, 16 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/7ca105ef/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index 63c9e3f..ed3bbeb 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -40,6 +40,7 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.CloseShieldInputStream;
+import org.apache.tika.io.TaggedIOException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -394,6 +395,14 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
                                     stream, objID, objID,
                                     mediaType, xhtml, false);
                         }
+                    } catch (TaggedIOException e) {
+                        if ("incorrect data check".equals(e.getMessage())) {
+                            //TIKA-2130
+                            //some embedded objects can't be decompressed correctly;
+                            //swallow the exception so the bad object is skipped
+                        } else {
+                            throw e;
+                        }
                     }
                 }
             }

http://git-wip-us.apache.org/repos/asf/tika/blob/7ca105ef/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index 41c5077..b61b484 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -266,6 +266,11 @@ public class PowerPointParserTest extends TikaTest {
     }
 
 
-
-
+    @Test
+    public void testSkippingBadCompressedObj() throws Exception {
+        //test file is from govdocs1: 258642.ppt
+        //TIKA-2130
+        XMLResult r = getXML("testPPT_skipBadCompressedObject.ppt");
+        assertContains("NASA Human", r.xml);
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/7ca105ef/tika-parsers/src/test/resources/test-documents/testPPT_skipBadCompressedObject.ppt
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testPPT_skipBadCompressedObject.ppt b/tika-parsers/src/test/resources/test-documents/testPPT_skipBadCompressedObject.ppt
new file mode 100644
index 0000000..cc82e66
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testPPT_skipBadCompressedObject.ppt differ