You are viewing a plain-text version of this content. The canonical link for it was not preserved in this plain-text copy.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/04/27 13:01:34 UTC

[tika] branch branch_1x updated: TIKA-3741 -- fix regression in handling embedded exceptions in ppt

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 88bff551f TIKA-3741 -- fix regression in handling embedded exceptions in ppt
88bff551f is described below

commit 88bff551fd05a3d7193291dcd3a98af56f38471a
Author: tallison <ta...@apache.org>
AuthorDate: Wed Apr 27 09:01:18 2022 -0400

    TIKA-3741 -- fix regression in handling embedded exceptions in ppt
---
 .../tika/parser/microsoft/HSLFExtractor.java       | 67 ++++++++++++----------
 .../parser/microsoft/PowerPointParserTest.java     |  8 +--
 2 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
index 356c47e6a..50a19938d 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/HSLFExtractor.java
@@ -503,44 +503,49 @@ public class HSLFExtractor extends AbstractPOIFSExtractor {
                     InputStream dataStream = null;
                     try {
                         dataStream = data.getInputStream();
+                    } catch (SecurityException e) {
+                        throw e;
                     } catch (Exception e) {
                         EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                         continue;
                     }
-                    try (TikaInputStream stream = TikaInputStream.get(dataStream)) {
-                        String mediaType = null;
-                        if ("Excel.Chart.8".equals(oleShape.getProgId())) {
-                            mediaType = "application/vnd.ms-excel";
-                        } else {
-                            MediaType mt = getTikaConfig().getDetector().detect(stream, new Metadata());
-                            mediaType = mt.toString();
-                        }
-                        if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")
-                                || mediaType.equals("application/x-tika-msoffice")) {
-                            POIFSFileSystem poifs = null;
-
-                            try {
-                                poifs = new POIFSFileSystem(new CloseShieldInputStream(stream));
-                            } catch (RuntimeException e) {
-                                throw new IOExceptionWithCause(e);
-                            }
-                            try {
-                                handleEmbeddedOfficeDoc(poifs.getRoot(), objID, xhtml);
-                            } finally {
-                                if (poifs != null) {
-                                    poifs.close();
-                                }
-                            }
-                        } else {
-                            handleEmbeddedResource(
-                                    stream, objID, objID,
-                                    mediaType, xhtml, false);
-                        }
-                    } catch (IOException e) {
-                        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
+                    handleData(objID, oleShape.getProgId(), dataStream, xhtml);
+                }
+            }
+        }
+    }
+
+    private void handleData(String objID, String progId, InputStream dataStream,
+                            XHTMLContentHandler xhtml) {
+
+        try (TikaInputStream stream = TikaInputStream.get(dataStream)) {
+            String mediaType = null;
+            if ("Excel.Chart.8".equals(progId)) {
+                mediaType = "application/vnd.ms-excel";
+            } else {
+                MediaType mt = getTikaConfig().getDetector().detect(stream, new Metadata());
+                mediaType = mt.toString();
+            }
+            if (mediaType.equals("application/x-tika-msoffice-embedded; format=comp_obj")
+                    || mediaType.equals("application/x-tika-msoffice")) {
+                POIFSFileSystem poifs = new POIFSFileSystem(new CloseShieldInputStream(stream));
+
+                try {
+                    handleEmbeddedOfficeDoc(poifs.getRoot(), objID, xhtml);
+                } finally {
+                    if (poifs != null) {
+                        poifs.close();
                     }
                 }
+            } else {
+                handleEmbeddedResource(
+                        stream, objID, objID,
+                        mediaType, xhtml, false);
             }
+        } catch (SecurityException e) {
+            throw e;
+        } catch (Exception e) {
+            EmbeddedDocumentUtil.recordException(e, parentMetadata);
         }
     }
 
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
index 658c0d572..ce60154bc 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/PowerPointParserTest.java
@@ -327,15 +327,15 @@ public class PowerPointParserTest extends TikaTest {
         XMLResult r = getXML("testPPT_skipBadCompressedObject.ppt");
         assertContains("NASA Human", r.xml);
         assertEquals(2,
-                r.metadata.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM).length);
+                r.metadata.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length);
         assertContains("incorrect data check",
-                r.metadata.get(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM));
+                r.metadata.get(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING));
 
         List<Metadata> metadataList = getRecursiveMetadata("testPPT_skipBadCompressedObject.ppt");
         assertEquals(2,
-                metadataList.get(0).getValues(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM).length);
+                metadataList.get(0).getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length);
         assertContains("incorrect data check",
-                metadataList.get(0).get(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM));
+                metadataList.get(0).get(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING));
     }
 
     @Test(expected = EncryptedDocumentException.class)