You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/14 01:54:40 UTC

[tika] 01/03: improve robustness of image processing in PDFs

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 8028a006f85cf8d72cf132829a212daf80052bce
Author: tallison <ta...@apache.org>
AuthorDate: Fri Mar 12 16:41:50 2021 -0500

    improve robustness of image processing in PDFs
---
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  | 25 ++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 53b7ee4..3007bfe 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -103,6 +103,7 @@ import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.ExceptionUtils;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
@@ -448,12 +449,24 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         try (TemporaryResources tmp = new TemporaryResources()) {
 
             int dpi = config.getOcrDPI();
-            BufferedImage image = renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType());
-            Path tmpFile = tmp.createTempFile();
-            try (OutputStream os = Files.newOutputStream(tmpFile)) {
-                //TODO: get output format from TesseractConfig
-                ImageIOUtil.writeImage(image, config.getOcrImageFormatName(),
-                        os, dpi, config.getOcrImageQuality());
+            Path tmpFile = null;
+            try {
+                BufferedImage image = renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType());
+                tmpFile = tmp.createTempFile();
+                try (OutputStream os = Files.newOutputStream(tmpFile)) {
+                    //TODO: get output format from TesseractConfig
+                    ImageIOUtil.writeImage(image, config.getOcrImageFormatName(),
+                            os, dpi, config.getOcrImageQuality());
+                }
+            } catch (SecurityException e) {
+                //throw SecurityExceptions immediately
+                throw e;
+            } catch (IOException|RuntimeException e) {
+                //image rendering can throw a variety of runtime exceptions, not just IOExceptions...
+                //need to have a wide catch
+                metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM,
+                        ExceptionUtils.getStackTrace(e));
+                return;
             }
             try (InputStream is = TikaInputStream.get(tmpFile)) {
                 metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, ocrImageMediaType.toString());