You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/14 01:54:40 UTC
[tika] 01/03: improve robustness of image processing in PDFs
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 8028a006f85cf8d72cf132829a212daf80052bce
Author: tallison <ta...@apache.org>
AuthorDate: Fri Mar 12 16:41:50 2021 -0500
improve robustness of image processing in PDFs
---
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 25 ++++++++++++++++------
1 file changed, 19 insertions(+), 6 deletions(-)
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 53b7ee4..3007bfe 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -103,6 +103,7 @@ import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.ExceptionUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
@@ -448,12 +449,24 @@ class AbstractPDF2XHTML extends PDFTextStripper {
try (TemporaryResources tmp = new TemporaryResources()) {
int dpi = config.getOcrDPI();
- BufferedImage image = renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType());
- Path tmpFile = tmp.createTempFile();
- try (OutputStream os = Files.newOutputStream(tmpFile)) {
- //TODO: get output format from TesseractConfig
- ImageIOUtil.writeImage(image, config.getOcrImageFormatName(),
- os, dpi, config.getOcrImageQuality());
+ Path tmpFile = null;
+ try {
+ BufferedImage image = renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType());
+ tmpFile = tmp.createTempFile();
+ try (OutputStream os = Files.newOutputStream(tmpFile)) {
+ //TODO: get output format from TesseractConfig
+ ImageIOUtil.writeImage(image, config.getOcrImageFormatName(),
+ os, dpi, config.getOcrImageQuality());
+ }
+ } catch (SecurityException e) {
+ //rethrow SecurityExceptions immediately so the wide catch below cannot swallow them
+ throw e;
+ } catch (IOException|RuntimeException e) {
+ //image rendering can throw a variety of runtime exceptions, not just IOExceptions,
+ //so catch broadly, record the stack trace in the metadata, and return without further processing
+ metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM,
+ ExceptionUtils.getStackTrace(e));
+ return;
}
try (InputStream is = TikaInputStream.get(tmpFile)) {
metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, ocrImageMediaType.toString());