You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/11 13:25:17 UTC

[tika] branch main updated: TIKA-3754 -- allow easier customization of ImageGraphicsEngine

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 87a3468fc TIKA-3754 -- allow easier customization of ImageGraphicsEngine
87a3468fc is described below

commit 87a3468fc33de6c8e94736b3f09287a1ed9cf0aa
Author: tallison <ta...@apache.org>
AuthorDate: Wed May 11 09:25:02 2022 -0400

    TIKA-3754 -- allow easier customization of ImageGraphicsEngine
---
 .../tika/parser/pdf/image/ImageGraphicsEngine.java | 39 +++++++++++-----------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
index a304fe614..8a9de293c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
@@ -77,26 +77,28 @@ public class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
 
     //We're currently copying images to byte[].  We should
     //limit the length to avoid OOM on crafted files.
-    private static final long MAX_IMAGE_LENGTH_BYTES = 100 * 1024 * 1024;
+    protected static final long MAX_IMAGE_LENGTH_BYTES = 100 * 1024 * 1024;
 
-    private static final List<String> JPEG =
+    protected static final List<String> JPEG =
             Arrays.asList(COSName.DCT_DECODE.getName(), COSName.DCT_DECODE_ABBREVIATION.getName());
 
 
-    private static final List<String> JP2 = Collections.singletonList(COSName.JPX_DECODE.getName());
+    protected static final List<String> JP2 =
+            Collections.singletonList(COSName.JPX_DECODE.getName());
 
-    private static final List<String> JB2 = Collections.singletonList(COSName.JBIG2_DECODE.getName());
+    protected static final List<String> JB2 =
+            Collections.singletonList(COSName.JBIG2_DECODE.getName());
     final List<IOException> exceptions = new ArrayList<>();
-    private final EmbeddedDocumentExtractor embeddedDocumentExtractor;
-    private final PDFParserConfig pdfParserConfig;
-    private final Map<COSStream, Integer> processedInlineImages;
-    private final AtomicInteger imageCounter;
-    private final Metadata parentMetadata;
-    private final XHTMLContentHandler xhtml;
-    private final ParseContext parseContext;
-    private final boolean extractInlineImageMetadataOnly;
+    protected final EmbeddedDocumentExtractor embeddedDocumentExtractor;
+    protected final PDFParserConfig pdfParserConfig;
+    protected final Map<COSStream, Integer> processedInlineImages;
+    protected final AtomicInteger imageCounter;
+    protected final Metadata parentMetadata;
+    protected final XHTMLContentHandler xhtml;
+    protected final ParseContext parseContext;
+    protected final boolean extractInlineImageMetadataOnly;
     //TODO: parameterize this ?
-    private boolean useDirectJPEG = false;
+    protected boolean useDirectJPEG = false;
 
     //TODO: this is an embarrassment of an initializer...fix
     protected ImageGraphicsEngine(PDPage page, EmbeddedDocumentExtractor embeddedDocumentExtractor,
@@ -202,7 +204,7 @@ public class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
         return null;
     }
 
-    private static void copyUpToMaxLength(InputStream is, OutputStream os)
+    protected static void copyUpToMaxLength(InputStream is, OutputStream os)
             throws IOException, TikaException {
         BoundedInputStream bis = new BoundedInputStream(MAX_IMAGE_LENGTH_BYTES, is);
         IOUtils.copy(bis, os);
@@ -210,10 +212,9 @@ public class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
             throw new TikaMemoryLimitException(
                     "Image size is larger than allowed (" + MAX_IMAGE_LENGTH_BYTES + ")");
         }
-
     }
 
-    private static boolean hasMasks(PDImage pdImage) throws IOException {
+    protected static boolean hasMasks(PDImage pdImage) throws IOException {
         if (pdImage instanceof PDImageXObject) {
             PDImageXObject ximg = (PDImageXObject) pdImage;
             return ximg.getMask() != null || ximg.getSoftMask() != null;
@@ -429,7 +430,7 @@ public class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
 
     }
 
-    private void extractInlineImageMetadataOnly(PDImage pdImage, Metadata metadata)
+    protected void extractInlineImageMetadataOnly(PDImage pdImage, Metadata metadata)
             throws IOException, SAXException {
         if (pdImage instanceof PDImageXObject) {
             PDMetadataExtractor
@@ -451,7 +452,7 @@ public class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
         }
     }
 
-    private String getSuffix(PDImage pdImage, Metadata metadata) throws IOException {
+    protected String getSuffix(PDImage pdImage, Metadata metadata) throws IOException {
         String suffix = pdImage.getSuffix();
 
         if (suffix == null || suffix.equals("png")) {
@@ -480,7 +481,7 @@ public class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
         return suffix;
     }
 
-    void handleCatchableIOE(IOException e) throws IOException {
+    protected void handleCatchableIOE(IOException e) throws IOException {
         if (pdfParserConfig.isCatchIntermediateIOExceptions()) {
             if (e.getCause() instanceof SAXException && e.getCause().getMessage() != null &&
                     e.getCause().getMessage().contains("Your document contained more than")) {