You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/11 13:25:17 UTC
[tika] branch main updated: TIKA-3754 -- allow easier customization of ImageGraphicsEngine
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 87a3468fc TIKA-3754 -- allow easier customization of ImageGraphicsEngine
87a3468fc is described below
commit 87a3468fc33de6c8e94736b3f09287a1ed9cf0aa
Author: tallison <ta...@apache.org>
AuthorDate: Wed May 11 09:25:02 2022 -0400
TIKA-3754 -- allow easier customization of ImageGraphicsEngine
---
.../tika/parser/pdf/image/ImageGraphicsEngine.java | 39 +++++++++++-----------
1 file changed, 20 insertions(+), 19 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
index a304fe614..8a9de293c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/image/ImageGraphicsEngine.java
@@ -77,26 +77,28 @@ public class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
//We're currently copying images to byte[]. We should
//limit the length to avoid OOM on crafted files.
- private static final long MAX_IMAGE_LENGTH_BYTES = 100 * 1024 * 1024;
+ protected static final long MAX_IMAGE_LENGTH_BYTES = 100 * 1024 * 1024;
- private static final List<String> JPEG =
+ protected static final List<String> JPEG =
Arrays.asList(COSName.DCT_DECODE.getName(), COSName.DCT_DECODE_ABBREVIATION.getName());
- private static final List<String> JP2 = Collections.singletonList(COSName.JPX_DECODE.getName());
+ protected static final List<String> JP2 =
+ Collections.singletonList(COSName.JPX_DECODE.getName());
- private static final List<String> JB2 = Collections.singletonList(COSName.JBIG2_DECODE.getName());
+ protected static final List<String> JB2 =
+ Collections.singletonList(COSName.JBIG2_DECODE.getName());
final List<IOException> exceptions = new ArrayList<>();
- private final EmbeddedDocumentExtractor embeddedDocumentExtractor;
- private final PDFParserConfig pdfParserConfig;
- private final Map<COSStream, Integer> processedInlineImages;
- private final AtomicInteger imageCounter;
- private final Metadata parentMetadata;
- private final XHTMLContentHandler xhtml;
- private final ParseContext parseContext;
- private final boolean extractInlineImageMetadataOnly;
+ protected final EmbeddedDocumentExtractor embeddedDocumentExtractor;
+ protected final PDFParserConfig pdfParserConfig;
+ protected final Map<COSStream, Integer> processedInlineImages;
+ protected final AtomicInteger imageCounter;
+ protected final Metadata parentMetadata;
+ protected final XHTMLContentHandler xhtml;
+ protected final ParseContext parseContext;
+ protected final boolean extractInlineImageMetadataOnly;
//TODO: parameterize this ?
- private boolean useDirectJPEG = false;
+ protected boolean useDirectJPEG = false;
//TODO: this is an embarrassment of an initializer...fix
protected ImageGraphicsEngine(PDPage page, EmbeddedDocumentExtractor embeddedDocumentExtractor,
@@ -202,7 +204,7 @@ public class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
return null;
}
- private static void copyUpToMaxLength(InputStream is, OutputStream os)
+ protected static void copyUpToMaxLength(InputStream is, OutputStream os)
throws IOException, TikaException {
BoundedInputStream bis = new BoundedInputStream(MAX_IMAGE_LENGTH_BYTES, is);
IOUtils.copy(bis, os);
@@ -210,10 +212,9 @@ public class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
throw new TikaMemoryLimitException(
"Image size is larger than allowed (" + MAX_IMAGE_LENGTH_BYTES + ")");
}
-
}
- private static boolean hasMasks(PDImage pdImage) throws IOException {
+ protected static boolean hasMasks(PDImage pdImage) throws IOException {
if (pdImage instanceof PDImageXObject) {
PDImageXObject ximg = (PDImageXObject) pdImage;
return ximg.getMask() != null || ximg.getSoftMask() != null;
@@ -429,7 +430,7 @@ public class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
}
- private void extractInlineImageMetadataOnly(PDImage pdImage, Metadata metadata)
+ protected void extractInlineImageMetadataOnly(PDImage pdImage, Metadata metadata)
throws IOException, SAXException {
if (pdImage instanceof PDImageXObject) {
PDMetadataExtractor
@@ -451,7 +452,7 @@ public class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
}
}
- private String getSuffix(PDImage pdImage, Metadata metadata) throws IOException {
+ protected String getSuffix(PDImage pdImage, Metadata metadata) throws IOException {
String suffix = pdImage.getSuffix();
if (suffix == null || suffix.equals("png")) {
@@ -480,7 +481,7 @@ public class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
return suffix;
}
- void handleCatchableIOE(IOException e) throws IOException {
+ protected void handleCatchableIOE(IOException e) throws IOException {
if (pdfParserConfig.isCatchIntermediateIOExceptions()) {
if (e.getCause() instanceof SAXException && e.getCause().getMessage() != null &&
e.getCause().getMessage().contains("Your document contained more than")) {