You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/02/12 17:13:11 UTC
[tika] branch branch_1x updated: TIKA-3041 -- extract inline images
that rely on the DCT filter.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new da79e31 TIKA-3041 -- extract inline images that rely on the DCT filter.
da79e31 is described below
commit da79e3105223fe897c36d6f5d30000fc602ae3a1
Author: tallison <ta...@apache.org>
AuthorDate: Wed Feb 12 11:50:34 2020 -0500
TIKA-3041 -- extract inline images that rely on the DCT filter.
# Conflicts:
# tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
# tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
---
CHANGES.txt | 1 +
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 2 +-
.../tika/parser/pdf/ImageGraphicsEngine.java | 423 +++++++++++++++++++++
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 198 ++--------
.../org/apache/tika/parser/pdf/PDFParserTest.java | 33 +-
5 files changed, 479 insertions(+), 178 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index a002db3..800bf54 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
Release 1.24 - ???
+ * Extract inline images that rely on the DCT filter from PDFs (TIKA-3041).
* Upgrade to PDFBox 2.0.18 (TIKA-3021).
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index ea3b173..8f55086 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -310,7 +310,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
void handleCatchableIOE(IOException e) throws IOException {
- if (config.isCatchIntermediateIOExceptions()) {
+ if (config.getCatchIntermediateIOExceptions()) {
if (e.getCause() instanceof SAXException && e.getCause().getMessage() != null &&
e.getCause().getMessage().contains("Your document contained more than")) {
//TODO -- is there a cleaner way of checking for:
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
new file mode 100644
index 0000000..2625ddb
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
@@ -0,0 +1,423 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSStream;
+import org.apache.pdfbox.filter.MissingImageReaderException;
+import org.apache.pdfbox.io.IOUtils;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDResources;
+import org.apache.pdfbox.pdmodel.font.PDFont;
+import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
+import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
+import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
+import org.apache.pdfbox.pdmodel.graphics.color.PDPattern;
+import org.apache.pdfbox.pdmodel.graphics.form.PDTransparencyGroup;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
+import org.apache.pdfbox.pdmodel.graphics.pattern.PDAbstractPattern;
+import org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern;
+import org.apache.pdfbox.pdmodel.graphics.state.PDSoftMask;
+import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
+import org.apache.pdfbox.tools.imageio.ImageIOUtil;
+import org.apache.pdfbox.util.Matrix;
+import org.apache.pdfbox.util.Vector;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.IOExceptionWithCause;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import java.awt.geom.Point2D;
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * Copied nearly verbatim from PDFBox
+ */
+class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
+
+ // Filter names for DCT (JPEG) data, long and abbreviated form; used by writeToBuffer when copying the raw image stream.
+ private static final List<String> JPEG = Arrays.asList(
+ COSName.DCT_DECODE.getName(),
+ COSName.DCT_DECODE_ABBREVIATION.getName());
+
+ // Filter name for JPX (JPEG 2000) data; used by writeToBuffer when copying the raw image stream.
+ private static final List<String> JP2 =
+ Arrays.asList(COSName.JPX_DECODE.getName());
+ // Filter name for JBIG2 data; used by writeToBuffer when copying the raw image stream.
+ private static final List<String> JB2 = Arrays.asList(
+ COSName.JBIG2_DECODE.getName());
+
+ //TODO: parameterize this ?
+ private boolean useDirectJPEG = false;
+ // IOExceptions absorbed by handleCatchableIOE; exposed through getExceptions().
+ final List<IOException> exceptions = new ArrayList<>();
+ private final EmbeddedDocumentExtractor embeddedDocumentExtractor;
+ private final PDFParserConfig pdfParserConfig;
+ private final Map<COSStream, Integer> processedInlineImages;
+ private final AtomicInteger imageCounter;
+ private final Metadata parentMetadata;
+ private final XHTMLContentHandler xhtml;
+ private final ParseContext parseContext;
+
+ //TODO: this is an embarrassment of an initializer...fix
+ /**
+  * Creates an engine for the images drawn on a single page. The arguments
+  * are only stored here; no processing happens until {@code run()} is called.
+  *
+  * @param page page handed to the {@link PDFGraphicsStreamEngine} superclass
+  * @param embeddedDocumentExtractor decides whether an image is parsed, and parses it
+  * @param pdfParserConfig consulted for the unique-image and exception-handling settings
+  * @param processedInlineImages map from image stream to the number assigned to it
+  * @param imageCounter source of new image numbers
+  * @param xhtml handler that receives the {@code img} elements
+  * @param parentMetadata metadata of the containing document; receives warnings
+  * @param parseContext passed on when extracting metadata attached to an image
+  */
+ protected ImageGraphicsEngine(PDPage page, EmbeddedDocumentExtractor embeddedDocumentExtractor,
+         PDFParserConfig pdfParserConfig, Map<COSStream, Integer> processedInlineImages,
+         AtomicInteger imageCounter, XHTMLContentHandler xhtml, Metadata parentMetadata,
+         ParseContext parseContext) {
+     super(page);
+
+     // output sinks
+     this.xhtml = xhtml;
+     this.parentMetadata = parentMetadata;
+
+     // image bookkeeping
+     this.imageCounter = imageCounter;
+     this.processedInlineImages = processedInlineImages;
+
+     // collaborators and configuration
+     this.parseContext = parseContext;
+     this.pdfParserConfig = pdfParserConfig;
+     this.embeddedDocumentExtractor = embeddedDocumentExtractor;
+ }
+
+ /**
+  * Processes the page's content stream and then the transparency group of
+  * every soft mask found in the page's extended graphics states, so that
+  * images drawn in either place reach {@link #drawImage(PDImage)}.
+  *
+  * @throws IOException if page processing fails, or if a soft-mask failure is
+  *         rethrown by {@link #handleCatchableIOE(IOException)}
+  */
+ void run() throws IOException {
+     PDPage page = getPage();
+
+     //TODO: is there a better way to do this rather than reprocessing the page
+     //can we process the text and images in one go?
+     processPage(page);
+     PDResources res = page.getResources();
+     if (res == null) {
+         return;
+     }
+
+     for (COSName name : res.getExtGStateNames()) {
+         // look the state up once and guard against a name that has no usable value;
+         // dereferencing the lookup result directly would throw a NullPointerException
+         org.apache.pdfbox.pdmodel.graphics.state.PDExtendedGraphicsState extGState =
+                 res.getExtGState(name);
+         if (extGState == null) {
+             continue;
+         }
+
+         PDSoftMask softMask = extGState.getSoftMask();
+         if (softMask == null) {
+             continue;
+         }
+
+         try {
+             PDTransparencyGroup group = softMask.getGroup();
+
+             if (group != null) {
+                 // PDFBOX-4327: without this line NPEs will occur
+                 extGState.copyIntoGraphicsState(getGraphicsState());
+
+                 processSoftMask(group);
+             }
+         } catch (IOException e) {
+             handleCatchableIOE(e);
+         }
+     }
+ }
+
+ /**
+  * Assigns a number to the image and hands it to {@code processImage}.
+  * <p>
+  * For a {@link PDImageXObject} the number is looked up by its COS stream: a
+  * first occurrence gets a new number from the counter; a repeat is skipped
+  * when only unique images are wanted, and otherwise keeps the number it was
+  * given the first time. Any other image always gets a new number.
+  *
+  * @param pdImage the image being drawn
+  * @throws IOException if a SAX failure occurs (wrapped), or if an I/O failure
+  *         is rethrown by {@link #handleCatchableIOE(IOException)}
+  */
+ @Override
+ public void drawImage(PDImage pdImage) throws IOException {
+     int imageNumber;
+     if (pdImage instanceof PDImageXObject) {
+         if (pdImage.isStencil()) {
+             processColor(getGraphicsState().getNonStrokingColor());
+         }
+
+         PDImageXObject xobject = (PDImageXObject) pdImage;
+         COSStream key = xobject.getCOSObject();
+         Integer cachedNumber = processedInlineImages.get(key);
+         if (cachedNumber == null) {
+             imageNumber = imageCounter.getAndIncrement();
+             processedInlineImages.put(key, imageNumber);
+         } else if (pdfParserConfig.getExtractUniqueInlineImagesOnly()) {
+             // skip duplicate image
+             return;
+         } else {
+             // repeated image: reuse its number instead of falling back to 0,
+             // which would name every repeat "image0.*"
+             imageNumber = cachedNumber;
+         }
+     } else {
+         imageNumber = imageCounter.getAndIncrement();
+     }
+     //TODO: should we use the hash of the PDImage to check for seen
+     //For now, we're relying on the cosobject, but this could lead to
+     //duplicates if the pdImage is not a PDImageXObject?
+     try {
+         processImage(pdImage, imageNumber);
+     } catch (SAXException e) {
+         throw new IOExceptionWithCause(e);
+     } catch (IOException e) {
+         handleCatchableIOE(e);
+     }
+ }
+
+ @Override
+ public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3)
+ throws IOException {
+ // no-op: this engine does nothing with rectangles
+ }
+
+ @Override
+ public void clip(int windingRule) throws IOException {
+ // no-op: this engine does nothing with clipping
+ }
+
+ @Override
+ public void moveTo(float x, float y) throws IOException {
+ // no-op: the current path position is not tracked
+ }
+
+ @Override
+ public void lineTo(float x, float y) throws IOException {
+ // no-op: path segments are not recorded
+ }
+
+ @Override
+ public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3)
+ throws IOException {
+ // no-op: path segments are not recorded
+ }
+
+ @Override
+ public Point2D getCurrentPoint() throws IOException {
+ return new Point2D.Float(0, 0); // always (0, 0): moveTo/lineTo/curveTo record nothing in this class
+ }
+
+ @Override
+ public void closePath() throws IOException {
+ // no-op: no path is being built
+ }
+
+ @Override
+ public void endPath() throws IOException {
+ // no-op: no path is being built
+ }
+
+ /**
+  * Inspects the colors a glyph is painted with. Depending on the text
+  * rendering mode, the non-stroking (fill) and/or stroking color is passed
+  * to {@code processColor}; the glyph itself is not used.
+  */
+ @Override
+ protected void showGlyph(Matrix textRenderingMatrix,
+         PDFont font,
+         int code,
+         String unicode,
+         Vector displacement) throws IOException {
+
+     RenderingMode mode = getGraphicsState().getTextState().getRenderingMode();
+
+     if (mode.isFill()) {
+         PDColor fillColor = getGraphicsState().getNonStrokingColor();
+         processColor(fillColor);
+     }
+
+     if (mode.isStroke()) {
+         PDColor strokeColor = getGraphicsState().getStrokingColor();
+         processColor(strokeColor);
+     }
+ }
+
+ @Override
+ public void strokePath() throws IOException {
+ processColor(getGraphicsState().getStrokingColor());
+ }
+
+ @Override
+ public void fillPath(int windingRule) throws IOException {
+ processColor(getGraphicsState().getNonStrokingColor());
+ }
+
+ /**
+  * Passes both the non-stroking (fill) and the stroking color to
+  * {@code processColor}, since this operation paints with both; the path
+  * itself and the winding rule are not used.
+  */
+ @Override
+ public void fillAndStrokePath(int windingRule) throws IOException {
+     processColor(getGraphicsState().getNonStrokingColor());
+     // the path is stroked as well, so the stroking color may also be a tiling pattern
+     processColor(getGraphicsState().getStrokingColor());
+ }
+
+ @Override
+ public void shadingFill(COSName shadingName) throws IOException {
+ // no-op: this engine does nothing with shadings
+ }
+
+ /**
+  * If the color belongs to a pattern color space and resolves to a tiling
+  * pattern, processes that pattern; any other color is ignored.
+  *
+  * @param color color to inspect
+  * @throws IOException if resolving or processing the pattern fails
+  */
+ private void processColor(PDColor color) throws IOException {
+     if (!(color.getColorSpace() instanceof PDPattern)) {
+         return;
+     }
+
+     PDPattern patternSpace = (PDPattern) color.getColorSpace();
+     PDAbstractPattern resolved = patternSpace.getPattern(color);
+     if (resolved instanceof PDTilingPattern) {
+         PDTilingPattern tiling = (PDTilingPattern) resolved;
+         processTilingPattern(tiling, null, null);
+     }
+ }
+
+ /**
+  * Writes an {@code img} element for the image and, if the embedded
+  * document extractor accepts it, serializes the image into memory and
+  * hands the bytes to that extractor.
+  * <p>
+  * A failure while serializing the image is recorded on the parent
+  * document's metadata and the image is skipped.
+  *
+  * @param image image to process
+  * @param imageNumber number used to build the {@code image<N>.<suffix>} resource name
+  * @throws IOException if parsing the embedded image fails with an I/O error
+  * @throws SAXException if writing the element or parsing the embedded image fails
+  */
+ private void processImage(PDImage image, int imageNumber) throws IOException, SAXException {
+     //this is the metadata for this particular image
+     Metadata metadata = new Metadata();
+     String suffix = getSuffix(image, metadata);
+     String fileName = "image" + imageNumber + "." + suffix;
+
+     AttributesImpl attr = new AttributesImpl();
+     attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName);
+     attr.addAttribute("", "alt", "alt", "CDATA", fileName);
+     xhtml.startElement("img", attr);
+     xhtml.endElement("img");
+
+     metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
+     metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+             TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+
+     if (!embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
+         return;
+     }
+
+     if (image instanceof PDImageXObject) {
+         //extract the metadata contained outside of the image
+         PDMetadataExtractor.extract(((PDImageXObject) image).getMetadata(),
+                 metadata, parseContext);
+     }
+
+     ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+     try {
+         writeToBuffer(image, suffix, useDirectJPEG, buffer);
+     } catch (MissingImageReaderException e) {
+         EmbeddedDocumentUtil.recordException(e, parentMetadata);
+         return;
+     } catch (IOException e) {
+         // record on the parent: this image's own metadata object is
+         // discarded by the return below, which would lose the exception
+         EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
+         return;
+     }
+
+     try (InputStream embeddedIs = TikaInputStream.get(buffer.toByteArray())) {
+         embeddedDocumentExtractor.parseEmbedded(
+                 embeddedIs,
+                 new EmbeddedContentHandler(xhtml),
+                 metadata, false);
+     }
+ }
+
+ /**
+  * Determines the file suffix for an image and sets the matching content
+  * type on the given metadata. A missing suffix is treated as {@code png},
+  * and {@code tiff} is shortened to {@code tif}; a suffix that is not
+  * recognized is returned unchanged with no content type set.
+  *
+  * @param image image whose suffix is queried
+  * @param metadata metadata that receives the content type
+  * @return the suffix to use in the image's file name
+  */
+ private String getSuffix(PDImage image, Metadata metadata) {
+     String suffix = image.getSuffix();
+     if (suffix == null) {
+         suffix = "png";
+     }
+
+     switch (suffix) {
+         case "png":
+             metadata.set(Metadata.CONTENT_TYPE, "image/png");
+             break;
+         case "jpg":
+             metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+             break;
+         case "tiff":
+             metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
+             suffix = "tif";
+             break;
+         case "jpx":
+             metadata.set(Metadata.CONTENT_TYPE, "image/jp2");
+             break;
+         case "jb2":
+             metadata.set(Metadata.CONTENT_TYPE, "image/x-jbig2");
+             break;
+         default:
+             //TODO: determine if we need to add more image types
+             break;
+     }
+     return suffix;
+ }
+
+ /**
+  * Either absorbs or rethrows an IOException, depending on configuration.
+  * <p>
+  * The exception is rethrown when intermediate exceptions are not being
+  * caught, or when its cause is a SAXException whose message contains
+  * "Your document contained more than". Otherwise its message is added to
+  * the parent metadata as a warning and the exception is kept in
+  * {@code exceptions}.
+  *
+  * @param e the exception to handle
+  * @throws IOException {@code e} itself, in the rethrow cases above
+  */
+ void handleCatchableIOE(IOException e) throws IOException {
+     if (!pdfParserConfig.getCatchIntermediateIOExceptions()) {
+         throw e;
+     }
+
+     //TODO -- is there a cleaner way of checking for:
+     // WriteOutContentHandler.WriteLimitReachedException?
+     Throwable cause = e.getCause();
+     boolean writeLimitReached = cause instanceof SAXException
+             && cause.getMessage() != null
+             && cause.getMessage().contains("Your document contained more than");
+     if (writeLimitReached) {
+         throw e;
+     }
+
+     String msg = e.getMessage() == null ? "IOException, no message" : e.getMessage();
+     parentMetadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, msg);
+     exceptions.add(e);
+ }
+
+ List<IOException> getExceptions() {
+ return exceptions; // the internal list itself (not a copy) of exceptions absorbed by handleCatchableIOE
+ }
+
+ /**
+  * Serializes an image to the given stream; nearly directly copied from
+  * PDFBox ExtractImages.
+  * <p>
+  * JPEG and JPEG 2000 images are copied as their raw encoded stream when
+  * {@code canCopyRawStream} allows it and are re-encoded otherwise. JBIG2
+  * images are always copied raw. Gray TIFF images are converted to a bitonal
+  * image before writing. Everything else is written through ImageIOUtil.
+  * Nothing is written if the image cannot be decoded to a BufferedImage.
+  * <p>
+  * The suffix is the one produced by {@code getSuffix}, which returns
+  * {@code jpx} for JPEG 2000 and {@code tif} for TIFF; {@code jp2} and
+  * {@code tiff} are accepted as well.
+  *
+  * @param pdImage image to serialize
+  * @param suffix file suffix chosen for the image
+  * @param directJPEG if true, always copy the raw JPEG / JPEG 2000 stream
+  * @param out destination; flushed but not closed
+  * @throws IOException if reading the image or writing to {@code out} fails
+  */
+ private static void writeToBuffer(PDImage pdImage, String suffix, boolean directJPEG, OutputStream out)
+         throws IOException {
+
+     BufferedImage image = pdImage.getImage();
+     if (image != null) {
+         if ("jpg".equals(suffix)) {
+             if (canCopyRawStream(pdImage, directJPEG)) {
+                 // RGB or Gray colorspace: get and write the unmodified JPEG stream
+                 copyRawStream(pdImage, JPEG, out);
+             } else {
+                 // for CMYK and other "unusual" colorspaces, the JPEG will be converted
+                 ImageIOUtil.writeImage(image, suffix, out);
+             }
+         } else if ("jpx".equals(suffix) || "jp2".equals(suffix)) {
+             if (canCopyRawStream(pdImage, directJPEG)) {
+                 // RGB or Gray colorspace: get and write the unmodified JPEG2000 stream
+                 copyRawStream(pdImage, JP2, out);
+             } else {
+                 // for CMYK and other "unusual" colorspaces, the image will be converted
+                 ImageIOUtil.writeImage(image, "jpeg2000", out);
+             }
+         } else if (("tif".equals(suffix) || "tiff".equals(suffix))
+                 && pdImage.getColorSpace().equals(PDDeviceGray.INSTANCE)) {
+             // CCITT compressed images can have a different colorspace, but this one is B/W
+             // This is a bitonal image, so copy to TYPE_BYTE_BINARY
+             // so that a G4 compressed TIFF image is created by ImageIOUtil.writeImage()
+             int w = image.getWidth();
+             int h = image.getHeight();
+             BufferedImage bitonalImage = new BufferedImage(w, h, BufferedImage.TYPE_BYTE_BINARY);
+             // copy image the old fashioned way - ColorConvertOp is slower!
+             for (int y = 0; y < h; y++) {
+                 for (int x = 0; x < w; x++) {
+                     bitonalImage.setRGB(x, y, image.getRGB(x, y));
+                 }
+             }
+             ImageIOUtil.writeImage(bitonalImage, suffix, out);
+         } else if ("jb2".equals(suffix)) {
+             copyRawStream(pdImage, JB2, out);
+         } else {
+             ImageIOUtil.writeImage(image, suffix, out);
+         }
+     }
+     out.flush();
+ }
+
+ /**
+  * Tells whether the encoded stream may be copied without re-encoding:
+  * either this was requested explicitly, or the image has no masks and its
+  * color space is device gray or device RGB.
+  */
+ private static boolean canCopyRawStream(PDImage pdImage, boolean directJPEG) throws IOException {
+     if (directJPEG) {
+         return true;
+     }
+     if (hasMasks(pdImage)) {
+         return false;
+     }
+     String colorSpaceName = pdImage.getColorSpace().getName();
+     return PDDeviceGray.INSTANCE.getName().equals(colorSpaceName)
+             || PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName);
+ }
+
+ /**
+  * Copies the image's stream, opened with the given filter names, to the
+  * output; the input is closed even when the copy fails.
+  */
+ private static void copyRawStream(PDImage pdImage, List<String> filterNames, OutputStream out)
+         throws IOException {
+     InputStream data = pdImage.createInputStream(filterNames);
+     try {
+         IOUtils.copy(data, out);
+     } finally {
+         IOUtils.closeQuietly(data);
+     }
+ }
+
+ /**
+  * Tells whether the image is a {@link PDImageXObject} that has a mask or a
+  * soft mask; any other kind of image is reported as having none.
+  *
+  * @param pdImage image to inspect
+  * @return true if a mask or soft mask is present
+  * @throws IOException if a mask cannot be read
+  */
+ private static boolean hasMasks(PDImage pdImage) throws IOException {
+     if (!(pdImage instanceof PDImageXObject)) {
+         return false;
+     }
+     PDImageXObject xImage = (PDImageXObject) pdImage;
+     if (xImage.getMask() != null) {
+         return true;
+     }
+     return xImage.getSoftMask() != null;
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 4551e7b..4ed0d90 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -16,47 +16,29 @@
*/
package org.apache.tika.parser.pdf;
+import java.io.IOException;
+import java.io.Writer;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
+
import org.apache.pdfbox.cos.COSArray;
-import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSStream;
-import org.apache.pdfbox.filter.MissingImageReaderException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
-import org.apache.pdfbox.pdmodel.PDResources;
-import org.apache.pdfbox.pdmodel.graphics.PDXObject;
-import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
-import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
-import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
-import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
-import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.apache.pdfbox.util.Matrix;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentUtil;
-import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.EmbeddedContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-import java.awt.image.BufferedImage;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.Writer;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
/**
* Utility class that overrides the {@link PDFTextStripper} functionality
@@ -66,16 +48,6 @@ import java.util.Set;
class PDF2XHTML extends AbstractPDF2XHTML {
- private static final List<String> JPEG = Arrays.asList(
- COSName.DCT_DECODE.getName(),
- COSName.DCT_DECODE_ABBREVIATION.getName());
-
- private static final List<String> JP2 =
- Arrays.asList(COSName.JPX_DECODE.getName());
-
- private static final List<String> JB2 = Arrays.asList(
- COSName.JBIG2_DECODE.getName());
-
/**
* This keeps track of the pdf object ids for inline
* images that have been processed.
@@ -88,7 +60,7 @@ class PDF2XHTML extends AbstractPDF2XHTML {
* TIKA-1742, we're limiting the export to one image per page.
*/
private Map<COSStream, Integer> processedInlineImages = new HashMap<>();
- private int inlineImageCounter = 0;
+ private AtomicInteger inlineImageCounter = new AtomicInteger(0);
private PDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
PDFParserConfig config)
throws IOException {
@@ -162,7 +134,7 @@ class PDF2XHTML extends AbstractPDF2XHTML {
try {
writeParagraphEnd();
try {
- extractImages(page.getResources(), new HashSet<COSBase>());
+ extractImages(page);
} catch (IOException e) {
handleCatchableIOE(e);
}
@@ -174,148 +146,22 @@ class PDF2XHTML extends AbstractPDF2XHTML {
}
}
- private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException, IOException {
- if (resources == null || config.getExtractInlineImages() == false) {
- return;
- }
-
- for (COSName name : resources.getXObjectNames()) {
-
- PDXObject object = null;
- try {
- object = resources.getXObject(name);
- } catch (MissingImageReaderException e) {
- EmbeddedDocumentUtil.recordException(e, metadata);
- continue;
- } catch (IOException e) {
- EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
- continue;
- }
- processImageObject(object, seenThisPage);
- }
- }
-
- private void processImageObject(PDXObject object, Set<COSBase> seenThisPage) throws SAXException, IOException {
- if (object == null) {
+ private void extractImages(PDPage page) throws SAXException, IOException {
+ if (config.getExtractInlineImages() == false) {
return;
}
- COSStream cosStream = object.getCOSObject();
- if (seenThisPage.contains(cosStream)) {
- //avoid infinite recursion TIKA-1742
- return;
- }
- seenThisPage.add(cosStream);
-
- if (object instanceof PDFormXObject) {
- extractImages(((PDFormXObject) object).getResources(), seenThisPage);
- } else if (object instanceof PDImageXObject) {
-
- PDImageXObject image = (PDImageXObject) object;
-
- Metadata embeddedMetadata = new Metadata();
- String extension = image.getSuffix();
-
- if (extension == null || extension.equals("png")) {
- embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/png");
- extension = "png";
- } else if (extension.equals("jpg")) {
- embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
- } else if (extension.equals("tiff")) {
- embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/tiff");
- extension = "tif";
- } else if (extension.equals("jpx")) {
- embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jp2");
- } else if (extension.equals("jb2")) {
- embeddedMetadata.set(
- Metadata.CONTENT_TYPE, "image/x-jbig2");
- } else {
- //TODO: determine if we need to add more image types
-// throw new RuntimeException("EXTEN:" + extension);
- }
- Integer imageNumber = processedInlineImages.get(cosStream);
- if (imageNumber == null) {
- imageNumber = inlineImageCounter++;
- }
- String fileName = "image" + imageNumber + "."+extension;
- embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
-
- // Output the img tag
- AttributesImpl attr = new AttributesImpl();
- attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName);
- attr.addAttribute("", "alt", "alt", "CDATA", fileName);
- xhtml.startElement("img", attr);
- xhtml.endElement("img");
-
- //Do we only want to process unique COSObject ids?
- //If so, have we already processed this one?
- if (config.getExtractUniqueInlineImagesOnly() == true) {
- if (processedInlineImages.containsKey(cosStream)) {
- return;
- }
- processedInlineImages.put(cosStream, imageNumber);
- }
-
- embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
- TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
- if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
- ByteArrayOutputStream buffer = new ByteArrayOutputStream();
- try {
- //extract the metadata contained outside of the image
- PDMetadataExtractor.extract(image.getMetadata(),
- embeddedMetadata, context);
- try {
- writeToBuffer(image, extension, buffer);
- } catch (IOException e) {
- EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
- return;
- }
- try (InputStream embeddedIs = TikaInputStream.get(buffer.toByteArray())) {
- embeddedDocumentExtractor.parseEmbedded(
- embeddedIs,
- new EmbeddedContentHandler(xhtml),
- embeddedMetadata, false);
- }
- } catch (IOException e) {
- handleCatchableIOE(e);
- }
- }
- }
- }
-
- //nearly directly copied from PDFBox ExtractImages
- private void writeToBuffer(PDImageXObject pdImage, String suffix, OutputStream out)
- throws IOException {
-
- BufferedImage image = pdImage.getImage();
- if (image != null) {
- if ("jpg".equals(suffix)) {
- String colorSpaceName = pdImage.getColorSpace().getName();
- //TODO: figure out if we want directJPEG as a configuration
- //previously: if (directJPeg || PDDeviceGray....
- if (PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) ||
- PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName)) {
- // RGB or Gray colorspace: get and write the unmodifiedJPEG stream
- InputStream data = pdImage.getStream().createInputStream(JPEG);
- org.apache.pdfbox.io.IOUtils.copy(data, out);
- org.apache.pdfbox.io.IOUtils.closeQuietly(data);
- } else {
- // for CMYK and other "unusual" colorspaces, the JPEG will be converted
- ImageIOUtil.writeImage(image, suffix, out);
- }
- } else if ("jp2".equals(suffix) || "jpx".equals(suffix)) {
- InputStream data = pdImage.createInputStream(JP2);
- org.apache.pdfbox.io.IOUtils.copy(data, out);
- org.apache.pdfbox.io.IOUtils.closeQuietly(data);
- } else if ("jb2".equals(suffix)) {
- InputStream data = pdImage.createInputStream(JB2);
- org.apache.pdfbox.io.IOUtils.copy(data, out);
- org.apache.pdfbox.io.IOUtils.closeQuietly(data);
- } else{
- ImageIOUtil.writeImage(image, suffix, out);
+ ImageGraphicsEngine engine = new ImageGraphicsEngine(page, embeddedDocumentExtractor,
+ config, processedInlineImages, inlineImageCounter, xhtml, metadata, context);
+ engine.run();
+ List<IOException> engineExceptions = engine.getExceptions();
+ if (engineExceptions.size() > 0) {
+ IOException first = engineExceptions.remove(0);
+ if (config.getCatchIntermediateIOExceptions()) {
+ exceptions.addAll(engineExceptions);
}
+ throw first;
}
- out.flush();
}
@Override
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 8709451..fa6e962 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -31,6 +31,8 @@ import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
@@ -68,6 +70,7 @@ import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
import org.junit.AfterClass;
import org.junit.BeforeClass;
+import org.junit.Ignore;
import org.junit.Test;
import org.xml.sax.ContentHandler;
@@ -687,7 +690,7 @@ public class PDFParserTest extends TikaTest {
assertEquals("Invalid width.", "352", metadatas.get(1).get("width"));
assertNull(metadatas.get(0).get(Metadata.RESOURCE_NAME_KEY));
- assertEquals("image0.jb2",
+ assertEquals("image0.jb2",
metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY));
assertEquals(MediaType.image("x-jbig2").toString(),
metadatas.get(1).get(Metadata.CONTENT_TYPE));
@@ -1476,6 +1479,34 @@ public class PDFParserTest extends TikaTest {
assertEquals(120, unmappedUnicodeChars[15]);
}
+
+ @Test //TIKA-3041
+ @Ignore("turn back on if we add file from PDFBOX-52")
+ public void testPDFBox52() throws Exception {
+ PDFParserConfig config = new PDFParserConfig();
+ config.setExtractInlineImages(true);
+ config.setExtractUniqueInlineImagesOnly(false);
+ ParseContext context = new ParseContext();
+ context.set(PDFParserConfig.class, config);
+
+ List<Metadata> metadataList = getRecursiveMetadata("testPDF_PDFBOX-52.pdf", context);
+ int max = 0;
+ Matcher matcher = Pattern.compile("image(\\d+)").matcher("");
+ for (Metadata m : metadataList) {
+ String n = m.get(Metadata.RESOURCE_NAME_KEY);
+ // track the largest N found in resource names containing "image<N>"
+ if (n != null && matcher.reset(n).find()) {
+ int i = Integer.parseInt(matcher.group(1));
+ if (i > max) {
+ max = i;
+ }
+ }
+ }
+ assertEquals(37, metadataList.size());
+ assertEquals(35, max);
+ }
+
+
/**
* Simple class to count end of document events. If functionality is useful,
* move to org.apache.tika in src/test