You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/02/12 17:13:11 UTC

[tika] branch branch_1x updated: TIKA-3041 -- extract inline images that rely on the DCT filter.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new da79e31  TIKA-3041 -- extract inline images that rely on the DCT filter.
da79e31 is described below

commit da79e3105223fe897c36d6f5d30000fc602ae3a1
Author: tallison <ta...@apache.org>
AuthorDate: Wed Feb 12 11:50:34 2020 -0500

    TIKA-3041 -- extract inline images that rely on the DCT filter.
    
    # Conflicts:
    #	tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
    #	tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
---
 CHANGES.txt                                        |   1 +
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |   2 +-
 .../tika/parser/pdf/ImageGraphicsEngine.java       | 423 +++++++++++++++++++++
 .../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 198 ++--------
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  33 +-
 5 files changed, 479 insertions(+), 178 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index a002db3..800bf54 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
 Release 1.24 - ???
+   * Extract inline images that rely on the DCT filter from PDFs (TIKA-3041).
 
    * Upgrade to PDFBox 2.0.18 (TIKA-3021).
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index ea3b173..8f55086 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -310,7 +310,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     }
 
     void handleCatchableIOE(IOException e) throws IOException {
-        if (config.isCatchIntermediateIOExceptions()) {
+        if (config.getCatchIntermediateIOExceptions()) {
             if (e.getCause() instanceof SAXException && e.getCause().getMessage() != null &&
                     e.getCause().getMessage().contains("Your document contained more than")) {
                 //TODO -- is there a cleaner way of checking for:
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
new file mode 100644
index 0000000..2625ddb
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
@@ -0,0 +1,423 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.cos.COSStream;
+import org.apache.pdfbox.filter.MissingImageReaderException;
+import org.apache.pdfbox.io.IOUtils;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDResources;
+import org.apache.pdfbox.pdmodel.font.PDFont;
+import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
+import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
+import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
+import org.apache.pdfbox.pdmodel.graphics.color.PDPattern;
+import org.apache.pdfbox.pdmodel.graphics.form.PDTransparencyGroup;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
+import org.apache.pdfbox.pdmodel.graphics.pattern.PDAbstractPattern;
+import org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern;
+import org.apache.pdfbox.pdmodel.graphics.state.PDSoftMask;
+import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
+import org.apache.pdfbox.tools.imageio.ImageIOUtil;
+import org.apache.pdfbox.util.Matrix;
+import org.apache.pdfbox.util.Vector;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.IOExceptionWithCause;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+
+import java.awt.geom.Point2D;
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * Copied nearly verbatim from PDFBox
+ */
+class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
+
+
+    private static final List<String> JPEG = Arrays.asList(
+            COSName.DCT_DECODE.getName(),
+            COSName.DCT_DECODE_ABBREVIATION.getName());
+
+    // JPEG, JP2 and JB2 are the filter-name lists passed to PDImage.createInputStream(...) in writeToBuffer
+    private static final List<String> JP2 =
+            Arrays.asList(COSName.JPX_DECODE.getName());
+
+    private static final List<String> JB2 = Arrays.asList(
+            COSName.JBIG2_DECODE.getName());
+    // when true, writeToBuffer copies JPEG/JPEG2000 data out unmodified regardless of colorspace or masks
+    //TODO: parameterize this ?
+    private boolean useDirectJPEG = false;
+    // IOExceptions recorded (rather than rethrown) by handleCatchableIOE; returned by getExceptions()
+    final List<IOException> exceptions = new ArrayList<>();
+    private final EmbeddedDocumentExtractor embeddedDocumentExtractor;
+    private final PDFParserConfig pdfParserConfig;
+    private final Map<COSStream, Integer> processedInlineImages;
+    private final AtomicInteger imageCounter;
+    private final Metadata parentMetadata;
+    private final XHTMLContentHandler xhtml;
+    private final ParseContext parseContext;
+
+    //TODO: eight positional parameters -- replace with a builder or a context object
+    protected ImageGraphicsEngine(PDPage page, EmbeddedDocumentExtractor embeddedDocumentExtractor,
+                                  PDFParserConfig pdfParserConfig, Map<COSStream, Integer> processedInlineImages,
+                                  AtomicInteger imageCounter, XHTMLContentHandler xhtml, Metadata parentMetadata,
+                                  ParseContext parseContext) {
+        super(page);
+        this.embeddedDocumentExtractor = embeddedDocumentExtractor;
+        this.pdfParserConfig = pdfParserConfig;
+        this.processedInlineImages = processedInlineImages; // caller-owned map: image stream -> assigned image number
+        this.imageCounter = imageCounter; // caller-owned counter, so numbering continues across engine instances
+        this.xhtml = xhtml;
+        this.parentMetadata = parentMetadata; // receives the warnings/exceptions recorded while extracting images
+        this.parseContext = parseContext;
+    }
+
+    /**
+     * Processes the page's content stream, then the transparency group of every
+     * soft mask found in the page's extended graphics states.
+     * An IOException raised while handling a soft mask is passed to
+     * {@link #handleCatchableIOE(IOException)}.
+     *
+     * @throws IOException if page processing fails, or if handleCatchableIOE rethrows
+     */
+    void run() throws IOException {
+        PDPage currentPage = getPage();
+
+        //TODO: is there a better way to do this rather than reprocessing the page
+        //can we process the text and images in one go?
+        processPage(currentPage);
+        PDResources resources = currentPage.getResources();
+        if (resources == null) {
+            return;
+        }
+
+        for (COSName gsName : resources.getExtGStateNames()) {
+            PDSoftMask mask = resources.getExtGState(gsName).getSoftMask();
+            if (mask == null) {
+                continue;
+            }
+            try {
+                PDTransparencyGroup maskGroup = mask.getGroup();
+                if (maskGroup == null) {
+                    continue;
+                }
+                // PDFBOX-4327: without this line NPEs will occur
+                resources.getExtGState(gsName).copyIntoGraphicsState(getGraphicsState());
+                processSoftMask(maskGroup);
+            } catch (IOException e) {
+                handleCatchableIOE(e);
+            }
+        }
+    }
+
+    /**
+     * Assigns an image number to the image and hands it to {@code processImage}.
+     * <p>
+     * A {@link PDImageXObject} is tracked by its COS stream: the first time it is
+     * seen it gets the next counter value. When it is seen again it is skipped if
+     * unique-only extraction is configured; otherwise it is emitted again under
+     * the number it was first given. Any other {@link PDImage} always gets a
+     * fresh number.
+     *
+     * @param pdImage the image encountered in the content stream
+     * @throws IOException wrapping a SAXException from processImage, or rethrown
+     *                     by {@link #handleCatchableIOE(IOException)}
+     */
+    @Override
+    public void drawImage(PDImage pdImage) throws IOException {
+        int imageNumber;
+        if (pdImage instanceof PDImageXObject) {
+            if (pdImage.isStencil()) {
+                processColor(getGraphicsState().getNonStrokingColor());
+            }
+
+            PDImageXObject xobject = (PDImageXObject) pdImage;
+            Integer cachedNumber = processedInlineImages.get(xobject.getCOSObject());
+            if (cachedNumber == null) {
+                imageNumber = imageCounter.getAndIncrement();
+                processedInlineImages.put(xobject.getCOSObject(), imageNumber);
+            } else if (pdfParserConfig.getExtractUniqueInlineImagesOnly()) {
+                // skip duplicate image
+                return;
+            } else {
+                // reuse the number from the first occurrence; leaving it at 0 would
+                // name every repeated image "image0"
+                imageNumber = cachedNumber;
+            }
+        } else {
+            imageNumber = imageCounter.getAndIncrement();
+        }
+        //TODO: should we use the hash of the PDImage to check for seen
+        //For now, we're relying on the cosobject, but this could lead to
+        //duplicates if the pdImage is not a PDImageXObject?
+        try {
+            processImage(pdImage, imageNumber);
+        } catch (SAXException e) {
+            throw new IOExceptionWithCause(e);
+        } catch (IOException e) {
+            handleCatchableIOE(e);
+        }
+    }
+
+    @Override
+    public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3)
+            throws IOException {
+        // intentionally empty: this engine does not record path geometry
+    }
+
+    @Override
+    public void clip(int windingRule) throws IOException {
+        // intentionally empty: clipping is not tracked by this engine
+    }
+
+    @Override
+    public void moveTo(float x, float y) throws IOException {
+        // intentionally empty: this engine does not record path geometry
+    }
+
+    @Override
+    public void lineTo(float x, float y) throws IOException {
+        // intentionally empty: this engine does not record path geometry
+    }
+
+    @Override
+    public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3)
+            throws IOException {
+        // intentionally empty: this engine does not record path geometry
+    }
+
+    @Override
+    public Point2D getCurrentPoint() throws IOException {
+        return new Point2D.Float(0, 0); // fixed placeholder: moveTo/lineTo/curveTo record nothing
+    }
+
+    @Override
+    public void closePath() throws IOException {
+        // intentionally empty: this engine does not record path geometry
+    }
+
+    @Override
+    public void endPath() throws IOException {
+        // intentionally empty: this engine does not record path geometry
+    }
+
+    @Override
+    protected void showGlyph(Matrix textRenderingMatrix,
+                             PDFont font,
+                             int code,
+                             String unicode,
+                             Vector displacement) throws IOException {
+        // the glyph itself is ignored; only the colors selected by the rendering mode go to processColor
+        RenderingMode renderingMode = getGraphicsState().getTextState().getRenderingMode();
+        if (renderingMode.isFill()) {
+            processColor(getGraphicsState().getNonStrokingColor());
+        }
+        // deliberately not an else: the stroke check is evaluated whether or not the fill check matched
+        if (renderingMode.isStroke()) {
+            processColor(getGraphicsState().getStrokingColor());
+        }
+    }
+
+    @Override
+    public void strokePath() throws IOException {
+        processColor(getGraphicsState().getStrokingColor()); // path is ignored; only the stroke color is inspected
+    }
+
+    @Override
+    public void fillPath(int windingRule) throws IOException {
+        processColor(getGraphicsState().getNonStrokingColor()); // path is ignored; only the fill color is inspected
+    }
+
+    /**
+     * Inspects both paint colors of a fill-and-stroke operation, because either
+     * one may be a tiling pattern (see {@code processColor}).
+     *
+     * @param windingRule unused; the path itself is not tracked
+     * @throws IOException if processing a tiling pattern fails
+     */
+    @Override
+    public void fillAndStrokePath(int windingRule) throws IOException {
+        processColor(getGraphicsState().getNonStrokingColor());
+        // the stroke color used to be skipped here, unlike in showGlyph/strokePath
+        processColor(getGraphicsState().getStrokingColor());
+    }
+
+    @Override
+    public void shadingFill(COSName shadingName) throws IOException {
+        // intentionally empty: shadings are not inspected by this engine
+    }
+
+    /**
+     * Runs the color's pattern through the engine when the color lives in a
+     * pattern colorspace and that pattern is a tiling pattern. Every other
+     * color is ignored.
+     *
+     * @param color the stroking or non-stroking color to inspect
+     * @throws IOException if resolving or processing the pattern fails
+     */
+    private void processColor(PDColor color) throws IOException {
+        if (!(color.getColorSpace() instanceof PDPattern)) {
+            return;
+        }
+        PDPattern patternSpace = (PDPattern) color.getColorSpace();
+        PDAbstractPattern resolved = patternSpace.getPattern(color);
+        if (resolved instanceof PDTilingPattern) {
+            processTilingPattern((PDTilingPattern) resolved, null, null);
+        }
+    }
+
+    /**
+     * Writes an {@code <img>} element for the image and, if the embedded
+     * document extractor accepts it, serializes the image and passes the bytes
+     * to the extractor.
+     * <p>
+     * A failure while serializing the image is recorded on the parent metadata
+     * and the image is skipped; it is not rethrown.
+     *
+     * @param image       the image to export
+     * @param imageNumber number used in the generated name: "image" + number + "." + suffix
+     * @throws IOException  on failures other than serializing the image
+     * @throws SAXException if the XHTML handler or the embedded parse raises one
+     */
+    private void processImage(PDImage image, int imageNumber) throws IOException, SAXException {
+        //this is the metadata for this particular image
+        Metadata metadata = new Metadata();
+        String suffix = getSuffix(image, metadata);
+        String fileName = "image" + imageNumber + "." + suffix;
+
+        AttributesImpl attr = new AttributesImpl();
+        attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName);
+        attr.addAttribute("", "alt", "alt", "CDATA", fileName);
+        xhtml.startElement("img", attr);
+        xhtml.endElement("img");
+
+        metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
+        metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+
+        if (!embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
+            return;
+        }
+        if (image instanceof PDImageXObject) {
+            //extract the metadata contained outside of the image
+            PDMetadataExtractor.extract(((PDImageXObject) image).getMetadata(),
+                    metadata, parseContext);
+        }
+        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+        try {
+            writeToBuffer(image, suffix, useDirectJPEG, buffer);
+        } catch (MissingImageReaderException e) {
+            EmbeddedDocumentUtil.recordException(e, parentMetadata);
+            return;
+        } catch (IOException e) {
+            // record on the parent: this image's own Metadata object is dropped
+            // on return, so anything recorded there would be lost
+            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
+            return;
+        }
+        try (InputStream embeddedIs = TikaInputStream.get(buffer.toByteArray())) {
+            embeddedDocumentExtractor.parseEmbedded(
+                    embeddedIs,
+                    new EmbeddedContentHandler(xhtml),
+                    metadata, false);
+        }
+    }
+
+    /**
+     * Determines the file suffix for the image and sets the matching
+     * content type on {@code metadata}.
+     * A missing suffix is treated as "png" and "tiff" is shortened to "tif";
+     * an unrecognized suffix is returned unchanged with no content type set.
+     *
+     * @param image    the image whose suffix is read
+     * @param metadata receives {@code Metadata.CONTENT_TYPE} for known suffixes
+     * @return the suffix to use in the generated file name
+     */
+    private String getSuffix(PDImage image, Metadata metadata) {
+        String suffix = image.getSuffix();
+        if (suffix == null) {
+            suffix = "png";
+        }
+        switch (suffix) {
+            case "png":
+                metadata.set(Metadata.CONTENT_TYPE, "image/png");
+                break;
+            case "jpg":
+                metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
+                break;
+            case "tiff":
+                metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
+                suffix = "tif";
+                break;
+            case "jpx":
+                metadata.set(Metadata.CONTENT_TYPE, "image/jp2");
+                break;
+            case "jb2":
+                metadata.set(Metadata.CONTENT_TYPE, "image/x-jbig2");
+                break;
+            default:
+                //TODO: determine if we need to add more image types
+                break;
+        }
+        return suffix;
+    }
+
+    /**
+     * Either rethrows the exception or records it, depending on configuration.
+     * When intermediate IOExceptions are not being caught, or when the cause is a
+     * SAXException whose message contains "Your document contained more than",
+     * the exception is rethrown. Otherwise its message is added to the parent
+     * metadata as a warning and the exception is appended to {@code exceptions}.
+     *
+     * @param e the exception to handle
+     * @throws IOException {@code e} itself, in the rethrow cases above
+     */
+    void handleCatchableIOE(IOException e) throws IOException {
+        if (!pdfParserConfig.getCatchIntermediateIOExceptions()) {
+            throw e;
+        }
+
+        Throwable cause = e.getCause();
+        boolean writeLimitReached = cause instanceof SAXException
+                && cause.getMessage() != null
+                && cause.getMessage().contains("Your document contained more than");
+        if (writeLimitReached) {
+            //TODO -- is there a cleaner way of checking for:
+            // WriteOutContentHandler.WriteLimitReachedException?
+            throw e;
+        }
+
+        String msg = (e.getMessage() != null) ? e.getMessage() : "IOException, no message";
+        parentMetadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, msg);
+        exceptions.add(e);
+    }
+
+    List<IOException> getExceptions() {
+        return exceptions; // the live list filled by handleCatchableIOE, not a copy
+    }
+
+    /**
+     * Serializes the image into {@code out}; nearly directly copied from PDFBox ExtractImages.
+     * <ul>
+     *   <li>"jpg", and "jp2"/"jpx": the encoded stream is copied unmodified when
+     *       {@code directJPEG} is set or the image is unmasked DeviceGray/DeviceRGB;
+     *       otherwise the decoded image is re-encoded.</li>
+     *   <li>"tif"/"tiff" in DeviceGray: the image is copied into a
+     *       TYPE_BYTE_BINARY image before writing.</li>
+     *   <li>"jb2": the encoded stream is copied unmodified.</li>
+     *   <li>anything else: the decoded image is written with {@code suffix} as the format name.</li>
+     * </ul>
+     * Nothing is written when the image cannot be decoded ({@code getImage()} returns null).
+     *
+     * @param pdImage    the image to serialize
+     * @param suffix     suffix as produced for the file name; both the raw PDFBox
+     *                   values ("jpx", "tiff") and the renamed ones ("jp2", "tif") are accepted
+     * @param directJPEG copy JPEG/JPEG2000 data unmodified regardless of colorspace or masks
+     * @param out        destination; flushed but not closed
+     * @throws IOException if decoding, copying or encoding fails
+     */
+    private static void writeToBuffer(PDImage pdImage, String suffix, boolean directJPEG, OutputStream out)
+            throws IOException {
+
+        BufferedImage image = pdImage.getImage();
+        if (image != null) {
+            if ("jpg".equals(suffix)) {
+                if (directJPEG || isUnmaskedGrayOrRgb(pdImage)) {
+                    // RGB or Gray colorspace: get and write the unmodified JPEG stream
+                    copyEncodedStream(pdImage, JPEG, out);
+                } else {
+                    // for CMYK and other "unusual" colorspaces, the JPEG will be converted
+                    ImageIOUtil.writeImage(image, suffix, out);
+                }
+            } else if ("jp2".equals(suffix) || "jpx".equals(suffix)) {
+                // getSuffix() in this class passes "jpx" through unchanged, so testing
+                // only for "jp2" left this branch unreachable
+                if (directJPEG || isUnmaskedGrayOrRgb(pdImage)) {
+                    // RGB or Gray colorspace: get and write the unmodified JPEG2000 stream
+                    copyEncodedStream(pdImage, JP2, out);
+                } else {
+                    // for CMYK and other "unusual" colorspaces, the image will be converted
+                    ImageIOUtil.writeImage(image, "jpeg2000", out);
+                }
+            } else if (("tif".equals(suffix) || "tiff".equals(suffix))
+                    && pdImage.getColorSpace().equals(PDDeviceGray.INSTANCE)) {
+                // getSuffix() in this class renames "tiff" to "tif", so both are accepted here
+                // CCITT compressed images can have a different colorspace, but this one is B/W
+                // This is a bitonal image, so copy to TYPE_BYTE_BINARY
+                // so that a G4 compressed TIFF image is created by ImageIOUtil.writeImage()
+                int w = image.getWidth();
+                int h = image.getHeight();
+                BufferedImage bitonalImage = new BufferedImage(w, h, BufferedImage.TYPE_BYTE_BINARY);
+                // copy image the old fashioned way - ColorConvertOp is slower!
+                for (int y = 0; y < h; y++) {
+                    for (int x = 0; x < w; x++) {
+                        bitonalImage.setRGB(x, y, image.getRGB(x, y));
+                    }
+                }
+                ImageIOUtil.writeImage(bitonalImage, suffix, out);
+            } else if ("jb2".equals(suffix)) {
+                copyEncodedStream(pdImage, JB2, out);
+            } else {
+                ImageIOUtil.writeImage(image, suffix, out);
+            }
+        }
+        out.flush();
+    }
+
+    /** Returns true when the image has no masks and its colorspace is DeviceGray or DeviceRGB. */
+    private static boolean isUnmaskedGrayOrRgb(PDImage pdImage) throws IOException {
+        String colorSpaceName = pdImage.getColorSpace().getName();
+        return !hasMasks(pdImage) &&
+                (PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) ||
+                        PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName));
+    }
+
+    /** Copies the stream obtained from createInputStream(filters) to out, closing it even if the copy fails. */
+    private static void copyEncodedStream(PDImage pdImage, List<String> filters, OutputStream out)
+            throws IOException {
+        InputStream data = pdImage.createInputStream(filters);
+        try {
+            IOUtils.copy(data, out);
+        } finally {
+            IOUtils.closeQuietly(data);
+        }
+    }
+
+    /**
+     * Reports whether the image is a {@link PDImageXObject} that has a mask
+     * or a soft mask. Any other kind of image is reported as having none.
+     *
+     * @throws IOException if reading either mask fails
+     */
+    private static boolean hasMasks(PDImage pdImage) throws IOException {
+        if (!(pdImage instanceof PDImageXObject)) {
+            return false;
+        }
+        PDImageXObject xObject = (PDImageXObject) pdImage;
+        if (xObject.getMask() != null) {
+            return true;
+        }
+        return xObject.getSoftMask() != null;
+    }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 4551e7b..4ed0d90 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -16,47 +16,29 @@
  */
 package org.apache.tika.parser.pdf;
 
+import java.io.IOException;
+import java.io.Writer;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
+
 import org.apache.pdfbox.cos.COSArray;
-import org.apache.pdfbox.cos.COSBase;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.cos.COSStream;
-import org.apache.pdfbox.filter.MissingImageReaderException;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDPageContentStream;
-import org.apache.pdfbox.pdmodel.PDResources;
-import org.apache.pdfbox.pdmodel.graphics.PDXObject;
-import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
-import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
-import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
-import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.pdfbox.text.TextPosition;
-import org.apache.pdfbox.tools.imageio.ImageIOUtil;
 import org.apache.pdfbox.util.Matrix;
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.extractor.EmbeddedDocumentUtil;
-import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.EmbeddedContentHandler;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
-import java.awt.image.BufferedImage;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.Writer;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
 
 /**
  * Utility class that overrides the {@link PDFTextStripper} functionality
@@ -66,16 +48,6 @@ import java.util.Set;
 class PDF2XHTML extends AbstractPDF2XHTML {
 
 
-    private static final List<String> JPEG = Arrays.asList(
-            COSName.DCT_DECODE.getName(),
-            COSName.DCT_DECODE_ABBREVIATION.getName());
-
-    private static final List<String> JP2 =
-            Arrays.asList(COSName.JPX_DECODE.getName());
-
-    private static final List<String> JB2 = Arrays.asList(
-            COSName.JBIG2_DECODE.getName());
-
     /**
      * This keeps track of the pdf object ids for inline
      * images that have been processed.
@@ -88,7 +60,7 @@ class PDF2XHTML extends AbstractPDF2XHTML {
      * TIKA-1742, we're limiting the export to one image per page.
      */
     private Map<COSStream, Integer> processedInlineImages = new HashMap<>();
-    private int inlineImageCounter = 0;
+    private AtomicInteger inlineImageCounter = new AtomicInteger(0);
     private PDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
                       PDFParserConfig config)
             throws IOException {
@@ -162,7 +134,7 @@ class PDF2XHTML extends AbstractPDF2XHTML {
         try {
             writeParagraphEnd();
             try {
-                extractImages(page.getResources(), new HashSet<COSBase>());
+                extractImages(page);
             } catch (IOException e) {
                 handleCatchableIOE(e);
             }
@@ -174,148 +146,22 @@ class PDF2XHTML extends AbstractPDF2XHTML {
         }
     }
 
-    private void extractImages(PDResources resources, Set<COSBase> seenThisPage) throws SAXException, IOException {
-        if (resources == null || config.getExtractInlineImages() == false) {
-            return;
-        }
-
-        for (COSName name : resources.getXObjectNames()) {
-
-            PDXObject object = null;
-            try {
-                object = resources.getXObject(name);
-            } catch (MissingImageReaderException e) {
-                EmbeddedDocumentUtil.recordException(e, metadata);
-                continue;
-            } catch (IOException e) {
-                EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
-                continue;
-            }
-            processImageObject(object, seenThisPage);
-        }
-    }
-
-    private void processImageObject(PDXObject object, Set<COSBase> seenThisPage) throws SAXException, IOException {
-        if (object == null) {
+    private void extractImages(PDPage page) throws SAXException, IOException {
+        if (config.getExtractInlineImages() == false) {
             return;
         }
-        COSStream cosStream = object.getCOSObject();
-        if (seenThisPage.contains(cosStream)) {
-            //avoid infinite recursion TIKA-1742
-            return;
-        }
-        seenThisPage.add(cosStream);
-
-        if (object instanceof PDFormXObject) {
-            extractImages(((PDFormXObject) object).getResources(), seenThisPage);
-        } else if (object instanceof PDImageXObject) {
-
-            PDImageXObject image = (PDImageXObject) object;
-
-            Metadata embeddedMetadata = new Metadata();
-            String extension = image.getSuffix();
-
-            if (extension == null || extension.equals("png")) {
-                embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/png");
-                extension = "png";
-            } else if (extension.equals("jpg")) {
-                embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
-            } else if (extension.equals("tiff")) {
-                embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/tiff");
-                extension = "tif";
-            } else if (extension.equals("jpx")) {
-                embeddedMetadata.set(Metadata.CONTENT_TYPE, "image/jp2");
-            } else if (extension.equals("jb2")) {
-                embeddedMetadata.set(
-                        Metadata.CONTENT_TYPE, "image/x-jbig2");
-            } else {
-                //TODO: determine if we need to add more image types
-//                    throw new RuntimeException("EXTEN:" + extension);
-                }
-                Integer imageNumber = processedInlineImages.get(cosStream);
-                if (imageNumber == null) {
-                    imageNumber = inlineImageCounter++;
-                }
-                String fileName = "image" + imageNumber + "."+extension;
-                embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
-
-            // Output the img tag
-            AttributesImpl attr = new AttributesImpl();
-            attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName);
-            attr.addAttribute("", "alt", "alt", "CDATA", fileName);
-            xhtml.startElement("img", attr);
-            xhtml.endElement("img");
-
-            //Do we only want to process unique COSObject ids?
-            //If so, have we already processed this one?
-            if (config.getExtractUniqueInlineImagesOnly() == true) {
-                if (processedInlineImages.containsKey(cosStream)) {
-                    return;
-                }
-                processedInlineImages.put(cosStream, imageNumber);
-            }
-
-            embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
-                    TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
 
-            if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
-                ByteArrayOutputStream buffer = new ByteArrayOutputStream();
-                try {
-                    //extract the metadata contained outside of the image
-                    PDMetadataExtractor.extract(image.getMetadata(),
-                            embeddedMetadata, context);
-                    try {
-                        writeToBuffer(image, extension, buffer);
-                    } catch (IOException e) {
-                        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
-                        return;
-                    }
-                    try (InputStream embeddedIs = TikaInputStream.get(buffer.toByteArray())) {
-                        embeddedDocumentExtractor.parseEmbedded(
-                                embeddedIs,
-                                new EmbeddedContentHandler(xhtml),
-                                embeddedMetadata, false);
-                    }
-                } catch (IOException e) {
-                    handleCatchableIOE(e);
-                }
-            }
-        }
-    }
-
-    //nearly directly copied from PDFBox ExtractImages
-    private void writeToBuffer(PDImageXObject pdImage, String suffix, OutputStream out)
-            throws IOException {
-
-        BufferedImage image = pdImage.getImage();
-        if (image != null) {
-            if ("jpg".equals(suffix)) {
-                String colorSpaceName = pdImage.getColorSpace().getName();
-                //TODO: figure out if we want directJPEG as a configuration
-                //previously: if (directJPeg || PDDeviceGray....
-                if (PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) ||
-                        PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName)) {
-                    // RGB or Gray colorspace: get and write the unmodifiedJPEG stream
-                    InputStream data = pdImage.getStream().createInputStream(JPEG);
-                    org.apache.pdfbox.io.IOUtils.copy(data, out);
-                    org.apache.pdfbox.io.IOUtils.closeQuietly(data);
-                } else {
-                    // for CMYK and other "unusual" colorspaces, the JPEG will be converted
-                    ImageIOUtil.writeImage(image, suffix, out);
-                }
-            } else if ("jp2".equals(suffix) || "jpx".equals(suffix)) {
-                InputStream data = pdImage.createInputStream(JP2);
-                org.apache.pdfbox.io.IOUtils.copy(data, out);
-                org.apache.pdfbox.io.IOUtils.closeQuietly(data);
-            } else if ("jb2".equals(suffix)) {
-                InputStream data = pdImage.createInputStream(JB2);
-                org.apache.pdfbox.io.IOUtils.copy(data, out);
-                org.apache.pdfbox.io.IOUtils.closeQuietly(data);
-            } else{
-                ImageIOUtil.writeImage(image, suffix, out);
+        ImageGraphicsEngine engine = new ImageGraphicsEngine(page, embeddedDocumentExtractor,
+                config, processedInlineImages, inlineImageCounter, xhtml, metadata, context);
+        engine.run();
+        List<IOException> engineExceptions = engine.getExceptions();
+        if (engineExceptions.size() > 0) {
+            IOException first = engineExceptions.remove(0);
+            if (config.getCatchIntermediateIOExceptions()) {
+                exceptions.addAll(engineExceptions);
             }
+            throw first;
         }
-        out.flush();
     }
 
     @Override
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 8709451..fa6e962 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -31,6 +31,8 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
@@ -68,6 +70,7 @@ import org.apache.tika.sax.ContentHandlerDecorator;
 import org.apache.tika.sax.RecursiveParserWrapperHandler;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
+import org.junit.Ignore;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
 
@@ -687,7 +690,7 @@ public class PDFParserTest extends TikaTest {
         assertEquals("Invalid width.", "352", metadatas.get(1).get("width"));
         
         assertNull(metadatas.get(0).get(Metadata.RESOURCE_NAME_KEY));
-        assertEquals("image0.jb2", 
+        assertEquals("image0.jb2",
                 metadatas.get(1).get(Metadata.RESOURCE_NAME_KEY));
         assertEquals(MediaType.image("x-jbig2").toString(), 
                 metadatas.get(1).get(Metadata.CONTENT_TYPE));
@@ -1476,6 +1479,34 @@ public class PDFParserTest extends TikaTest {
         assertEquals(120, unmappedUnicodeChars[15]);
 
     }
+
+    @Test //TIKA-3041
+    @Ignore("turn back on if we add file from PDFBOX-52")
+    public void testPDFBox52() throws Exception {
+        PDFParserConfig config = new PDFParserConfig();
+        config.setExtractInlineImages(true);
+        config.setExtractUniqueInlineImagesOnly(false);
+        ParseContext context = new ParseContext();
+        context.set(PDFParserConfig.class, config);
+
+        List<Metadata> metadataList = getRecursiveMetadata("testPDF_PDFBOX-52.pdf", context);
+        int max = 0;
+        Matcher matcher = Pattern.compile("image(\\d+)").matcher("");
+        for (Metadata m : metadataList) {
+            String n = m.get(Metadata.RESOURCE_NAME_KEY);
+            // pull the number out of resource names containing "image<digits>" and keep the largest one
+            if (n != null && matcher.reset(n).find()) {
+                int i = Integer.parseInt(matcher.group(1));
+                if (i > max) {
+                    max = i;
+                }
+            }
+        }
+        assertEquals(37, metadataList.size()); // presumably the container plus images 0..35 -- confirm once the file is added
+        assertEquals(35, max);
+    }
+
+
     /**
      * Simple class to count end of document events.  If functionality is useful,
      * move to org.apache.tika in src/test