You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/12 14:03:19 UTC

[tika] branch main updated: TIKA-3755 -- make renderresult more flexible to allow for an open container or an inputstream; add image strategy end of page

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 8ce439ce1 TIKA-3755 -- make renderresult more flexible to allow for an open container or an inputstream; add image strategy end of page
     new 7b79e0b6e Merge remote-tracking branch 'origin/main' into main
8ce439ce1 is described below

commit 8ce439ce15b2f16292cac340f5d50b16c7b3c158
Author: tallison <ta...@apache.org>
AuthorDate: Thu May 12 10:02:08 2022 -0400

    TIKA-3755 -- make renderresult more flexible to allow for an open container or an inputstream; add image strategy end of page
---
 .../org/apache/tika/renderer/RenderResult.java     | 42 +++++++++++++++++---
 .../org/apache/tika/renderer/RenderResults.java    |  8 +---
 .../org/apache/tika/renderer/RenderingTracker.java |  3 ++
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  | 24 ++++++------
 .../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 45 +++++++++++++++++++++-
 .../java/org/apache/tika/parser/pdf/PDFParser.java | 29 ++++++++------
 .../apache/tika/parser/pdf/PDFParserConfig.java    | 28 ++++++++++++--
 .../tika/renderer/pdf/mutool/MuPDFRenderer.java    |  2 +-
 .../tika/renderer/pdf/pdfbox/PDFBoxRenderer.java   | 22 ++++++++---
 .../apache/tika/parser/pdf/PDFRenderingTest.java   |  9 +++--
 .../tika/parser/pdf/tika-rendering-config.xml      | 13 ++++++-
 ...nfig.xml => tika-rendering-per-page-config.xml} | 13 +++++--
 12 files changed, 181 insertions(+), 57 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
index 888b0dd4c..3fd8d7d2c 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
@@ -16,11 +16,17 @@
  */
 package org.apache.tika.renderer;
 
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
 import java.nio.file.Path;
 
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 
-public class RenderResult {
+public class RenderResult implements Closeable {
 
     public enum STATUS {
         SUCCESS,
@@ -30,20 +36,40 @@ public class RenderResult {
     private final STATUS status;
 
     private final int id;
-    private final Path path;
+
+    private final Object result;
+
     //TODO: we're relying on metadata to bring in a bunch of info.
     //Might be cleaner to add specific parameters for page number, embedded path, etc.?
     private final Metadata metadata;
 
-    public RenderResult(STATUS status, int id, Path path, Metadata metadata) {
+    TemporaryResources tmp = new TemporaryResources();
+
+    public RenderResult(STATUS status, int id, Object result, Metadata metadata) {
         this.status = status;
         this.id = id;
-        this.path = path;
+        this.result = result;
         this.metadata = metadata;
+        if (result instanceof Path) {
+            tmp.addResource(new Closeable() {
+                @Override
+                public void close() throws IOException {
+                    Files.delete((Path)result);
+                }
+            });
+        } else if (result instanceof Closeable) {
+            tmp.addResource((Closeable) result);
+        }
     }
 
-    public Path getPath() {
-        return path;
+    public InputStream getInputStream() throws IOException {
+        if (result instanceof Path) {
+            return TikaInputStream.get((Path)result, metadata);
+        } else {
+            TikaInputStream tis = TikaInputStream.get(new byte[0]);
+            tis.setOpenContainer(result);
+            return tis;
+        }
     }
 
     public Metadata getMetadata() {
@@ -58,5 +84,9 @@ public class RenderResult {
         return id;
     }
 
+    @Override
+    public void close() throws IOException {
+        tmp.close();
+    }
 
 }
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java
index 12d60d3da..108c06260 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java
@@ -18,7 +18,6 @@ package org.apache.tika.renderer;
 
 import java.io.Closeable;
 import java.io.IOException;
-import java.nio.file.Files;
 import java.util.ArrayList;
 import java.util.List;
 
@@ -33,12 +32,7 @@ public class RenderResults implements Closeable {
         this.tmp = tmp;
     }
     public void add(RenderResult result) {
-        tmp.addResource(new Closeable() {
-            @Override
-            public void close() throws IOException {
-                Files.delete(result.getPath());
-            }
-        });
+        tmp.addResource(result);
         results.add(result);
     }
 
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java
index 49c775e69..2e3143261 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java
@@ -20,6 +20,9 @@ package org.apache.tika.renderer;
  * Use this in the ParseContext to keep track of unique ids for rendered
  * images in embedded docs. This should be used for the full parse of
  * a main document and its embedded document.
+ *
+ * This is different from RenderingState, which is used to track
+ * rendering per file/per embedded doc.
  */
 public class RenderingTracker {
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 6a46f1339..13ccd70b8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -494,13 +494,14 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         }
 
         try (TemporaryResources tmp = new TemporaryResources()) {
-            RenderResult renderResult = renderCurrentPage(pdPage, context, tmp);
-            Metadata renderMetadata = renderResult.getMetadata();
-            try (InputStream is = TikaInputStream.get(renderResult.getPath())) {
-                renderMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
-                        ocrImageMediaType.toString());
-                ocrParser.parse(is, new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
-                        renderMetadata, context);
+            try (RenderResult renderResult = renderCurrentPage(pdPage, context, tmp)) {
+                Metadata renderMetadata = renderResult.getMetadata();
+                try (InputStream is = renderResult.getInputStream()) {
+                    renderMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
+                            ocrImageMediaType.toString());
+                    ocrParser.parse(is, new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
+                            renderMetadata, context);
+                }
             }
         } catch (IOException e) {
             handleCatchableIOE(e);
@@ -538,11 +539,12 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             PageRangeRequest pageRangeRequest =
                     new PageRangeRequest(getCurrentPageNo(), getCurrentPageNo());
             if (thisRenderer instanceof PDDocumentRenderer) {
-                try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
-                    tis.setOpenContainer(pdDocument);
-                    return thisRenderer.render(tis, pageMetadata, parseContext, pageRangeRequest)
+                //do not use try-with-resources here.  We need to leave the pdDocument open!
+                TikaInputStream tis = TikaInputStream.get(new byte[0]);
+                tis.setOpenContainer(pdDocument);
+                return thisRenderer.render(tis, pageMetadata, parseContext, pageRangeRequest)
                             .getResults().get(0);
-                }
+
             } else {
                 PDFRenderingState state = context.get(PDFRenderingState.class);
                 if (state == null) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index fb2de2bc2..9179fe956 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -17,6 +17,7 @@
 package org.apache.tika.parser.pdf;
 
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.Writer;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -25,6 +26,7 @@ import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicInteger;
 
+import org.apache.commons.io.IOExceptionWithCause;
 import org.apache.pdfbox.cos.COSArray;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.cos.COSStream;
@@ -38,9 +40,17 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.pdf.image.ImageGraphicsEngine;
+import org.apache.tika.renderer.PageRangeRequest;
+import org.apache.tika.renderer.RenderRequest;
+import org.apache.tika.renderer.RenderResult;
+import org.apache.tika.renderer.RenderResults;
+import org.apache.tika.renderer.Renderer;
+import org.apache.tika.renderer.pdf.pdfbox.PDFRenderingState;
 
 /**
  * Utility class that overrides the {@link PDFTextStripper} functionality
@@ -137,6 +147,7 @@ class PDF2XHTML extends AbstractPDF2XHTML {
             writeParagraphEnd();
             try {
                 extractImages(page);
+                renderPage(page);
             } catch (IOException e) {
                 handleCatchableIOE(e);
             }
@@ -148,12 +159,44 @@ class PDF2XHTML extends AbstractPDF2XHTML {
         }
     }
 
+    private void renderPage(PDPage page) throws IOException {
+        if (config.getImageStrategy() != PDFParserConfig.IMAGE_STRATEGY.RENDER_PAGES_AT_PAGE_END) {
+            return;
+        }
+        PDFRenderingState state = context.get(PDFRenderingState.class);
+        //this is the document's inputstream/PDDocument
+        //TODO: figure out if we can send in the PDPage in the TikaInputStream
+        TikaInputStream tis = state.getTikaInputStream();
+        Renderer renderer = config.getRenderer();
+        RenderRequest request = new PageRangeRequest(getCurrentPageNo(), getCurrentPageNo());
+        Metadata renderedMetadata = new Metadata();
+        renderedMetadata.set(TikaCoreProperties.TYPE, PDFParser.MEDIA_TYPE.toString());
+        try (RenderResults results = renderer.render(tis, renderedMetadata, context, request)) {
+            for (RenderResult result : results.getResults()) {
+                if (result.getStatus() == RenderResult.STATUS.SUCCESS) {
+                    if (embeddedDocumentExtractor.shouldParseEmbedded(result.getMetadata())) {
+
+                        try (InputStream is = result.getInputStream()) {
+                            //TODO: add markup here?
+                            embeddedDocumentExtractor.parseEmbedded(is, xhtml,
+                                    result.getMetadata(), true);
+                        }
+                    }
+                }
+            }
+        } catch (SecurityException e) {
+            throw e;
+        } catch (Exception e) {
+            handleCatchableIOE(new IOExceptionWithCause(e));
+        }
+    }
+
     void extractImages(PDPage page) throws SAXException, IOException {
         if (config.isExtractInlineImages() == false &&
                 config.isExtractInlineImageMetadataOnly() == false) {
             return;
         }
-
+        //TODO: modernize to ImageStrategy != rawImages
         ImageGraphicsEngine engine =
                 config.getImageGraphicsEngineFactory().newEngine(
                         page, getCurrentPageNo(), embeddedDocumentExtractor, config,
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 5ad30b37e..a4e293a58 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -114,7 +114,7 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
      * @deprecated Supply a {@link PasswordProvider} on the {@link ParseContext} instead
      */
     public static final String PASSWORD = "org.apache.tika.parser.pdf.password";
-    protected static final MediaType MEDIA_TYPE = MediaType.application("pdf");
+    public static final MediaType MEDIA_TYPE = MediaType.application("pdf");
     /**
      * Serial version UID
      */
@@ -137,7 +137,7 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
         if (localConfig.isSetKCMS()) {
             System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
         }
-        initRenderer(localConfig);
+        initRenderer(localConfig, context);
         PDDocument pdfDocument = null;
 
         String password = "";
@@ -172,7 +172,7 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
             AccessChecker checker = localConfig.getAccessChecker();
             checker.check(metadata);
             tstream.setOpenContainer(pdfDocument);
-            handleRendering(pdfDocument, tstream, handler, metadata, context, localConfig);
+            renderPagesBeforeParse(tstream, handler, metadata, context, localConfig);
             if (handler != null) {
                 if (shouldHandleXFAOnly(hasXFA, localConfig)) {
                     handleXFAOnly(pdfDocument, handler, metadata, context);
@@ -209,9 +209,11 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
     }
 
     private boolean shouldSpool(PDFParserConfig localConfig) {
-        if (localConfig.getImageStrategy() == PDFParserConfig.IMAGE_STRATEGY.RENDERED_PAGES) {
+        if (localConfig.getImageStrategy() == PDFParserConfig.IMAGE_STRATEGY.RENDER_PAGES_BEFORE_PARSE
+                || localConfig.getImageStrategy() == PDFParserConfig.IMAGE_STRATEGY.RENDER_PAGES_AT_PAGE_END) {
             return true;
         }
+
         if (localConfig.getOcrStrategy() == PDFParserConfig.OCR_STRATEGY.NO_OCR) {
             return false;
         }
@@ -219,11 +221,11 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
         return true;
     }
 
-    private void handleRendering(PDDocument pdDocument, TikaInputStream tstream,
-                                 ContentHandler xhtml, Metadata parentMetadata,
-                                 ParseContext context,
-                                 PDFParserConfig config) {
-        if (config.getImageStrategy() != PDFParserConfig.IMAGE_STRATEGY.RENDERED_PAGES) {
+    private void renderPagesBeforeParse(TikaInputStream tstream,
+                                        ContentHandler xhtml, Metadata parentMetadata,
+                                        ParseContext context,
+                                        PDFParserConfig config) {
+        if (config.getImageStrategy() != PDFParserConfig.IMAGE_STRATEGY.RENDER_PAGES_BEFORE_PARSE) {
             return;
         }
         RenderResults renderResults = null;
@@ -242,7 +244,7 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
         for (RenderResult result : renderResults.getResults()) {
             if (result.getStatus() == RenderResult.STATUS.SUCCESS) {
                 if (embeddedDocumentExtractor.shouldParseEmbedded(result.getMetadata())) {
-                    try (InputStream is = TikaInputStream.get(result.getPath())) {
+                    try (InputStream is = result.getInputStream()) {
                         embeddedDocumentExtractor.parseEmbedded(is, xhtml, result.getMetadata(),
                                 false);
                     } catch (SecurityException e) {
@@ -720,8 +722,10 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
         //no-op
     }
 
-    private void initRenderer(PDFParserConfig config) {
-        if (config.getRenderer() != null) {
+    private void initRenderer(PDFParserConfig config, ParseContext context) {
+
+        if (config.getRenderer() != null &&
+                config.getRenderer().getSupportedTypes(context).contains(MEDIA_TYPE)) {
             return;
         }
         //set a default renderer if nothing was defined
@@ -742,6 +746,7 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
         defaultConfig.setImageGraphicsEngineFactory(imageGraphicsEngineFactory);
     }
 
+    @Field
     public void setImageStrategy(String imageStrategy) {
         defaultConfig.setImageStrategy(imageStrategy);
     }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index d17f7ed88..da07b2b12 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -987,15 +987,35 @@ public class PDFParserConfig implements Serializable {
     }
 
     public enum IMAGE_STRATEGY {
-        NONE, RAW_IMAGES, RENDERED_PAGES;//TODO: add LOGICAL_IMAGES
+        NONE,
+        /**
+         * This is the more modern version of {@link PDFParserConfig#extractInlineImages}
+         */
+        RAW_IMAGES,
+        /**
+         * If you want the rendered images, and you don't care that there's
+         * markup in the xhtml handler per page then go with this option.
+         * For some rendering engines, it is faster to render the full document
+         * upfront than to parse a page, render a page, etc.
+         */
+        RENDER_PAGES_BEFORE_PARSE,
+        /**
+         * This renders each page, one at a time, at the end of the page.
+         * For some rendering engines, this may be slower, but it allows the writing
+         * of image metadata into the xhtml in the proper location
+         */
+        RENDER_PAGES_AT_PAGE_END;
+        //TODO: add LOGICAL_IMAGES
 
         private static IMAGE_STRATEGY parse(String s) {
             String lc = s.toLowerCase(Locale.US);
             switch (lc) {
-                case "rawImages" :
+                case "rawimages" :
                     return RAW_IMAGES;
-                case "renderedPages":
-                    return RENDERED_PAGES;
+                case "renderpagesbeforeparse":
+                    return RENDER_PAGES_BEFORE_PARSE;
+                case "renderpagesatpageend":
+                    return RENDER_PAGES_AT_PAGE_END;
                 case "none":
                     return NONE;
                 default:
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java
index dcf00279b..d108ba54b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java
@@ -115,7 +115,7 @@ public class MuPDFRenderer implements Renderer {
                 renderMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                         TikaCoreProperties.EmbeddedResourceType.RENDERING.name());
                 results.add(new RenderResult(RenderResult.STATUS.SUCCESS, tracker.getNextId(),
-                        f.toPath(), renderMetadata));
+                        TikaInputStream.get(f.toPath()), renderMetadata));
             }
         }
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
index 4313a1084..347c72e00 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
@@ -31,6 +31,8 @@ import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.rendering.ImageType;
 import org.apache.pdfbox.rendering.PDFRenderer;
 import org.apache.pdfbox.tools.imageio.ImageIOUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import org.apache.tika.config.Initializable;
 import org.apache.tika.config.InitializableProblemHandler;
@@ -47,6 +49,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.metadata.TikaPagedText;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParser;
 import org.apache.tika.renderer.PageBasedRenderResults;
 import org.apache.tika.renderer.PageRangeRequest;
 import org.apache.tika.renderer.RenderRequest;
@@ -56,7 +59,9 @@ import org.apache.tika.renderer.RenderingTracker;
 
 public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
 
-    Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("pdf"));
+    Set<MediaType> SUPPORTED_TYPES = Collections.singleton(PDFParser.MEDIA_TYPE);
+
+    private static final Logger LOG = LoggerFactory.getLogger(PDFBoxRenderer.class);
 
     /**
      * This is the amount of time it takes for PDFBox to render the page
@@ -138,8 +143,7 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
             try {
                 m.set(TikaPagedText.PAGE_NUMBER, i);
                 m.set(TikaPagedText.PAGE_ROTATION, (double)pdDocument.getPage(i - 1).getRotation());
-                Path imagePath = renderPage(renderer, id, i, m);
-                results.add(new RenderResult(RenderResult.STATUS.SUCCESS, id, imagePath, m));
+                results.add(renderPage(renderer, id, i, m));
             } catch (IOException e) {
                 EmbeddedDocumentUtil.recordException(e, m);
                 results.add(new RenderResult(RenderResult.STATUS.EXCEPTION, id, null, m));
@@ -148,7 +152,8 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
     }
 
 
-    private Path renderPage(PDFRenderer renderer, int id, int pageNumber, Metadata metadata)
+    protected RenderResult renderPage(PDFRenderer renderer, int id, int pageNumber,
+                                     Metadata metadata)
             throws IOException {
 
         Path tmpFile = Files.createTempFile("tika-pdfbox-rendering-",
@@ -169,10 +174,15 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
         } catch (SecurityException e) {
             //throw SecurityExceptions immediately
             throw e;
-        } catch (IOException | RuntimeException e) {
+        } catch (Exception e) {
+            try {
+                Files.delete(tmpFile);
+            } catch (IOException ex) {
+                LOG.warn("couldn't delete " + tmpFile, ex);
+            }
             throw new IOExceptionWithCause(e);
         }
-        return tmpFile;
+        return new RenderResult(RenderResult.STATUS.SUCCESS, id, tmpFile, metadata);
     }
 
     @Override
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
index e604a8179..08d18b6c1 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
@@ -46,6 +46,11 @@ import org.apache.tika.parser.Parser;
 
 public class PDFRenderingTest extends TikaTest {
 
+    @Test
+    public void testDefault() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testPDF.pdf");
+        assertEquals(1, metadataList.size());
+    }
 
     @Test
     public void testBasic() throws Exception {
@@ -56,7 +61,6 @@ public class PDFRenderingTest extends TikaTest {
         Map<Integer, byte[]> embedded =
                 ((RenderCaptureExtractor)parseContext.get(EmbeddedDocumentExtractor.class))
                         .getEmbedded();
-
         assertEquals(1, embedded.size());
         assertTrue(embedded.containsKey(0));
         //what else can we do to test this?  File type == tiff? Run OCR?
@@ -99,9 +103,6 @@ public class PDFRenderingTest extends TikaTest {
     private ParseContext configureParseContext() {
         ParseContext parseContext = new ParseContext();
         parseContext.set(EmbeddedDocumentExtractor.class, new RenderCaptureExtractor(parseContext));
-        PDFParserConfig config = new PDFParserConfig();
-        config.setImageStrategy(PDFParserConfig.IMAGE_STRATEGY.RENDERED_PAGES);
-        parseContext.set(PDFParserConfig.class, config);
         return parseContext;
     }
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
index 80a9a4c73..92f351bb9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
@@ -17,9 +17,18 @@
 -->
 <properties>
     <parsers>
-        <parser class="org.apache.tika.parser.DefaultParser"/>
+        <parser class="org.apache.tika.parser.DefaultParser">
+            <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+        </parser>
+        <parser class="org.apache.tika.parser.pdf.PDFParser">
+            <params>
+                <param name="imageStrategy" type="string">renderPagesBeforeParse</param>
+            </params>
+        </parser>
     </parsers>
+<!--
+    This will be supplied automatically if not specified.
     <renderers>
         <renderer class="org.apache.tika.renderer.pdf.pdfbox.PDFBoxRenderer"/>
-    </renderers>
+    </renderers> -->
 </properties>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-per-page-config.xml
similarity index 70%
copy from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
copy to tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-per-page-config.xml
index 80a9a4c73..e3f92df6c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-per-page-config.xml
@@ -17,9 +17,16 @@
 -->
 <properties>
     <parsers>
-        <parser class="org.apache.tika.parser.DefaultParser"/>
+        <parser class="org.apache.tika.parser.DefaultParser">
+            <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+        </parser>
+        <parser class="org.apache.tika.parser.pdf.PDFParser">
+            <params>
+                <param name="imageStrategy" type="string">renderPagesAtPageEnd</param>
+            </params>
+        </parser>
     </parsers>
-    <renderers>
+<!--    <renderers>
         <renderer class="org.apache.tika.renderer.pdf.pdfbox.PDFBoxRenderer"/>
-    </renderers>
+    </renderers> -->
 </properties>