You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/03 17:56:28 UTC

[tika] 02/02: TIKA-3571 -- clean build, add mutool renderer

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3571
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 6804d53cdf7c0a55a67175c6d0cc6b48e080692c
Author: tallison <ta...@apache.org>
AuthorDate: Tue May 3 13:55:59 2022 -0400

    TIKA-3571 -- clean build, add mutool renderer
---
 .../java/org/apache/tika/config/TikaConfig.java    |   9 ++
 .../apache/tika/renderer/CompositeRenderer.java    |   3 +
 .../tika/renderer/PageBasedRenderResults.java      |  50 +++++++
 .../org/apache/tika/renderer/RenderRequest.java    |   6 +-
 .../org/apache/tika/renderer/RenderingState.java   |  16 +++
 .../org/apache/tika/renderer/RenderingTracker.java |  16 +++
 .../tika-parser-pdf-module/pom.xml                 |   2 +-
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  | 140 ++++++++++++-------
 .../java/org/apache/tika/parser/pdf/OCR2XHTML.java |   2 -
 .../java/org/apache/tika/parser/pdf/PDF2XHTML.java |   2 -
 .../tika/parser/pdf/PDFMarkedContent2XHTML.java    |   1 -
 .../java/org/apache/tika/parser/pdf/PDFParser.java |  19 ++-
 .../apache/tika/parser/pdf/PDFParserConfig.java    | 120 ++++++++---------
 .../tika/parser/pdf/TextOnlyPDFRenderer.java       | 106 +++++++++++++++
 .../apache/tika/renderer/pdf/MuPDFRenderer.java    | 149 +++++++++++++++++++++
 .../tika/renderer/pdf/PDDocumentRenderer.java      |  16 +++
 .../apache/tika/renderer/pdf/PDFBoxRenderer.java   |   8 +-
 .../tika/renderer/pdf/PDFRenderingState.java       |  16 +++
 .../apache/tika/parser/pdf/PDFRenderingTest.java   |   6 +-
 .../tika/parser/pdf/tika-rendering-config.xml      |  10 --
 .../apache/tika/parser/crypto/TSDParserTest.java   |   8 +-
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  27 ++++
 .../configs/tika-rendering-mupdf-config.xml}       |  12 +-
 23 files changed, 588 insertions(+), 156 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index d16f6f171..e7c212f87 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -895,6 +895,15 @@ public class TikaConfig {
             MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
 
             // Try the possible default and composite parser constructors
+            if (parser == null) {
+                try {
+                    c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class,
+                            Collection.class, EncodingDetector.class, Renderer.class);
+                    parser = c.newInstance(registry, loader, excludeParsers, encodingDetector, renderer);
+                } catch (NoSuchMethodException me) {
+                    //parser class lacks this constructor signature; try the next one below
+                }
+            }
             if (parser == null) {
                 try {
                     c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class,
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java b/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java
index b5fb2acbb..a98d39c97 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java
@@ -78,6 +78,9 @@ public class CompositeRenderer implements Renderer, Initializable {
         return renderer.render(is, metadata, parseContext, requests);
     }
 
+    public Renderer getLeafRenderer(MediaType mt) { //direct rendererMap lookup for mt
+        return rendererMap.get(mt);
+    }
     @Override
     public void initialize(Map<String, Param> params) throws TikaConfigException {
 
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java b/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java
new file mode 100644
index 000000000..0c238b60d
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.metadata.Rendering;
+
+/** Render results that can additionally be looked up by page number. */
+public class PageBasedRenderResults extends RenderResults {
+
+    Map<Integer, List<RenderResult>> results = new HashMap<>();
+
+    public PageBasedRenderResults(TemporaryResources tmp) {
+        super(tmp);
+    }
+    /** Indexes the result by page number when its metadata has one, then calls super.add. */
+    @Override
+    public void add(RenderResult result) {
+        Integer page = result.getMetadata().getInt(Rendering.PAGE_NUMBER);
+        if (page != null) {
+            results.computeIfAbsent(page, k -> new ArrayList<>()).add(result);
+        }
+        super.add(result);
+    }
+
+    /** @return results recorded for the page; a detached empty list (never null) if none */
+    public List<RenderResult> getPage(int pageNumber) {
+        List<RenderResult> pageResults = results.get(pageNumber);
+        return pageResults == null ? new ArrayList<>() : pageResults;
+    }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java
index 4e1f2f3cf..3277d866a 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java
@@ -17,7 +17,11 @@
 package org.apache.tika.renderer;
 
 /**
- * Empty interface for requests to a renderer.
+ * Empty interface for requests to a renderer. Different
+ * file formats and different use cases will have different types of requests.
+ * For page based, it could be a page range (render the full pages from 2 to 5);
+ * or it could be a single page with an x-y bounding box.  For video files,
+ * it could be a temporal offset or a temporal offset with an x-y bounding box.
  */
 public interface RenderRequest {
 }
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java
index 1b3baf44e..ed8250065 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.tika.renderer;
 
 /**
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java
index 992b86f28..49c775e69 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.tika.renderer;
 
 /**
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml
index 108a0d423..be05f67b1 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml
@@ -78,7 +78,7 @@
     <dependency>
       <groupId>com.github.jai-imageio</groupId>
       <artifactId>jai-imageio-core</artifactId>
-      <version>1.3.1</version>
+      <version>${imageio.version}</version>
       <scope>test</scope>
     </dependency>
   </dependencies>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index cb40569ff..344756dd0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -92,7 +92,6 @@ import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.pdfbox.tools.imageio.ImageIOUtil;
 import org.apache.pdfbox.util.Matrix;
 import org.apache.pdfbox.util.Vector;
-import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
@@ -110,10 +109,13 @@ import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.renderer.CompositeRenderer;
+import org.apache.tika.renderer.PageBasedRenderResults;
 import org.apache.tika.renderer.PageRangeRequest;
 import org.apache.tika.renderer.RenderResult;
-import org.apache.tika.renderer.RenderResults;
-import org.apache.tika.renderer.RenderingState;
+import org.apache.tika.renderer.Renderer;
+import org.apache.tika.renderer.RenderingTracker;
+import org.apache.tika.renderer.pdf.PDDocumentRenderer;
 import org.apache.tika.renderer.pdf.PDFRenderingState;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.EmbeddedContentHandler;
@@ -299,9 +301,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     private void parseMetadata(InputStream stream, Metadata embeddedMetadata)
             throws IOException, SAXException {
         try {
-            embeddedDocumentExtractor
-                    .parseEmbedded(stream, new EmbeddedContentHandler(xhtml), embeddedMetadata,
-                            true);
+            embeddedDocumentExtractor.parseEmbedded(stream, new EmbeddedContentHandler(xhtml),
+                    embeddedMetadata, true);
         } catch (IOException e) {
             handleCatchableIOE(e);
         }
@@ -324,8 +325,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
 
     }
 
-    private void extractFilesfromEFTree(PDNameTreeNode efTree, Map<String,
-            PDComplexFileSpecification> embeddedFileNames, int depth) throws IOException {
+    private void extractFilesfromEFTree(PDNameTreeNode efTree,
+                                        Map<String, PDComplexFileSpecification> embeddedFileNames,
+                                        int depth) throws IOException {
         if (depth > MAX_RECURSION_DEPTH) {
             throw new IOException("Hit max recursion depth");
         }
@@ -440,9 +442,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         xhtml.endElement("div");
 
         try {
-            embeddedDocumentExtractor
-                    .parseEmbedded(stream, new EmbeddedContentHandler(xhtml), embeddedMetadata,
-                            false);
+            embeddedDocumentExtractor.parseEmbedded(stream, new EmbeddedContentHandler(xhtml),
+                    embeddedMetadata, false);
         } finally {
             IOUtils.closeQuietly(stream);
         }
@@ -512,22 +513,41 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             noContextRenderCurrentPage(parseContext, tmpResources);
         }
         //if the full document has already been rendered, then reuse that file
-        RenderResults results = renderingState.getRenderResults();
+        //TODO: we need to prevent this if only a portion of the page or portions
+        //of the page have been rendered.
+        //TODO: we should also figure out how to not reuse the rendering if
+        //the user wants to render twice (say, full color to display to users, but
+        //grayscale for (notionally?) better OCR).
+        PageBasedRenderResults results = (PageBasedRenderResults) renderingState.getRenderResults();
         if (results != null) {
-            for (RenderResult result : results.getResults()) {
-                int pageNo = result.getMetadata().getInt(Rendering.PAGE_NUMBER);
-                if (getCurrentPageNo() == pageNo) {
-                    return result;
-                }
+            List<RenderResult> pageResults = results.getPage(getCurrentPageNo());
+            if (pageResults.size() == 1) {
+                return pageResults.get(0);
             }
         }
-        //use the regular renderer if it isn't "no_text"
-        if (config.getOcrRenderingStrategy() != PDFParserConfig.OCR_RENDERING_STRATEGY.NO_TEXT) {
+        Renderer thisRenderer = getPDFRenderer(config.getRenderer());
+        //if there's a configured renderer and if the rendering strategy is "all"
+        if (thisRenderer != null &&
+                config.getOcrRenderingStrategy() == PDFParserConfig.OCR_RENDERING_STRATEGY.ALL) {
             PageRangeRequest pageRangeRequest =
                     new PageRangeRequest(getCurrentPageNo(), getCurrentPageNo());
-            try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
-                tis.setOpenContainer(pdDocument);
-                return config.getRenderer().render(tis, metadata, parseContext, pageRangeRequest)
+            if (thisRenderer instanceof PDDocumentRenderer) {
+                try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
+                    Metadata m = new Metadata();
+                    m.set(TikaCoreProperties.TYPE, PDFParser.MEDIA_TYPE.toString());
+                    tis.setOpenContainer(pdDocument);
+                    return thisRenderer.render(tis, m, parseContext, pageRangeRequest)
+                            .getResults().get(0);
+                }
+            } else {
+                Metadata m = new Metadata();
+                m.set(TikaCoreProperties.TYPE, PDFParser.MEDIA_TYPE.toString());
+                PDFRenderingState state = context.get(PDFRenderingState.class);
+                if (state == null) {
+                    throw new IllegalArgumentException("RenderingState must not be null");
+                }
+                return thisRenderer
+                        .render(state.getTikaInputStream(), m, parseContext, pageRangeRequest)
                         .getResults().get(0);
             }
         } else {
@@ -535,16 +555,47 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         }
     }
 
+    //picks the renderer to use for PDFs: a composite's leaf, the renderer itself, or null
+    private Renderer getPDFRenderer(Renderer renderer) {
+        if (renderer instanceof CompositeRenderer) {
+            return ((CompositeRenderer) renderer).getLeafRenderer(PDFParser.MEDIA_TYPE);
+        }
+        if (renderer != null &&
+                renderer.getSupportedTypes(context).contains(PDFParser.MEDIA_TYPE)) {
+            return renderer;
+        }
+        return null;
+    }
+
 
     private RenderResult noContextRenderCurrentPage(ParseContext parseContext,
-                                           TemporaryResources tmpResources)
+                                                    TemporaryResources tmpResources)
             throws IOException, TikaException {
-        PDFRenderer renderer =
-                config.getOcrRenderingStrategy() == PDFParserConfig.OCR_RENDERING_STRATEGY.NO_TEXT ?
-                        new NoTextPDFRenderer(pdDocument) : new PDFRenderer(pdDocument);
+        PDFRenderer renderer = null;
+        switch (config.getOcrRenderingStrategy()) {
+            case NO_TEXT:
+                renderer = new NoTextPDFRenderer(pdDocument);
+                break;
+            case TEXT_ONLY:
+                renderer = new TextOnlyPDFRenderer(pdDocument);
+                break;
+            case ALL:
+                renderer = new PDFRenderer(pdDocument);
+                break;
+        }
 
         int dpi = config.getOcrDPI();
         Path tmpFile = null;
+        Metadata m = new Metadata();
+        m.set(Rendering.PAGE_NUMBER, pageIndex + 1);
+
+        RenderingTracker renderingTracker = parseContext.get(RenderingTracker.class);
+        if (renderingTracker == null) {
+            renderingTracker = new RenderingTracker();
+            parseContext.set(RenderingTracker.class, renderingTracker);
+        }
+        int id = renderingTracker.getNextId();
+
         try {
             BufferedImage image =
                     renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType());
@@ -563,9 +614,10 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             //need to have a wide catch
             metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM,
                     ExceptionUtils.getStackTrace(e));
-            return null;
+
+            return new RenderResult(RenderResult.STATUS.EXCEPTION, id, null, m);
         }
-        return null;
+        return new RenderResult(RenderResult.STATUS.SUCCESS, id, tmpFile, m);
     }
 
     @Override
@@ -593,7 +645,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
                     if (fann.getFile() instanceof PDComplexFileSpecification) {
                         handlePDComplexFileSpec(fann.getAttachmentName(),
                                 "annotationFileAttachment",
-                                (PDComplexFileSpecification)fann.getFile());
+                                (PDComplexFileSpecification) fann.getFile());
                     }
                 } else if (annotation instanceof PDAnnotationWidget) {
                     handleWidget((PDAnnotationWidget) annotation);
@@ -606,11 +658,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
                         //subtype is U3D or PRC or model/ (prefix for model mime type)
                         metadata.set(PDF.HAS_3D, true);
                     }
-                    for (COSDictionary fileSpec :
-                            findFileSpecs(annotation.getCOSObject())) {
+                    for (COSDictionary fileSpec : findFileSpecs(annotation.getCOSObject())) {
                         PDComplexFileSpecification cfs = new PDComplexFileSpecification(fileSpec);
-                        handlePDComplexFileSpec(cfs.getFilename(),
-                                annotationSubtype, cfs);
+                        handlePDComplexFileSpec(cfs.getFilename(), annotationSubtype, cfs);
                     }
                 }
                 // TODO: remove once PDFBOX-1143 is fixed:
@@ -665,13 +715,16 @@ class AbstractPDF2XHTML extends PDFTextStripper {
                 boolean unmappedExceedsLimit = false;
                 if (totalCharsPerPage > config.getOcrStrategyAuto().getTotalCharsPerPage()) {
                     // There are enough characters to not have to do OCR.  Check number of unmapped characters
-                    final float percentUnmapped = (float) unmappedUnicodeCharsPerPage / totalCharsPerPage;
-                    final float unmappedCharacterLimit = config.getOcrStrategyAuto().getUnmappedUnicodeCharsPerPage();
-                    unmappedExceedsLimit = (unmappedCharacterLimit < 1)
-                            ? percentUnmapped > unmappedCharacterLimit
-                            : unmappedUnicodeCharsPerPage > unmappedCharacterLimit;
+                    final float percentUnmapped =
+                            (float) unmappedUnicodeCharsPerPage / totalCharsPerPage;
+                    final float unmappedCharacterLimit =
+                            config.getOcrStrategyAuto().getUnmappedUnicodeCharsPerPage();
+                    unmappedExceedsLimit = (unmappedCharacterLimit < 1) ?
+                            percentUnmapped > unmappedCharacterLimit :
+                            unmappedUnicodeCharsPerPage > unmappedCharacterLimit;
                 }
-                if (totalCharsPerPage <= config.getOcrStrategyAuto().getTotalCharsPerPage() || unmappedExceedsLimit) {
+                if (totalCharsPerPage <= config.getOcrStrategyAuto().getTotalCharsPerPage() ||
+                        unmappedExceedsLimit) {
                     doOCROnCurrentPage(AUTO);
                 }
             }
@@ -710,14 +763,12 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         return PDFDOMUtil.findType(cosDict, types, MAX_RECURSION_DEPTH);
     }
 
-    private void handlePDComplexFileSpec(String attachmentName,
-                                         String annotationType,
+    private void handlePDComplexFileSpec(String attachmentName, String annotationType,
                                          PDComplexFileSpecification fileSpec) throws IOException {
         try {
             AttributesImpl attributes = new AttributesImpl();
             attributes.addAttribute("", "source", "source", "CDATA", annotationType);
-            extractMultiOSPDEmbeddedFiles(attachmentName, fileSpec,
-                    attributes);
+            extractMultiOSPDEmbeddedFiles(attachmentName, fileSpec, attributes);
         } catch (SAXException e) {
             throw new IOException("file embedded in annotation sax exception", e);
         } catch (TikaException e) {
@@ -1176,8 +1227,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
     }
 
     enum ActionTrigger {
-        AFTER_DOCUMENT_PRINT, AFTER_DOCUMENT_SAVE, ANNOTATION_CURSOR_ENTERS,
-        ANNOTATION_CURSOR_EXIT,
+        AFTER_DOCUMENT_PRINT, AFTER_DOCUMENT_SAVE, ANNOTATION_CURSOR_ENTERS, ANNOTATION_CURSOR_EXIT,
         ANNOTATION_LOSE_INPUT_FOCUS, ANNOTATION_MOUSE_CLICK, ANNOTATION_MOUSE_RELEASED,
         ANNOTATION_PAGE_CLOSED, ANNOTATION_PAGE_NO_LONGER_VISIBLE, ANNOTATION_PAGE_OPENED,
         ANNOTATION_PAGE_VISIBLE, ANNOTATION_RECEIVES_FOCUS, ANNOTATION_WIDGET,
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
index 534eea324..2658a484a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
@@ -23,13 +23,11 @@ import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.pdfbox.text.TextPosition;
-import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.renderer.RenderResults;
 import org.apache.tika.sax.XHTMLContentHandler;
 
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index e493ea3ff..602a8823e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -34,13 +34,11 @@ import org.apache.pdfbox.pdmodel.PDPageContentStream;
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.pdfbox.text.TextPosition;
 import org.apache.pdfbox.util.Matrix;
-import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.renderer.RenderResults;
 import org.apache.tika.sax.XHTMLContentHandler;
 
 /**
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
index 5c5ec6c03..3e4e1bf64 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
@@ -41,7 +41,6 @@ import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructur
 import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
 import org.apache.pdfbox.text.PDFMarkedContentExtractor;
 import org.apache.pdfbox.text.TextPosition;
-import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 import org.apache.tika.exception.TikaException;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index a16381437..28f796157 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -70,7 +70,6 @@ import org.apache.tika.renderer.PageRangeRequest;
 import org.apache.tika.renderer.RenderResult;
 import org.apache.tika.renderer.RenderResults;
 import org.apache.tika.renderer.Renderer;
-import org.apache.tika.renderer.pdf.PDDocumentRenderer;
 import org.apache.tika.renderer.pdf.PDFBoxRenderer;
 import org.apache.tika.renderer.pdf.PDFRenderingState;
 import org.apache.tika.sax.XHTMLContentHandler;
@@ -114,7 +113,7 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
      * @deprecated Supply a {@link PasswordProvider} on the {@link ParseContext} instead
      */
     public static final String PASSWORD = "org.apache.tika.parser.pdf.password";
-    private static final MediaType MEDIA_TYPE = MediaType.application("pdf");
+    protected static final MediaType MEDIA_TYPE = MediaType.application("pdf");
     /**
      * Serial version UID
      */
@@ -193,10 +192,17 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
             metadata.set(PDF.IS_ENCRYPTED, "true");
             throw new EncryptedDocumentException(e);
         } finally {
-            //replace the one that was here
-            context.set(PDFRenderingState.class, incomingRenderingState);
-            if (pdfDocument != null) {
-                pdfDocument.close();
+            PDFRenderingState currState = context.get(PDFRenderingState.class);
+            try {
+                if (currState != null && currState.getRenderResults() != null) {
+                    currState.getRenderResults().close();
+                }
+                if (pdfDocument != null) {
+                    pdfDocument.close();
+                }
+            } finally {
+                //replace the one that was here
+                context.set(PDFRenderingState.class, incomingRenderingState);
             }
         }
     }
@@ -219,7 +225,6 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
         if (config.getImageStrategy() != PDFParserConfig.IMAGE_STRATEGY.RENDERED_PAGES) {
             return;
         }
-        Renderer renderer = config.getRenderer();
         RenderResults renderResults = null;
         try {
             renderResults = renderPDF(tstream, context, config);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 72cc0e050..fb8a315ae 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -21,6 +21,7 @@ import java.lang.reflect.Field;
 import java.lang.reflect.Modifier;
 import java.util.HashSet;
 import java.util.Locale;
+import java.util.Objects;
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -798,70 +799,51 @@ public class PDFParserConfig implements Serializable {
         if (this == o) {
             return true;
         }
-        if (!(o instanceof PDFParserConfig)) {
+        if (o == null || getClass() != o.getClass()) {
             return false;
         }
-
         PDFParserConfig config = (PDFParserConfig) o;
+        return enableAutoSpace == config.enableAutoSpace &&
+                suppressDuplicateOverlappingText == config.suppressDuplicateOverlappingText &&
+                extractAnnotationText == config.extractAnnotationText &&
+                sortByPosition == config.sortByPosition &&
+                extractAcroFormContent == config.extractAcroFormContent &&
+                extractBookmarksText == config.extractBookmarksText &&
+                extractInlineImages == config.extractInlineImages &&
+                extractInlineImageMetadataOnly == config.extractInlineImageMetadataOnly &&
+                extractUniqueInlineImagesOnly == config.extractUniqueInlineImagesOnly &&
+                extractMarkedContent == config.extractMarkedContent &&
+                Float.compare(config.dropThreshold, dropThreshold) == 0 &&
+                ifXFAExtractOnlyXFA == config.ifXFAExtractOnlyXFA && ocrDPI == config.ocrDPI &&
+                Float.compare(config.ocrImageQuality, ocrImageQuality) == 0 &&
+                catchIntermediateIOExceptions == config.catchIntermediateIOExceptions &&
+                extractActions == config.extractActions &&
+                extractFontNames == config.extractFontNames &&
+                maxMainMemoryBytes == config.maxMainMemoryBytes && setKCMS == config.setKCMS &&
+                detectAngles == config.detectAngles &&
+                Objects.equals(userConfigured, config.userConfigured) &&
+                Objects.equals(averageCharTolerance, config.averageCharTolerance) &&
+                Objects.equals(spacingTolerance, config.spacingTolerance) &&
+                ocrStrategy == config.ocrStrategy &&
+                Objects.equals(ocrStrategyAuto, config.ocrStrategyAuto) &&
+                ocrRenderingStrategy == config.ocrRenderingStrategy &&
+                ocrImageType == config.ocrImageType &&
+                Objects.equals(ocrImageFormatName, config.ocrImageFormatName) &&
+                imageStrategy == config.imageStrategy &&
+                Objects.equals(accessChecker, config.accessChecker) &&
+                Objects.equals(renderer, config.renderer);
+    }
 
-        if (isEnableAutoSpace() != config.isEnableAutoSpace()) {
-            return false;
-        }
-        if (isSuppressDuplicateOverlappingText() != config.isSuppressDuplicateOverlappingText()) {
-            return false;
-        }
-        if (isExtractAnnotationText() != config.isExtractAnnotationText()) {
-            return false;
-        }
-        if (isSortByPosition() != config.isSortByPosition()) {
-            return false;
-        }
-        if (isExtractAcroFormContent() != config.isExtractAcroFormContent()) {
-            return false;
-        }
-        if (isExtractBookmarksText() != config.isExtractBookmarksText()) {
-            return false;
-        }
-        if (isExtractInlineImages() != config.isExtractInlineImages()) {
-            return false;
-        }
-        if (isExtractUniqueInlineImagesOnly() != config.isExtractUniqueInlineImagesOnly()) {
-            return false;
-        }
-        if (isIfXFAExtractOnlyXFA() != config.isIfXFAExtractOnlyXFA()) {
-            return false;
-        }
-        if (getOcrDPI() != config.getOcrDPI()) {
-            return false;
-        }
-        if (isCatchIntermediateIOExceptions() != config.isCatchIntermediateIOExceptions()) {
-            return false;
-        }
-        if (!getAverageCharTolerance().equals(config.getAverageCharTolerance())) {
-            return false;
-        }
-        if (!getSpacingTolerance().equals(config.getSpacingTolerance())) {
-            return false;
-        }
-        if (!getDropThreshold().equals(config.getDropThreshold())) {
-            return false;
-        }
-        if (!getOcrStrategy().equals(config.getOcrStrategy())) {
-            return false;
-        }
-        if (getOcrImageType() != config.getOcrImageType()) {
-            return false;
-        }
-        if (!getOcrImageFormatName().equals(config.getOcrImageFormatName())) {
-            return false;
-        }
-        if (isExtractActions() != config.isExtractActions()) {
-            return false;
-        }
-        if (!getAccessChecker().equals(config.getAccessChecker())) {
-            return false;
-        }
-        return getMaxMainMemoryBytes() == config.getMaxMainMemoryBytes();
+    /**
+     * Hashes the same set of fields that {@link #equals(Object)} compares, so
+     * two configs that are equal also report the same hash code.
+     */
+    @Override
+    public int hashCode() {
+        return Objects.hash(userConfigured, enableAutoSpace, suppressDuplicateOverlappingText,
+                extractAnnotationText, sortByPosition, extractAcroFormContent, extractBookmarksText,
+                extractInlineImages, extractInlineImageMetadataOnly, extractUniqueInlineImagesOnly,
+                extractMarkedContent, averageCharTolerance, spacingTolerance, dropThreshold,
+                ifXFAExtractOnlyXFA, ocrStrategy, ocrStrategyAuto, ocrRenderingStrategy, ocrDPI,
+                ocrImageType, ocrImageFormatName, ocrImageQuality, imageStrategy, accessChecker,
+                catchIntermediateIOExceptions, extractActions, extractFontNames, maxMainMemoryBytes,
+                setKCMS, detectAngles, renderer);
     }
 
     public void setRenderer(Renderer renderer) {
@@ -948,18 +930,22 @@ public class PDFParserConfig implements Serializable {
     }
 
     public enum OCR_RENDERING_STRATEGY {
-        NO_TEXT, ALL; //AUTO?
-        // TODO: TEXT_ONLY be useful in instances where the unicode mappings are
-        //  corrupt/non-existent
+        NO_TEXT, TEXT_ONLY, ALL; //AUTO?
 
         private static OCR_RENDERING_STRATEGY parse(String s) {
             if (s == null) {
-                return NO_TEXT;
-            } else if ("no_text".equals(s.toLowerCase(Locale.ROOT))) {
-                return NO_TEXT;
-            } else if ("all".equals(s.toLowerCase(Locale.ROOT))) {
                 return ALL;
             }
+            String lc = s.toLowerCase(Locale.US);
+            switch (lc) {
+                case "text_only":
+                    return TEXT_ONLY;
+                case "no_text":
+                    return NO_TEXT;
+                case "all":
+                    return ALL;
+            }
+
             StringBuilder sb = new StringBuilder();
             sb.append("I regret that I don't recognize '").append(s);
             sb.append("' as an OCR_STRATEGY. I only recognize:");
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/TextOnlyPDFRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/TextOnlyPDFRenderer.java
new file mode 100644
index 000000000..f282d124c
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/TextOnlyPDFRenderer.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.awt.Graphics2D;
+import java.awt.geom.Point2D;
+import java.io.IOException;
+
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.rendering.PageDrawer;
+import org.apache.pdfbox.rendering.PageDrawerParameters;
+
+/**
+ * A {@link PDFRenderer} intended to render only the textual elements of a
+ * page: path, clipping, image and shading operations are suppressed.
+ */
+public class TextOnlyPDFRenderer extends PDFRenderer {
+
+    public TextOnlyPDFRenderer(PDDocument document) {
+        super(document);
+    }
+
+    /** Creates a text-only page drawer that keeps this renderer's annotation filter. */
+    @Override
+    protected PageDrawer createPageDrawer(PageDrawerParameters parameters) throws IOException {
+        PageDrawer pageDrawer = new TextOnlyPageDrawer(parameters);
+        pageDrawer.setAnnotationFilter(getAnnotationsFilter());
+        return pageDrawer;
+    }
+
+    /**
+     * Page drawer that turns the path-construction, path-painting, clipping,
+     * image and shading callbacks into no-ops; everything that is not
+     * overridden here is still handled by {@link PageDrawer}.
+     */
+    private static class TextOnlyPageDrawer extends PageDrawer {
+        public TextOnlyPageDrawer(PageDrawerParameters parameters) throws IOException {
+            super(parameters);
+        }
+
+        @Override
+        protected void transferClip(Graphics2D graphics) {
+        }
+
+        @Override
+        public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) {
+        }
+
+        @Override
+        public void strokePath() throws IOException {
+        }
+
+        @Override
+        public void fillPath(int windingRule) throws IOException {
+        }
+
+        @Override
+        public void fillAndStrokePath(int windingRule) throws IOException {
+        }
+
+        @Override
+        public void clip(int windingRule) {
+        }
+
+        @Override
+        public void lineTo(float x, float y) {
+        }
+
+        @Override
+        public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) {
+        }
+
+        @Override
+        public void closePath() {
+        }
+
+        @Override
+        public void endPath() {
+        }
+
+        @Override
+        public void drawImage(PDImage pdImage) throws IOException {
+        }
+
+        @Override
+        public void shadingFill(COSName shadingName) throws IOException {
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/MuPDFRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/MuPDFRenderer.java
new file mode 100644
index 000000000..983934677
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/MuPDFRenderer.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer.pdf;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Rendering;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.renderer.PageBasedRenderResults;
+import org.apache.tika.renderer.PageRangeRequest;
+import org.apache.tika.renderer.RenderRequest;
+import org.apache.tika.renderer.RenderResult;
+import org.apache.tika.renderer.RenderResults;
+import org.apache.tika.renderer.Renderer;
+import org.apache.tika.renderer.RenderingTracker;
+import org.apache.tika.utils.FileProcessResult;
+import org.apache.tika.utils.ProcessUtils;
+
+/**
+ * Renders PDF pages to grayscale PNG files by running the external
+ * {@code mutool convert} command. The executable is invoked by its bare name,
+ * so it has to be resolvable by the operating system.
+ */
+public class MuPDFRenderer implements Renderer {
+
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.application("pdf"));
+
+    //must stay in sync with the -o template built in createCommandLine
+    private static final Pattern RENDERED_PAGE =
+            Pattern.compile("tika-mutool-render-(\\d+)\\.png");
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Runs every request against a file path obtained from the stream and
+     * collects the rendered pages in a single result set.
+     * <p>
+     * If anything fails, the results (which were constructed around the
+     * temporary resources) are closed here, because the caller never receives
+     * them and therefore could not clean up.
+     *
+     * @throws TikaException if a request is not a PageRangeRequest or mutool
+     *                       exits with a non-zero value
+     * @throws IOException   if the temporary files cannot be created or listed
+     */
+    @Override
+    public RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext,
+                                RenderRequest... requests) throws IOException, TikaException {
+        TemporaryResources tmp = new TemporaryResources();
+        PageBasedRenderResults results = new PageBasedRenderResults(tmp);
+        try {
+            Path path = TikaInputStream.get(is, tmp).getPath();
+            for (RenderRequest request : requests) {
+                renderRequest(path, metadata, parseContext, request, results, tmp);
+            }
+        } catch (IOException | TikaException | RuntimeException e) {
+            try {
+                results.close();
+            } catch (IOException closeException) {
+                //keep the original failure as the primary exception
+                e.addSuppressed(closeException);
+            }
+            throw e;
+        }
+        return results;
+    }
+
+    /**
+     * Renders one page range into a fresh temporary directory and adds a
+     * result for every file in that directory whose name matches the
+     * rendered-page pattern.
+     */
+    private RenderResults renderRequest(Path pdf, Metadata metadata, ParseContext parseContext,
+                                        RenderRequest request, RenderResults results,
+                                        TemporaryResources tmp) throws TikaException, IOException {
+        if (! (request instanceof PageRangeRequest)) {
+            throw new TikaException("I regret that this renderer can only handle " +
+                    "PageRangeRequests, not " + request.getClass());
+        }
+        PageRangeRequest rangeRequest = (PageRangeRequest)request;
+        RenderingTracker tracker = parseContext.get(RenderingTracker.class);
+        if (tracker == null) {
+            tracker = new RenderingTracker();
+            parseContext.set(RenderingTracker.class, tracker);
+        }
+
+        Path dir = Files.createTempDirectory("tika-render-");
+        //TODO -- this assumes files have been deleted first
+        //do something smarter
+        tmp.addResource(new Closeable() {
+            @Override
+            public void close() throws IOException {
+                Files.delete(dir);
+            }
+        });
+        //TODO -- run mutool pages to get page sizes
+        //and then use that information in the -O to get proper scaling
+        //etc.
+        // This would also allow us to run on a single page at a time if that's of any interest
+        String[] args = createCommandLine(pdf, dir, rangeRequest);
+
+        ProcessBuilder builder = new ProcessBuilder();
+        builder.command(args);
+        //TODO: parameterize timeout
+        FileProcessResult result = ProcessUtils.execute(builder, 60000, 10, 1000);
+        if (result.getExitValue() != 0) {
+            throw new TikaException(result.getStderr());
+        }
+        File[] rendered = dir.toFile().listFiles();
+        if (rendered == null) {
+            //listFiles() reports an I/O error by returning null instead of throwing
+            throw new IOException("Couldn't list rendered pages in " + dir);
+        }
+        //TODO -- fix this
+        Matcher m = RENDERED_PAGE.matcher("");
+        for (File f : rendered) {
+            if (m.reset(f.getName()).find()) {
+                int pageIndex = Integer.parseInt(m.group(1));
+                Metadata renderMetadata = new Metadata();
+                renderMetadata.set(Rendering.PAGE_NUMBER, pageIndex);
+                renderMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                        TikaCoreProperties.EmbeddedResourceType.RENDERING.name());
+                results.add(new RenderResult(RenderResult.STATUS.SUCCESS, tracker.getNextId(),
+                        f.toPath(), renderMetadata));
+            }
+        }
+
+        return results;
+    }
+
+    /**
+     * Builds the argument list for {@code mutool convert}: gray colorspace,
+     * one {@code tika-mutool-render-%d.png} file per page inside {@code dir},
+     * and, unless all pages are requested, a comma separated page list.
+     */
+    private String[] createCommandLine(Path pdf, Path dir, PageRangeRequest request) {
+        //TODO parameterize all the things; mutool path, colorspace and size and format and...
+        List<String> args = new ArrayList<>();
+        args.add("mutool");
+        args.add("convert");
+        //ProcessBuilder does no shell splitting, so the flag and its value
+        //have to be separate entries
+        args.add("-O");
+        args.add("colorspace=gray");
+        args.add("-o");
+        args.add(ProcessUtils.escapeCommandLine(
+                dir.resolve("tika-mutool-render-%d.png").toAbsolutePath().toString()));
+        args.add(ProcessUtils.escapeCommandLine(pdf.toAbsolutePath().toString()));
+        //compare by equality, as PDFBoxRenderer does, not only by identity
+        if (!PageRangeRequest.RENDER_ALL.equals(request)) {
+            StringBuilder sb = new StringBuilder();
+            for (int i = request.getFrom(); i <= request.getTo(); i++) {
+                if (sb.length() > 0) {
+                    sb.append(",");
+                }
+                sb.append(i);
+            }
+            args.add(sb.toString());
+        }
+        return args.toArray(new String[0]);
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDDocumentRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDDocumentRenderer.java
index 2c19d57c2..7cecd9a23 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDDocumentRenderer.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDDocumentRenderer.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.tika.renderer.pdf;
 
 import org.apache.tika.renderer.Renderer;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFBoxRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFBoxRenderer.java
index 31e5a9047..e5c5d8973 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFBoxRenderer.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFBoxRenderer.java
@@ -46,11 +46,11 @@ import org.apache.tika.metadata.Rendering;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.renderer.PageBasedRenderResults;
 import org.apache.tika.renderer.PageRangeRequest;
 import org.apache.tika.renderer.RenderRequest;
 import org.apache.tika.renderer.RenderResult;
 import org.apache.tika.renderer.RenderResults;
-import org.apache.tika.renderer.Renderer;
 import org.apache.tika.renderer.RenderingTracker;
 
 public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
@@ -95,7 +95,7 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
             pdDocument = PDDocument.load(is);
             mustClose = true;
         }
-        RenderResults results = new RenderResults(new TemporaryResources());
+        PageBasedRenderResults results = new PageBasedRenderResults(new TemporaryResources());
         try {
             for (RenderRequest renderRequest : requests) {
                 processRequest(renderRequest, pdDocument, metadata, parseContext, results);
@@ -110,7 +110,7 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
 
     private void processRequest(RenderRequest renderRequest, PDDocument pdDocument,
                                 Metadata metadata, ParseContext parseContext,
-                                RenderResults results) {
+                                PageBasedRenderResults results) {
         if (renderRequest == PageRangeRequest.RENDER_ALL || renderRequest.equals(PageRangeRequest.RENDER_ALL)) {
             renderRange(pdDocument, 1, pdDocument.getNumberOfPages(),
                     metadata, parseContext, results);
@@ -122,7 +122,7 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
     }
 
     private void renderRange(PDDocument pdDocument, int start, int endInclusive, Metadata metadata,
-                                    ParseContext parseContext, RenderResults results) {
+                                    ParseContext parseContext, PageBasedRenderResults results) {
         PDFRenderer renderer = new PDFRenderer(pdDocument);
         RenderingTracker tracker = parseContext.get(RenderingTracker.class);
         if (tracker == null) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFRenderingState.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFRenderingState.java
index 2de00115b..51ea0ae96 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFRenderingState.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFRenderingState.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package org.apache.tika.renderer.pdf;
 
 import org.apache.tika.io.TikaInputStream;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
index 326d625e7..ed61e2a02 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
@@ -23,10 +23,6 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.file.Files;
-import java.nio.file.OpenOption;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.nio.file.StandardOpenOption;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -63,6 +59,8 @@ public class PDFRenderingTest extends TikaTest {
 
         assertEquals(1, embedded.size());
         assertTrue(embedded.containsKey(0));
+        //what else can we do to test this?  File type == tiff? Run OCR?
+        assertTrue(embedded.get(0).length > 1000);
 
         assertEquals(2, metadataList.size());
         Metadata tiffMetadata = metadataList.get(1);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
index 13f946781..5b1351662 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
@@ -18,16 +18,6 @@
 <properties>
     <parsers>
         <parser class="org.apache.tika.parser.DefaultParser"/>
-        <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
-            <params>
-                <param name="maxFileSizeToOcr" type="long">100</param>
-            </params>
-        </parser>
-        <parser class="org.apache.tika.parser.pdf.PDFParser">
-            <params>
-                <param name="extractInlineImages" type="bool">false</param>
-            </params>
-        </parser>
     </parsers>
     <renderers>
         <renderer class="org.apache.tika.renderer.pdf.PDFBoxRenderer"/>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
index 111825101..00cccbdc9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
@@ -26,14 +26,20 @@ import org.junit.jupiter.api.Test;
 import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParserConfig;
 
 public class TSDParserTest extends TikaTest {
 
     @Test
     public void testBrokenPdf() throws Exception {
+        ParseContext parseContext = new ParseContext();
+        PDFParserConfig config = new PDFParserConfig();
+        config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
+        parseContext.set(PDFParserConfig.class, config);
         //make sure that embedded file appears in list
         //and make sure embedded exception is recorded
-        List<Metadata> list = getRecursiveMetadata("testTSD_broken_pdf.tsd");
+        List<Metadata> list = getRecursiveMetadata("testTSD_broken_pdf.tsd", parseContext);
         assertEquals(2, list.size());
         assertEquals("application/pdf", list.get(1).get(Metadata.CONTENT_TYPE));
         assertNotNull(list.get(1).get(TikaCoreProperties.EMBEDDED_EXCEPTION));
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 8c8d4d068..bac5fc6a9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -46,6 +46,7 @@ import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.external.ExternalParser;
 import org.apache.tika.parser.ocr.TesseractOCRConfig;
 import org.apache.tika.parser.ocr.TesseractOCRParser;
 import org.apache.tika.parser.xml.XMLProfiler;
@@ -62,6 +63,8 @@ public class PDFParserTest extends TikaTest {
     public static Level PDFBOX_LOG_LEVEL = Level.INFO;
     private static Boolean hasTesseract = null;
 
+    private static Boolean hasMuPDF = null;
+
     public static boolean canRunOCR() throws TikaConfigException {
         if (hasTesseract != null) {
             return hasTesseract;
@@ -70,6 +73,14 @@ public class PDFParserTest extends TikaTest {
         return hasTesseract;
     }
 
+    /**
+     * Returns the result of {@code ExternalParser.check} for {@code mutool -v};
+     * the check runs on the first call only and the answer is cached afterwards.
+     */
+    public static boolean hasMuPDF() throws TikaConfigException {
+        if (hasMuPDF == null) {
+            hasMuPDF = ExternalParser.check(new String[]{"mutool", "-v"});
+        }
+        return hasMuPDF;
+    }
+
     @BeforeAll
     public static void setup() {
         //remember default logging level, but turn off for PDFParserTest
@@ -440,4 +451,20 @@ public class PDFParserTest extends TikaTest {
         }
     }
 
+    /**
+     * Parses testOCR.pdf with a parser built from tika-rendering-mupdf-config.xml
+     * and checks that the extracted text contains "Happy". The test is skipped
+     * unless both canRunOCR() and hasMuPDF() return true.
+     */
+    @Test
+    public void testMuPDFInOCR() throws Exception {
+        //TODO -- need to add "rendered by" to confirm that mutool was actually called
+        //and that there wasn't some fallback to PDFBox in the PDFParser
+        assumeTrue(canRunOCR(), "can run OCR");
+        assumeTrue(hasMuPDF(), "has mupdf");
+        try (InputStream configStream = getResourceAsStream(
+                "/configs/tika-rendering-mupdf-config.xml")) {
+            assertNotNull(configStream);
+            Parser mupdfParser = new AutoDetectParser(new TikaConfig(configStream));
+            InputStream pdfStream = getResourceAsStream("/test-documents/testOCR.pdf");
+            String extracted = getText(pdfStream, mupdfParser);
+            assertContains("Happy", extracted.trim());
+        }
+    }
+
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-rendering-mupdf-config.xml
similarity index 67%
copy from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
copy to tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-rendering-mupdf-config.xml
index 13f946781..1034d05eb 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-rendering-mupdf-config.xml
@@ -18,18 +18,8 @@
 <properties>
     <parsers>
         <parser class="org.apache.tika.parser.DefaultParser"/>
-        <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
-            <params>
-                <param name="maxFileSizeToOcr" type="long">100</param>
-            </params>
-        </parser>
-        <parser class="org.apache.tika.parser.pdf.PDFParser">
-            <params>
-                <param name="extractInlineImages" type="bool">false</param>
-            </params>
-        </parser>
     </parsers>
     <renderers>
-        <renderer class="org.apache.tika.renderer.pdf.PDFBoxRenderer"/>
+        <renderer class="org.apache.tika.renderer.pdf.MuPDFRenderer"/>
     </renderers>
 </properties>