You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/03 17:56:28 UTC
[tika] 02/02: TIKA-3571 -- clean build, add mutool renderer
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3571
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 6804d53cdf7c0a55a67175c6d0cc6b48e080692c
Author: tallison <ta...@apache.org>
AuthorDate: Tue May 3 13:55:59 2022 -0400
TIKA-3571 -- clean build, add mutool renderer
---
.../java/org/apache/tika/config/TikaConfig.java | 9 ++
.../apache/tika/renderer/CompositeRenderer.java | 3 +
.../tika/renderer/PageBasedRenderResults.java | 50 +++++++
.../org/apache/tika/renderer/RenderRequest.java | 6 +-
.../org/apache/tika/renderer/RenderingState.java | 16 +++
.../org/apache/tika/renderer/RenderingTracker.java | 16 +++
.../tika-parser-pdf-module/pom.xml | 2 +-
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 140 ++++++++++++-------
.../java/org/apache/tika/parser/pdf/OCR2XHTML.java | 2 -
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 2 -
.../tika/parser/pdf/PDFMarkedContent2XHTML.java | 1 -
.../java/org/apache/tika/parser/pdf/PDFParser.java | 19 ++-
.../apache/tika/parser/pdf/PDFParserConfig.java | 120 ++++++++---------
.../tika/parser/pdf/TextOnlyPDFRenderer.java | 106 +++++++++++++++
.../apache/tika/renderer/pdf/MuPDFRenderer.java | 149 +++++++++++++++++++++
.../tika/renderer/pdf/PDDocumentRenderer.java | 16 +++
.../apache/tika/renderer/pdf/PDFBoxRenderer.java | 8 +-
.../tika/renderer/pdf/PDFRenderingState.java | 16 +++
.../apache/tika/parser/pdf/PDFRenderingTest.java | 6 +-
.../tika/parser/pdf/tika-rendering-config.xml | 10 --
.../apache/tika/parser/crypto/TSDParserTest.java | 8 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 27 ++++
.../configs/tika-rendering-mupdf-config.xml} | 12 +-
23 files changed, 588 insertions(+), 156 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index d16f6f171..e7c212f87 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -895,6 +895,15 @@ public class TikaConfig {
MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
// Try the possible default and composite parser constructors
+ if (parser == null) {
+ try {
+ c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class,
+ Collection.class, EncodingDetector.class, Renderer.class);
+ parser = c.newInstance(registry, loader, excludeParsers, encodingDetector, renderer);
+ } catch (NoSuchMethodException me) {
+ //swallow
+ }
+ }
if (parser == null) {
try {
c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class,
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java b/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java
index b5fb2acbb..a98d39c97 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java
@@ -78,6 +78,9 @@ public class CompositeRenderer implements Renderer, Initializable {
return renderer.render(is, metadata, parseContext, requests);
}
+ public Renderer getLeafRenderer(MediaType mt) {
+ return rendererMap.get(mt);
+ }
@Override
public void initialize(Map<String, Param> params) throws TikaConfigException {
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java b/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java
new file mode 100644
index 000000000..0c238b60d
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.metadata.Rendering;
+
+public class PageBasedRenderResults extends RenderResults {
+
+ Map<Integer, List<RenderResult>> results = new HashMap<>();
+
+ public PageBasedRenderResults(TemporaryResources tmp) {
+ super(tmp);
+ }
+ public void add(RenderResult result) {
+ Integer page = result.getMetadata().getInt(Rendering.PAGE_NUMBER);
+ if (page != null) {
+ List<RenderResult> pageResults = results.get(page);
+ if (pageResults == null) {
+ pageResults = new ArrayList<>();
+ results.put(page, pageResults);
+ }
+ pageResults.add(result);
+ }
+ super.add(result);
+ }
+
+ public List<RenderResult> getPage(int pageNumber) {
+ return results.get(pageNumber);
+ }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java
index 4e1f2f3cf..3277d866a 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java
@@ -17,7 +17,11 @@
package org.apache.tika.renderer;
/**
- * Empty interface for requests to a renderer.
+ * Empty interface for requests to a renderer. Different
+ * file formats and different use cases will have different types of requests.
+ * For page based, it could be a page range (render the full pages from 2 to 5);
+ * or it could be a single page with an x-y bounding box. For video files,
+ * it could be a temporal offset or a temporal offset with an x-y bounding box.
*/
public interface RenderRequest {
}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java
index 1b3baf44e..ed8250065 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.tika.renderer;
/**
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java
index 992b86f28..49c775e69 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.tika.renderer;
/**
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml
index 108a0d423..be05f67b1 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml
@@ -78,7 +78,7 @@
<dependency>
<groupId>com.github.jai-imageio</groupId>
<artifactId>jai-imageio-core</artifactId>
- <version>1.3.1</version>
+ <version>${imageio.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index cb40569ff..344756dd0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -92,7 +92,6 @@ import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.util.Vector;
-import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
@@ -110,10 +109,13 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.renderer.CompositeRenderer;
+import org.apache.tika.renderer.PageBasedRenderResults;
import org.apache.tika.renderer.PageRangeRequest;
import org.apache.tika.renderer.RenderResult;
-import org.apache.tika.renderer.RenderResults;
-import org.apache.tika.renderer.RenderingState;
+import org.apache.tika.renderer.Renderer;
+import org.apache.tika.renderer.RenderingTracker;
+import org.apache.tika.renderer.pdf.PDDocumentRenderer;
import org.apache.tika.renderer.pdf.PDFRenderingState;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
@@ -299,9 +301,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
private void parseMetadata(InputStream stream, Metadata embeddedMetadata)
throws IOException, SAXException {
try {
- embeddedDocumentExtractor
- .parseEmbedded(stream, new EmbeddedContentHandler(xhtml), embeddedMetadata,
- true);
+ embeddedDocumentExtractor.parseEmbedded(stream, new EmbeddedContentHandler(xhtml),
+ embeddedMetadata, true);
} catch (IOException e) {
handleCatchableIOE(e);
}
@@ -324,8 +325,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
- private void extractFilesfromEFTree(PDNameTreeNode efTree, Map<String,
- PDComplexFileSpecification> embeddedFileNames, int depth) throws IOException {
+ private void extractFilesfromEFTree(PDNameTreeNode efTree,
+ Map<String, PDComplexFileSpecification> embeddedFileNames,
+ int depth) throws IOException {
if (depth > MAX_RECURSION_DEPTH) {
throw new IOException("Hit max recursion depth");
}
@@ -440,9 +442,8 @@ class AbstractPDF2XHTML extends PDFTextStripper {
xhtml.endElement("div");
try {
- embeddedDocumentExtractor
- .parseEmbedded(stream, new EmbeddedContentHandler(xhtml), embeddedMetadata,
- false);
+ embeddedDocumentExtractor.parseEmbedded(stream, new EmbeddedContentHandler(xhtml),
+ embeddedMetadata, false);
} finally {
IOUtils.closeQuietly(stream);
}
@@ -512,22 +513,41 @@ class AbstractPDF2XHTML extends PDFTextStripper {
noContextRenderCurrentPage(parseContext, tmpResources);
}
//if the full document has already been rendered, then reuse that file
- RenderResults results = renderingState.getRenderResults();
+ //TODO: we need to prevent this if only a portion of the page or portions
+ //of the page have been rendered.
+ //TODO: we should also figure out how to not reuse the rendering if
+ //the user wants to render twice (say, full color to display to users, but
+ //grayscale for (notionally?) better OCR).
+ PageBasedRenderResults results = (PageBasedRenderResults) renderingState.getRenderResults();
if (results != null) {
- for (RenderResult result : results.getResults()) {
- int pageNo = result.getMetadata().getInt(Rendering.PAGE_NUMBER);
- if (getCurrentPageNo() == pageNo) {
- return result;
- }
+ List<RenderResult> pageResults = results.getPage(getCurrentPageNo());
+ if (pageResults.size() == 1) {
+ return pageResults.get(0);
}
}
- //use the regular renderer if it isn't "no_text"
- if (config.getOcrRenderingStrategy() != PDFParserConfig.OCR_RENDERING_STRATEGY.NO_TEXT) {
+ Renderer thisRenderer = getPDFRenderer(config.getRenderer());
+ //if there's a configured renderer and if the rendering strategy is "all"
+ if (thisRenderer != null &&
+ config.getOcrRenderingStrategy() == PDFParserConfig.OCR_RENDERING_STRATEGY.ALL) {
PageRangeRequest pageRangeRequest =
new PageRangeRequest(getCurrentPageNo(), getCurrentPageNo());
- try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
- tis.setOpenContainer(pdDocument);
- return config.getRenderer().render(tis, metadata, parseContext, pageRangeRequest)
+ if (thisRenderer instanceof PDDocumentRenderer) {
+ try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
+ Metadata m = new Metadata();
+ m.set(TikaCoreProperties.TYPE, PDFParser.MEDIA_TYPE.toString());
+ tis.setOpenContainer(pdDocument);
+ return thisRenderer.render(tis, m, parseContext, pageRangeRequest)
+ .getResults().get(0);
+ }
+ } else {
+ Metadata m = new Metadata();
+ m.set(TikaCoreProperties.TYPE, PDFParser.MEDIA_TYPE.toString());
+ PDFRenderingState state = context.get(PDFRenderingState.class);
+ if (state == null) {
+ throw new IllegalArgumentException("RenderingState must not be null");
+ }
+ return thisRenderer
+ .render(state.getTikaInputStream(), m, parseContext, pageRangeRequest)
.getResults().get(0);
}
} else {
@@ -535,16 +555,47 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
+ private Renderer getPDFRenderer(Renderer renderer) {
+ if (renderer == null) {
+ return renderer;
+ }
+ if (renderer instanceof CompositeRenderer) {
+ return ((CompositeRenderer)renderer).getLeafRenderer(PDFParser.MEDIA_TYPE);
+ } else if (renderer.getSupportedTypes(context).contains(PDFParser.MEDIA_TYPE)) {
+ return renderer;
+ }
+ return null;
+ }
+
private RenderResult noContextRenderCurrentPage(ParseContext parseContext,
- TemporaryResources tmpResources)
+ TemporaryResources tmpResources)
throws IOException, TikaException {
- PDFRenderer renderer =
- config.getOcrRenderingStrategy() == PDFParserConfig.OCR_RENDERING_STRATEGY.NO_TEXT ?
- new NoTextPDFRenderer(pdDocument) : new PDFRenderer(pdDocument);
+ PDFRenderer renderer = null;
+ switch (config.getOcrRenderingStrategy()) {
+ case NO_TEXT:
+ renderer = new NoTextPDFRenderer(pdDocument);
+ break;
+ case TEXT_ONLY:
+ renderer = new TextOnlyPDFRenderer(pdDocument);
+ break;
+ case ALL:
+ renderer = new PDFRenderer(pdDocument);
+ break;
+ }
int dpi = config.getOcrDPI();
Path tmpFile = null;
+ Metadata m = new Metadata();
+ m.set(Rendering.PAGE_NUMBER, pageIndex + 1);
+
+ RenderingTracker renderingTracker = parseContext.get(RenderingTracker.class);
+ if (renderingTracker == null) {
+ renderingTracker = new RenderingTracker();
+ parseContext.set(RenderingTracker.class, renderingTracker);
+ }
+ int id = renderingTracker.getNextId();
+
try {
BufferedImage image =
renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType());
@@ -563,9 +614,10 @@ class AbstractPDF2XHTML extends PDFTextStripper {
//need to have a wide catch
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM,
ExceptionUtils.getStackTrace(e));
- return null;
+
+ return new RenderResult(RenderResult.STATUS.EXCEPTION, id, null, m);
}
- return null;
+ return new RenderResult(RenderResult.STATUS.SUCCESS, id, tmpFile, m);
}
@Override
@@ -593,7 +645,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
if (fann.getFile() instanceof PDComplexFileSpecification) {
handlePDComplexFileSpec(fann.getAttachmentName(),
"annotationFileAttachment",
- (PDComplexFileSpecification)fann.getFile());
+ (PDComplexFileSpecification) fann.getFile());
}
} else if (annotation instanceof PDAnnotationWidget) {
handleWidget((PDAnnotationWidget) annotation);
@@ -606,11 +658,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
//subtype is U3D or PRC or model/ (prefix for model mime type)
metadata.set(PDF.HAS_3D, true);
}
- for (COSDictionary fileSpec :
- findFileSpecs(annotation.getCOSObject())) {
+ for (COSDictionary fileSpec : findFileSpecs(annotation.getCOSObject())) {
PDComplexFileSpecification cfs = new PDComplexFileSpecification(fileSpec);
- handlePDComplexFileSpec(cfs.getFilename(),
- annotationSubtype, cfs);
+ handlePDComplexFileSpec(cfs.getFilename(), annotationSubtype, cfs);
}
}
// TODO: remove once PDFBOX-1143 is fixed:
@@ -665,13 +715,16 @@ class AbstractPDF2XHTML extends PDFTextStripper {
boolean unmappedExceedsLimit = false;
if (totalCharsPerPage > config.getOcrStrategyAuto().getTotalCharsPerPage()) {
// There are enough characters to not have to do OCR. Check number of unmapped characters
- final float percentUnmapped = (float) unmappedUnicodeCharsPerPage / totalCharsPerPage;
- final float unmappedCharacterLimit = config.getOcrStrategyAuto().getUnmappedUnicodeCharsPerPage();
- unmappedExceedsLimit = (unmappedCharacterLimit < 1)
- ? percentUnmapped > unmappedCharacterLimit
- : unmappedUnicodeCharsPerPage > unmappedCharacterLimit;
+ final float percentUnmapped =
+ (float) unmappedUnicodeCharsPerPage / totalCharsPerPage;
+ final float unmappedCharacterLimit =
+ config.getOcrStrategyAuto().getUnmappedUnicodeCharsPerPage();
+ unmappedExceedsLimit = (unmappedCharacterLimit < 1) ?
+ percentUnmapped > unmappedCharacterLimit :
+ unmappedUnicodeCharsPerPage > unmappedCharacterLimit;
}
- if (totalCharsPerPage <= config.getOcrStrategyAuto().getTotalCharsPerPage() || unmappedExceedsLimit) {
+ if (totalCharsPerPage <= config.getOcrStrategyAuto().getTotalCharsPerPage() ||
+ unmappedExceedsLimit) {
doOCROnCurrentPage(AUTO);
}
}
@@ -710,14 +763,12 @@ class AbstractPDF2XHTML extends PDFTextStripper {
return PDFDOMUtil.findType(cosDict, types, MAX_RECURSION_DEPTH);
}
- private void handlePDComplexFileSpec(String attachmentName,
- String annotationType,
+ private void handlePDComplexFileSpec(String attachmentName, String annotationType,
PDComplexFileSpecification fileSpec) throws IOException {
try {
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "source", "source", "CDATA", annotationType);
- extractMultiOSPDEmbeddedFiles(attachmentName, fileSpec,
- attributes);
+ extractMultiOSPDEmbeddedFiles(attachmentName, fileSpec, attributes);
} catch (SAXException e) {
throw new IOException("file embedded in annotation sax exception", e);
} catch (TikaException e) {
@@ -1176,8 +1227,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
enum ActionTrigger {
- AFTER_DOCUMENT_PRINT, AFTER_DOCUMENT_SAVE, ANNOTATION_CURSOR_ENTERS,
- ANNOTATION_CURSOR_EXIT,
+ AFTER_DOCUMENT_PRINT, AFTER_DOCUMENT_SAVE, ANNOTATION_CURSOR_ENTERS, ANNOTATION_CURSOR_EXIT,
ANNOTATION_LOSE_INPUT_FOCUS, ANNOTATION_MOUSE_CLICK, ANNOTATION_MOUSE_RELEASED,
ANNOTATION_PAGE_CLOSED, ANNOTATION_PAGE_NO_LONGER_VISIBLE, ANNOTATION_PAGE_OPENED,
ANNOTATION_PAGE_VISIBLE, ANNOTATION_RECEIVES_FOCUS, ANNOTATION_WIDGET,
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
index 534eea324..2658a484a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
@@ -23,13 +23,11 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
-import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.renderer.RenderResults;
import org.apache.tika.sax.XHTMLContentHandler;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index e493ea3ff..602a8823e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -34,13 +34,11 @@ import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
-import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.renderer.RenderResults;
import org.apache.tika.sax.XHTMLContentHandler;
/**
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
index 5c5ec6c03..3e4e1bf64 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
@@ -41,7 +41,6 @@ import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructur
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.text.PDFMarkedContentExtractor;
import org.apache.pdfbox.text.TextPosition;
-import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index a16381437..28f796157 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -70,7 +70,6 @@ import org.apache.tika.renderer.PageRangeRequest;
import org.apache.tika.renderer.RenderResult;
import org.apache.tika.renderer.RenderResults;
import org.apache.tika.renderer.Renderer;
-import org.apache.tika.renderer.pdf.PDDocumentRenderer;
import org.apache.tika.renderer.pdf.PDFBoxRenderer;
import org.apache.tika.renderer.pdf.PDFRenderingState;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -114,7 +113,7 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
* @deprecated Supply a {@link PasswordProvider} on the {@link ParseContext} instead
*/
public static final String PASSWORD = "org.apache.tika.parser.pdf.password";
- private static final MediaType MEDIA_TYPE = MediaType.application("pdf");
+ protected static final MediaType MEDIA_TYPE = MediaType.application("pdf");
/**
* Serial version UID
*/
@@ -193,10 +192,17 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
metadata.set(PDF.IS_ENCRYPTED, "true");
throw new EncryptedDocumentException(e);
} finally {
- //replace the one that was here
- context.set(PDFRenderingState.class, incomingRenderingState);
- if (pdfDocument != null) {
- pdfDocument.close();
+ PDFRenderingState currState = context.get(PDFRenderingState.class);
+ try {
+ if (currState != null && currState.getRenderResults() != null) {
+ currState.getRenderResults().close();
+ }
+ if (pdfDocument != null) {
+ pdfDocument.close();
+ }
+ } finally {
+ //replace the one that was here
+ context.set(PDFRenderingState.class, incomingRenderingState);
}
}
}
@@ -219,7 +225,6 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
if (config.getImageStrategy() != PDFParserConfig.IMAGE_STRATEGY.RENDERED_PAGES) {
return;
}
- Renderer renderer = config.getRenderer();
RenderResults renderResults = null;
try {
renderResults = renderPDF(tstream, context, config);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 72cc0e050..fb8a315ae 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -21,6 +21,7 @@ import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.util.HashSet;
import java.util.Locale;
+import java.util.Objects;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -798,70 +799,51 @@ public class PDFParserConfig implements Serializable {
if (this == o) {
return true;
}
- if (!(o instanceof PDFParserConfig)) {
+ if (o == null || getClass() != o.getClass()) {
return false;
}
-
PDFParserConfig config = (PDFParserConfig) o;
+ return enableAutoSpace == config.enableAutoSpace &&
+ suppressDuplicateOverlappingText == config.suppressDuplicateOverlappingText &&
+ extractAnnotationText == config.extractAnnotationText &&
+ sortByPosition == config.sortByPosition &&
+ extractAcroFormContent == config.extractAcroFormContent &&
+ extractBookmarksText == config.extractBookmarksText &&
+ extractInlineImages == config.extractInlineImages &&
+ extractInlineImageMetadataOnly == config.extractInlineImageMetadataOnly &&
+ extractUniqueInlineImagesOnly == config.extractUniqueInlineImagesOnly &&
+ extractMarkedContent == config.extractMarkedContent &&
+ Float.compare(config.dropThreshold, dropThreshold) == 0 &&
+ ifXFAExtractOnlyXFA == config.ifXFAExtractOnlyXFA && ocrDPI == config.ocrDPI &&
+ Float.compare(config.ocrImageQuality, ocrImageQuality) == 0 &&
+ catchIntermediateIOExceptions == config.catchIntermediateIOExceptions &&
+ extractActions == config.extractActions &&
+ extractFontNames == config.extractFontNames &&
+ maxMainMemoryBytes == config.maxMainMemoryBytes && setKCMS == config.setKCMS &&
+ detectAngles == config.detectAngles &&
+ Objects.equals(userConfigured, config.userConfigured) &&
+ Objects.equals(averageCharTolerance, config.averageCharTolerance) &&
+ Objects.equals(spacingTolerance, config.spacingTolerance) &&
+ ocrStrategy == config.ocrStrategy &&
+ Objects.equals(ocrStrategyAuto, config.ocrStrategyAuto) &&
+ ocrRenderingStrategy == config.ocrRenderingStrategy &&
+ ocrImageType == config.ocrImageType &&
+ Objects.equals(ocrImageFormatName, config.ocrImageFormatName) &&
+ imageStrategy == config.imageStrategy &&
+ Objects.equals(accessChecker, config.accessChecker) &&
+ Objects.equals(renderer, config.renderer);
+ }
- if (isEnableAutoSpace() != config.isEnableAutoSpace()) {
- return false;
- }
- if (isSuppressDuplicateOverlappingText() != config.isSuppressDuplicateOverlappingText()) {
- return false;
- }
- if (isExtractAnnotationText() != config.isExtractAnnotationText()) {
- return false;
- }
- if (isSortByPosition() != config.isSortByPosition()) {
- return false;
- }
- if (isExtractAcroFormContent() != config.isExtractAcroFormContent()) {
- return false;
- }
- if (isExtractBookmarksText() != config.isExtractBookmarksText()) {
- return false;
- }
- if (isExtractInlineImages() != config.isExtractInlineImages()) {
- return false;
- }
- if (isExtractUniqueInlineImagesOnly() != config.isExtractUniqueInlineImagesOnly()) {
- return false;
- }
- if (isIfXFAExtractOnlyXFA() != config.isIfXFAExtractOnlyXFA()) {
- return false;
- }
- if (getOcrDPI() != config.getOcrDPI()) {
- return false;
- }
- if (isCatchIntermediateIOExceptions() != config.isCatchIntermediateIOExceptions()) {
- return false;
- }
- if (!getAverageCharTolerance().equals(config.getAverageCharTolerance())) {
- return false;
- }
- if (!getSpacingTolerance().equals(config.getSpacingTolerance())) {
- return false;
- }
- if (!getDropThreshold().equals(config.getDropThreshold())) {
- return false;
- }
- if (!getOcrStrategy().equals(config.getOcrStrategy())) {
- return false;
- }
- if (getOcrImageType() != config.getOcrImageType()) {
- return false;
- }
- if (!getOcrImageFormatName().equals(config.getOcrImageFormatName())) {
- return false;
- }
- if (isExtractActions() != config.isExtractActions()) {
- return false;
- }
- if (!getAccessChecker().equals(config.getAccessChecker())) {
- return false;
- }
- return getMaxMainMemoryBytes() == config.getMaxMainMemoryBytes();
+ @Override
+ public int hashCode() {
+ return Objects.hash(userConfigured, enableAutoSpace, suppressDuplicateOverlappingText,
+ extractAnnotationText, sortByPosition, extractAcroFormContent, extractBookmarksText,
+ extractInlineImages, extractInlineImageMetadataOnly, extractUniqueInlineImagesOnly,
+ extractMarkedContent, averageCharTolerance, spacingTolerance, dropThreshold,
+ ifXFAExtractOnlyXFA, ocrStrategy, ocrStrategyAuto, ocrRenderingStrategy, ocrDPI,
+ ocrImageType, ocrImageFormatName, ocrImageQuality, imageStrategy, accessChecker,
+ catchIntermediateIOExceptions, extractActions, extractFontNames, maxMainMemoryBytes,
+ setKCMS, detectAngles, renderer);
}
public void setRenderer(Renderer renderer) {
@@ -948,18 +930,22 @@ public class PDFParserConfig implements Serializable {
}
public enum OCR_RENDERING_STRATEGY {
- NO_TEXT, ALL; //AUTO?
- // TODO: TEXT_ONLY be useful in instances where the unicode mappings are
- // corrupt/non-existent
+ NO_TEXT, TEXT_ONLY, ALL; //AUTO?
private static OCR_RENDERING_STRATEGY parse(String s) {
if (s == null) {
- return NO_TEXT;
- } else if ("no_text".equals(s.toLowerCase(Locale.ROOT))) {
- return NO_TEXT;
- } else if ("all".equals(s.toLowerCase(Locale.ROOT))) {
return ALL;
}
+ String lc = s.toLowerCase(Locale.US);
+ switch (lc) {
+ case "text_only":
+ return TEXT_ONLY;
+ case "no_text":
+ return NO_TEXT;
+ case "all":
+ return ALL;
+ }
+
StringBuilder sb = new StringBuilder();
sb.append("I regret that I don't recognize '").append(s);
sb.append("' as an OCR_STRATEGY. I only recognize:");
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/TextOnlyPDFRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/TextOnlyPDFRenderer.java
new file mode 100644
index 000000000..f282d124c
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/TextOnlyPDFRenderer.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.awt.Graphics2D;
+import java.awt.geom.Point2D;
+import java.io.IOException;
+
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.rendering.PageDrawer;
+import org.apache.pdfbox.rendering.PageDrawerParameters;
+
+/**
+ * This class extends the PDFRenderer to render only the textual
+ * elements
+ */
+public class TextOnlyPDFRenderer extends PDFRenderer {
+    public TextOnlyPDFRenderer(PDDocument document) {
+        super(document);
+    }
+
+    /**
+     * Creates the text-only page drawer, carrying over this renderer's annotation filter.
+     */
+    @Override
+    protected PageDrawer createPageDrawer(PageDrawerParameters parameters) throws IOException {
+        PageDrawer pageDrawer = new TextOnlyPageDrawer(parameters);
+        pageDrawer.setAnnotationFilter(getAnnotationsFilter());
+        return pageDrawer;
+    }
+
+    //static: nothing in the drawer refers to the enclosing renderer instance
+    private static class TextOnlyPageDrawer extends PageDrawer {
+        public TextOnlyPageDrawer(PageDrawerParameters parameters) throws IOException {
+            super(parameters);
+        }
+
+        @Override
+        protected void transferClip(Graphics2D graphics) {
+        }
+
+        @Override
+        public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) {
+            //intentionally empty: path construction is skipped
+        }
+
+        @Override
+        public void strokePath() throws IOException {
+            //intentionally empty: paths are not painted
+        }
+
+        @Override
+        public void fillPath(int windingRule) throws IOException {
+        }
+
+        @Override
+        public void fillAndStrokePath(int windingRule) throws IOException {
+        }
+
+        @Override
+        public void clip(int windingRule) {
+        }
+
+        @Override
+        public void lineTo(float x, float y) {
+        }
+
+        @Override
+        public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) {
+        }
+
+        @Override
+        public void closePath() {
+        }
+
+        @Override
+        public void endPath() {
+        }
+
+        @Override
+        public void drawImage(PDImage pdImage) throws IOException {
+            //intentionally empty: images are not painted
+        }
+
+        @Override
+        public void shadingFill(COSName shadingName) throws IOException {
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/MuPDFRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/MuPDFRenderer.java
new file mode 100644
index 000000000..983934677
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/MuPDFRenderer.java
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer.pdf;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Rendering;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.renderer.PageBasedRenderResults;
+import org.apache.tika.renderer.PageRangeRequest;
+import org.apache.tika.renderer.RenderRequest;
+import org.apache.tika.renderer.RenderResult;
+import org.apache.tika.renderer.RenderResults;
+import org.apache.tika.renderer.Renderer;
+import org.apache.tika.renderer.RenderingTracker;
+import org.apache.tika.utils.FileProcessResult;
+import org.apache.tika.utils.ProcessUtils;
+
+/**
+ * Renders PDF pages to PNG files by running MuPDF's <code>mutool convert</code>
+ * as an external process. Only {@link PageRangeRequest}s are handled.
+ */
+public class MuPDFRenderer implements Renderer {
+
+    static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.application("pdf"));
+
+    //matches the per-page files that mutool writes; group 1 is the page number
+    private static final Pattern RENDERED_PAGE =
+            Pattern.compile("tika-mutool-render-(\\d+)\\.png");
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Gets a file path for the stream and runs every request against that file.
+     * The {@link TemporaryResources} created here are handed to the returned results.
+     */
+    @Override
+    public RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext,
+                                RenderRequest... requests) throws IOException, TikaException {
+        TemporaryResources tmp = new TemporaryResources();
+        PageBasedRenderResults results = new PageBasedRenderResults(tmp);
+        Path path = TikaInputStream.get(is, tmp).getPath();
+        //NOTE(review): tmp is not closed here if a request throws -- confirm cleanup
+        for (RenderRequest request : requests) {
+            renderRequest(path, metadata, parseContext, request, results, tmp);
+        }
+        return results;
+    }
+
+    /**
+     * Runs mutool for one request and adds a {@link RenderResult} for each page image
+     * found in the output directory.
+     *
+     * @throws TikaException if the request is not a {@link PageRangeRequest} or mutool
+     *                       exits with a non-zero value
+     * @throws IOException if the output directory cannot be created or listed
+     */
+    private RenderResults renderRequest(Path pdf, Metadata metadata, ParseContext parseContext,
+                                        RenderRequest request, RenderResults results,
+                                        TemporaryResources tmp) throws TikaException, IOException {
+        if (! (request instanceof PageRangeRequest)) {
+            throw new TikaException("I regret that this renderer can only handle " +
+                    "PageRangeRequests, not " + request.getClass());
+        }
+        PageRangeRequest rangeRequest = (PageRangeRequest)request;
+        RenderingTracker tracker = parseContext.get(RenderingTracker.class);
+        if (tracker == null) {
+            tracker = new RenderingTracker();
+            parseContext.set(RenderingTracker.class, tracker);
+        }
+
+        Path dir = Files.createTempDirectory("tika-render-");
+        //TODO -- this assumes files have been deleted first
+        //do something smarter
+        tmp.addResource(new Closeable() {
+            @Override
+            public void close() throws IOException {
+                Files.delete(dir);
+            }
+        });
+        //TODO -- run mutool pages to get page sizes
+        //and then use that information in the -O to get proper scaling
+        //etc.
+        // This would also allow us to run on a single page at a time if that's of any interest
+        String[] args = createCommandLine(pdf, dir, rangeRequest);
+
+        ProcessBuilder builder = new ProcessBuilder();
+        builder.command(args);
+        //TODO: parameterize timeout
+        FileProcessResult result = ProcessUtils.execute(builder, 60000, 10, 1000);
+        if (result.getExitValue() != 0) {
+            throw new TikaException(result.getStderr());
+        }
+        File[] rendered = dir.toFile().listFiles();
+        if (rendered == null) {
+            //listFiles() returns null instead of throwing when the listing fails
+            throw new IOException("Couldn't list rendered pages in: " + dir);
+        }
+        //TODO -- fix this
+        Matcher m = RENDERED_PAGE.matcher("");
+        for (File f : rendered) {
+            String n = f.getName();
+            if (m.reset(n).find()) {
+                int pageIndex = Integer.parseInt(m.group(1));
+                Metadata renderMetadata = new Metadata();
+                renderMetadata.set(Rendering.PAGE_NUMBER, pageIndex);
+                renderMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+                        TikaCoreProperties.EmbeddedResourceType.RENDERING.name());
+                results.add(new RenderResult(RenderResult.STATUS.SUCCESS, tracker.getNextId(),
+                        f.toPath(), renderMetadata));
+            }
+        }
+
+        return results;
+    }
+
+    /**
+     * Builds the argument list for <code>mutool convert</code>; every entry reaches the
+     * process as exactly one argument.
+     */
+    private String[] createCommandLine(Path pdf, Path dir, PageRangeRequest request) {
+        //TODO parameterize all the things; mutool path, colorspace and size and format and...
+        List<String> args = new ArrayList<>();
+        args.add("mutool");
+        args.add("convert");
+        //ProcessBuilder does no shell-style splitting, so the flag and its value
+        //have to be separate entries
+        args.add("-O");
+        args.add("colorspace=gray");
+        args.add("-o");
+        args.add(
+                ProcessUtils.escapeCommandLine(
+                        dir.toAbsolutePath().toString() + "/" + "tika-mutool-render-%d.png"));
+        args.add(ProcessUtils.escapeCommandLine(pdf.toAbsolutePath().toString()));
+        if (request != PageRangeRequest.RENDER_ALL) {
+            StringBuilder sb = new StringBuilder();
+            int cnt = 0;
+            for (int i = request.getFrom(); i <= request.getTo(); i++) {
+                if (cnt++ > 0) {
+                    sb.append(",");
+                }
+                sb.append(i);
+            }
+            args.add(sb.toString());
+        }
+        return args.toArray(new String[0]);
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDDocumentRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDDocumentRenderer.java
index 2c19d57c2..7cecd9a23 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDDocumentRenderer.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDDocumentRenderer.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.tika.renderer.pdf;
import org.apache.tika.renderer.Renderer;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFBoxRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFBoxRenderer.java
index 31e5a9047..e5c5d8973 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFBoxRenderer.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFBoxRenderer.java
@@ -46,11 +46,11 @@ import org.apache.tika.metadata.Rendering;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.renderer.PageBasedRenderResults;
import org.apache.tika.renderer.PageRangeRequest;
import org.apache.tika.renderer.RenderRequest;
import org.apache.tika.renderer.RenderResult;
import org.apache.tika.renderer.RenderResults;
-import org.apache.tika.renderer.Renderer;
import org.apache.tika.renderer.RenderingTracker;
public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
@@ -95,7 +95,7 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
pdDocument = PDDocument.load(is);
mustClose = true;
}
- RenderResults results = new RenderResults(new TemporaryResources());
+ PageBasedRenderResults results = new PageBasedRenderResults(new TemporaryResources());
try {
for (RenderRequest renderRequest : requests) {
processRequest(renderRequest, pdDocument, metadata, parseContext, results);
@@ -110,7 +110,7 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
private void processRequest(RenderRequest renderRequest, PDDocument pdDocument,
Metadata metadata, ParseContext parseContext,
- RenderResults results) {
+ PageBasedRenderResults results) {
if (renderRequest == PageRangeRequest.RENDER_ALL || renderRequest.equals(PageRangeRequest.RENDER_ALL)) {
renderRange(pdDocument, 1, pdDocument.getNumberOfPages(),
metadata, parseContext, results);
@@ -122,7 +122,7 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
}
private void renderRange(PDDocument pdDocument, int start, int endInclusive, Metadata metadata,
- ParseContext parseContext, RenderResults results) {
+ ParseContext parseContext, PageBasedRenderResults results) {
PDFRenderer renderer = new PDFRenderer(pdDocument);
RenderingTracker tracker = parseContext.get(RenderingTracker.class);
if (tracker == null) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFRenderingState.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFRenderingState.java
index 2de00115b..51ea0ae96 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFRenderingState.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFRenderingState.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
package org.apache.tika.renderer.pdf;
import org.apache.tika.io.TikaInputStream;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
index 326d625e7..ed61e2a02 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
@@ -23,10 +23,6 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
-import java.nio.file.OpenOption;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.nio.file.StandardOpenOption;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -63,6 +59,8 @@ public class PDFRenderingTest extends TikaTest {
assertEquals(1, embedded.size());
assertTrue(embedded.containsKey(0));
+ //what else can we do to test this? File type == tiff? Run OCR?
+ assertTrue(embedded.get(0).length > 1000);
assertEquals(2, metadataList.size());
Metadata tiffMetadata = metadataList.get(1);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
index 13f946781..5b1351662 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
@@ -18,16 +18,6 @@
<properties>
<parsers>
<parser class="org.apache.tika.parser.DefaultParser"/>
- <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
- <params>
- <param name="maxFileSizeToOcr" type="long">100</param>
- </params>
- </parser>
- <parser class="org.apache.tika.parser.pdf.PDFParser">
- <params>
- <param name="extractInlineImages" type="bool">false</param>
- </params>
- </parser>
</parsers>
<renderers>
<renderer class="org.apache.tika.renderer.pdf.PDFBoxRenderer"/>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
index 111825101..00cccbdc9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
@@ -26,14 +26,20 @@ import org.junit.jupiter.api.Test;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParserConfig;
public class TSDParserTest extends TikaTest {
@Test
public void testBrokenPdf() throws Exception {
+ ParseContext parseContext = new ParseContext();
+ PDFParserConfig config = new PDFParserConfig();
+ config.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
+ parseContext.set(PDFParserConfig.class, config);
//make sure that embedded file appears in list
//and make sure embedded exception is recorded
- List<Metadata> list = getRecursiveMetadata("testTSD_broken_pdf.tsd");
+ List<Metadata> list = getRecursiveMetadata("testTSD_broken_pdf.tsd", parseContext);
assertEquals(2, list.size());
assertEquals("application/pdf", list.get(1).get(Metadata.CONTENT_TYPE));
assertNotNull(list.get(1).get(TikaCoreProperties.EMBEDDED_EXCEPTION));
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 8c8d4d068..bac5fc6a9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -46,6 +46,7 @@ import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.ocr.TesseractOCRConfig;
import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.parser.xml.XMLProfiler;
@@ -62,6 +63,8 @@ public class PDFParserTest extends TikaTest {
public static Level PDFBOX_LOG_LEVEL = Level.INFO;
private static Boolean hasTesseract = null;
+ private static Boolean hasMuPDF = null;
+
public static boolean canRunOCR() throws TikaConfigException {
if (hasTesseract != null) {
return hasTesseract;
@@ -70,6 +73,14 @@ public class PDFParserTest extends TikaTest {
return hasTesseract;
}
+    public static boolean hasMuPDF() throws TikaConfigException {
+        if (hasMuPDF == null) {
+            //probe for the mutool executable once and cache the answer
+            hasMuPDF = ExternalParser.check(new String[]{"mutool", "-v"});
+        }
+        return hasMuPDF;
+    }
+
@BeforeAll
public static void setup() {
//remember default logging level, but turn off for PDFParserTest
@@ -440,4 +451,20 @@ public class PDFParserTest extends TikaTest {
}
}
+ @Test
+ public void testMuPDFInOCR() throws Exception {
+ //TODO -- need to add "rendered by" to confirm that mutool was actually called
+ //and that there wasn't some fallback to PDFBox in the PDFParser
+ assumeTrue(canRunOCR(), "can run OCR");
+ assumeTrue(hasMuPDF(), "has mupdf");
+ try (InputStream is = getResourceAsStream(
+ "/configs/tika-rendering-mupdf-config.xml")) {
+ assertNotNull(is);
+ TikaConfig tikaConfig = new TikaConfig(is);
+ Parser p = new AutoDetectParser(tikaConfig);
+ String text = getText(getResourceAsStream("/test-documents/testOCR.pdf"), p);
+ assertContains("Happy", text.trim());
+ }
+ }
+
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-rendering-mupdf-config.xml
similarity index 67%
copy from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
copy to tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-rendering-mupdf-config.xml
index 13f946781..1034d05eb 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-rendering-mupdf-config.xml
@@ -18,18 +18,8 @@
<properties>
<parsers>
<parser class="org.apache.tika.parser.DefaultParser"/>
- <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
- <params>
- <param name="maxFileSizeToOcr" type="long">100</param>
- </params>
- </parser>
- <parser class="org.apache.tika.parser.pdf.PDFParser">
- <params>
- <param name="extractInlineImages" type="bool">false</param>
- </params>
- </parser>
</parsers>
<renderers>
- <renderer class="org.apache.tika.renderer.pdf.PDFBoxRenderer"/>
+ <renderer class="org.apache.tika.renderer.pdf.MuPDFRenderer"/>
</renderers>
</properties>