You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/12 14:03:19 UTC
[tika] branch main updated: TIKA-3755 -- make renderresult more flexible to allow for an open container or an inputstream; add image strategy end of page
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 8ce439ce1 TIKA-3755 -- make renderresult more flexible to allow for an open container or an inputstream; add image strategy end of page
new 7b79e0b6e Merge remote-tracking branch 'origin/main' into main
8ce439ce1 is described below
commit 8ce439ce15b2f16292cac340f5d50b16c7b3c158
Author: tallison <ta...@apache.org>
AuthorDate: Thu May 12 10:02:08 2022 -0400
TIKA-3755 -- make renderresult more flexible to allow for an open container or an inputstream; add image strategy end of page
---
.../org/apache/tika/renderer/RenderResult.java | 42 +++++++++++++++++---
.../org/apache/tika/renderer/RenderResults.java | 8 +---
.../org/apache/tika/renderer/RenderingTracker.java | 3 ++
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 24 ++++++------
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 45 +++++++++++++++++++++-
.../java/org/apache/tika/parser/pdf/PDFParser.java | 29 ++++++++------
.../apache/tika/parser/pdf/PDFParserConfig.java | 28 ++++++++++++--
.../tika/renderer/pdf/mutool/MuPDFRenderer.java | 2 +-
.../tika/renderer/pdf/pdfbox/PDFBoxRenderer.java | 22 ++++++++---
.../apache/tika/parser/pdf/PDFRenderingTest.java | 9 +++--
.../tika/parser/pdf/tika-rendering-config.xml | 13 ++++++-
...nfig.xml => tika-rendering-per-page-config.xml} | 13 +++++--
12 files changed, 181 insertions(+), 57 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
index 888b0dd4c..3fd8d7d2c 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
@@ -16,11 +16,17 @@
*/
package org.apache.tika.renderer;
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
import java.nio.file.Path;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
-public class RenderResult {
+public class RenderResult implements Closeable {
public enum STATUS {
SUCCESS,
@@ -30,20 +36,40 @@ public class RenderResult {
private final STATUS status;
private final int id;
- private final Path path;
+
+ private final Object result;
+
//TODO: we're relying on metadata to bring in a bunch of info.
//Might be cleaner to add specific parameters for page number, embedded path, etc.?
private final Metadata metadata;
- public RenderResult(STATUS status, int id, Path path, Metadata metadata) {
+ TemporaryResources tmp = new TemporaryResources();
+
+ public RenderResult(STATUS status, int id, Object result, Metadata metadata) {
this.status = status;
this.id = id;
- this.path = path;
+ this.result = result;
this.metadata = metadata;
+ if (result instanceof Path) {
+ tmp.addResource(new Closeable() {
+ @Override
+ public void close() throws IOException {
+ Files.delete((Path)result);
+ }
+ });
+ } else if (result instanceof Closeable) {
+ tmp.addResource((Closeable) result);
+ }
}
- public Path getPath() {
- return path;
+ public InputStream getInputStream() throws IOException {
+ if (result instanceof Path) {
+ return TikaInputStream.get((Path)result, metadata);
+ } else {
+ TikaInputStream tis = TikaInputStream.get(new byte[0]);
+ tis.setOpenContainer(result);
+ return tis;
+ }
}
public Metadata getMetadata() {
@@ -58,5 +84,9 @@ public class RenderResult {
return id;
}
+ @Override
+ public void close() throws IOException {
+ tmp.close();
+ }
}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java
index 12d60d3da..108c06260 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java
@@ -18,7 +18,6 @@ package org.apache.tika.renderer;
import java.io.Closeable;
import java.io.IOException;
-import java.nio.file.Files;
import java.util.ArrayList;
import java.util.List;
@@ -33,12 +32,7 @@ public class RenderResults implements Closeable {
this.tmp = tmp;
}
public void add(RenderResult result) {
- tmp.addResource(new Closeable() {
- @Override
- public void close() throws IOException {
- Files.delete(result.getPath());
- }
- });
+ tmp.addResource(result);
results.add(result);
}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java
index 49c775e69..2e3143261 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java
@@ -20,6 +20,9 @@ package org.apache.tika.renderer;
* Use this in the ParseContext to keep track of unique ids for rendered
* images in embedded docs. This should be used for the full parse of
* a main document and its embedded document.
+ *
+ * This is different from RenderingState, which is used to track
+ * rendering per file/per embedded doc.
*/
public class RenderingTracker {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 6a46f1339..13ccd70b8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -494,13 +494,14 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
try (TemporaryResources tmp = new TemporaryResources()) {
- RenderResult renderResult = renderCurrentPage(pdPage, context, tmp);
- Metadata renderMetadata = renderResult.getMetadata();
- try (InputStream is = TikaInputStream.get(renderResult.getPath())) {
- renderMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
- ocrImageMediaType.toString());
- ocrParser.parse(is, new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
- renderMetadata, context);
+ try (RenderResult renderResult = renderCurrentPage(pdPage, context, tmp)) {
+ Metadata renderMetadata = renderResult.getMetadata();
+ try (InputStream is = renderResult.getInputStream()) {
+ renderMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
+ ocrImageMediaType.toString());
+ ocrParser.parse(is, new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
+ renderMetadata, context);
+ }
}
} catch (IOException e) {
handleCatchableIOE(e);
@@ -538,11 +539,12 @@ class AbstractPDF2XHTML extends PDFTextStripper {
PageRangeRequest pageRangeRequest =
new PageRangeRequest(getCurrentPageNo(), getCurrentPageNo());
if (thisRenderer instanceof PDDocumentRenderer) {
- try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
- tis.setOpenContainer(pdDocument);
- return thisRenderer.render(tis, pageMetadata, parseContext, pageRangeRequest)
+ //Do not use try-with-resources here. We need to leave the pdDocument open!
+ TikaInputStream tis = TikaInputStream.get(new byte[0]);
+ tis.setOpenContainer(pdDocument);
+ return thisRenderer.render(tis, pageMetadata, parseContext, pageRangeRequest)
.getResults().get(0);
- }
+
} else {
PDFRenderingState state = context.get(PDFRenderingState.class);
if (state == null) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index fb2de2bc2..9179fe956 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -17,6 +17,7 @@
package org.apache.tika.parser.pdf;
import java.io.IOException;
+import java.io.InputStream;
import java.io.Writer;
import java.util.HashMap;
import java.util.HashSet;
@@ -25,6 +26,7 @@ import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
+import org.apache.commons.io.IOExceptionWithCause;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSStream;
@@ -38,9 +40,17 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.image.ImageGraphicsEngine;
+import org.apache.tika.renderer.PageRangeRequest;
+import org.apache.tika.renderer.RenderRequest;
+import org.apache.tika.renderer.RenderResult;
+import org.apache.tika.renderer.RenderResults;
+import org.apache.tika.renderer.Renderer;
+import org.apache.tika.renderer.pdf.pdfbox.PDFRenderingState;
/**
* Utility class that overrides the {@link PDFTextStripper} functionality
@@ -137,6 +147,7 @@ class PDF2XHTML extends AbstractPDF2XHTML {
writeParagraphEnd();
try {
extractImages(page);
+ renderPage(page);
} catch (IOException e) {
handleCatchableIOE(e);
}
@@ -148,12 +159,44 @@ class PDF2XHTML extends AbstractPDF2XHTML {
}
}
+ private void renderPage(PDPage page) throws IOException {
+ if (config.getImageStrategy() != PDFParserConfig.IMAGE_STRATEGY.RENDER_PAGES_AT_PAGE_END) {
+ return;
+ }
+ PDFRenderingState state = context.get(PDFRenderingState.class);
+ //this is the document's inputstream/PDDocument
+ //TODO: figure out if we can send in the PDPage in the TikaInputStream
+ TikaInputStream tis = state.getTikaInputStream();
+ Renderer renderer = config.getRenderer();
+ RenderRequest request = new PageRangeRequest(getCurrentPageNo(), getCurrentPageNo());
+ Metadata renderedMetadata = new Metadata();
+ renderedMetadata.set(TikaCoreProperties.TYPE, PDFParser.MEDIA_TYPE.toString());
+ try (RenderResults results = renderer.render(tis, renderedMetadata, context, request)) {
+ for (RenderResult result : results.getResults()) {
+ if (result.getStatus() == RenderResult.STATUS.SUCCESS) {
+ if (embeddedDocumentExtractor.shouldParseEmbedded(result.getMetadata())) {
+
+ try (InputStream is = result.getInputStream()) {
+ //TODO: add markup here?
+ embeddedDocumentExtractor.parseEmbedded(is, xhtml,
+ result.getMetadata(), true);
+ }
+ }
+ }
+ }
+ } catch (SecurityException e) {
+ throw e;
+ } catch (Exception e) {
+ handleCatchableIOE(new IOExceptionWithCause(e));
+ }
+ }
+
void extractImages(PDPage page) throws SAXException, IOException {
if (config.isExtractInlineImages() == false &&
config.isExtractInlineImageMetadataOnly() == false) {
return;
}
-
+ //TODO: modernize to ImageStrategy != rawImages
ImageGraphicsEngine engine =
config.getImageGraphicsEngineFactory().newEngine(
page, getCurrentPageNo(), embeddedDocumentExtractor, config,
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 5ad30b37e..a4e293a58 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -114,7 +114,7 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
* @deprecated Supply a {@link PasswordProvider} on the {@link ParseContext} instead
*/
public static final String PASSWORD = "org.apache.tika.parser.pdf.password";
- protected static final MediaType MEDIA_TYPE = MediaType.application("pdf");
+ public static final MediaType MEDIA_TYPE = MediaType.application("pdf");
/**
* Serial version UID
*/
@@ -137,7 +137,7 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
if (localConfig.isSetKCMS()) {
System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
}
- initRenderer(localConfig);
+ initRenderer(localConfig, context);
PDDocument pdfDocument = null;
String password = "";
@@ -172,7 +172,7 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
AccessChecker checker = localConfig.getAccessChecker();
checker.check(metadata);
tstream.setOpenContainer(pdfDocument);
- handleRendering(pdfDocument, tstream, handler, metadata, context, localConfig);
+ renderPagesBeforeParse(tstream, handler, metadata, context, localConfig);
if (handler != null) {
if (shouldHandleXFAOnly(hasXFA, localConfig)) {
handleXFAOnly(pdfDocument, handler, metadata, context);
@@ -209,9 +209,11 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
}
private boolean shouldSpool(PDFParserConfig localConfig) {
- if (localConfig.getImageStrategy() == PDFParserConfig.IMAGE_STRATEGY.RENDERED_PAGES) {
+ if (localConfig.getImageStrategy() == PDFParserConfig.IMAGE_STRATEGY.RENDER_PAGES_BEFORE_PARSE
+ || localConfig.getImageStrategy() == PDFParserConfig.IMAGE_STRATEGY.RENDER_PAGES_AT_PAGE_END) {
return true;
}
+
if (localConfig.getOcrStrategy() == PDFParserConfig.OCR_STRATEGY.NO_OCR) {
return false;
}
@@ -219,11 +221,11 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
return true;
}
- private void handleRendering(PDDocument pdDocument, TikaInputStream tstream,
- ContentHandler xhtml, Metadata parentMetadata,
- ParseContext context,
- PDFParserConfig config) {
- if (config.getImageStrategy() != PDFParserConfig.IMAGE_STRATEGY.RENDERED_PAGES) {
+ private void renderPagesBeforeParse(TikaInputStream tstream,
+ ContentHandler xhtml, Metadata parentMetadata,
+ ParseContext context,
+ PDFParserConfig config) {
+ if (config.getImageStrategy() != PDFParserConfig.IMAGE_STRATEGY.RENDER_PAGES_BEFORE_PARSE) {
return;
}
RenderResults renderResults = null;
@@ -242,7 +244,7 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
for (RenderResult result : renderResults.getResults()) {
if (result.getStatus() == RenderResult.STATUS.SUCCESS) {
if (embeddedDocumentExtractor.shouldParseEmbedded(result.getMetadata())) {
- try (InputStream is = TikaInputStream.get(result.getPath())) {
+ try (InputStream is = result.getInputStream()) {
embeddedDocumentExtractor.parseEmbedded(is, xhtml, result.getMetadata(),
false);
} catch (SecurityException e) {
@@ -720,8 +722,10 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
//no-op
}
- private void initRenderer(PDFParserConfig config) {
- if (config.getRenderer() != null) {
+ private void initRenderer(PDFParserConfig config, ParseContext context) {
+
+ if (config.getRenderer() != null &&
+ config.getRenderer().getSupportedTypes(context).contains(MEDIA_TYPE)) {
return;
}
//set a default renderer if nothing was defined
@@ -742,6 +746,7 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
defaultConfig.setImageGraphicsEngineFactory(imageGraphicsEngineFactory);
}
+ @Field
public void setImageStrategy(String imageStrategy) {
defaultConfig.setImageStrategy(imageStrategy);
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index d17f7ed88..da07b2b12 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -987,15 +987,35 @@ public class PDFParserConfig implements Serializable {
}
public enum IMAGE_STRATEGY {
- NONE, RAW_IMAGES, RENDERED_PAGES;//TODO: add LOGICAL_IMAGES
+ NONE,
+ /**
+ * This is the more modern version of {@link PDFParserConfig#extractInlineImages}
+ */
+ RAW_IMAGES,
+ /**
+ * If you want the rendered images, and you don't care whether there's
+ * markup in the xhtml handler per page, then go with this option.
+ * For some rendering engines, it is faster to render the full document
+ * upfront than to parse a page, render a page, etc.
+ */
+ RENDER_PAGES_BEFORE_PARSE,
+ /**
+ * This renders each page, one at a time, at the end of the page.
+ * For some rendering engines, this may be slower, but it allows the writing
+ * of image metadata into the xhtml in the proper location
+ */
+ RENDER_PAGES_AT_PAGE_END;
+ //TODO: add LOGICAL_IMAGES
private static IMAGE_STRATEGY parse(String s) {
String lc = s.toLowerCase(Locale.US);
switch (lc) {
- case "rawImages" :
+ case "rawimages" :
return RAW_IMAGES;
- case "renderedPages":
- return RENDERED_PAGES;
+ case "renderpagesbeforeparse":
+ return RENDER_PAGES_BEFORE_PARSE;
+ case "renderpagesatpageend":
+ return RENDER_PAGES_AT_PAGE_END;
case "none":
return NONE;
default:
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java
index dcf00279b..d108ba54b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java
@@ -115,7 +115,7 @@ public class MuPDFRenderer implements Renderer {
renderMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.RENDERING.name());
results.add(new RenderResult(RenderResult.STATUS.SUCCESS, tracker.getNextId(),
- f.toPath(), renderMetadata));
+ TikaInputStream.get(f.toPath()), renderMetadata));
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
index 4313a1084..347c72e00 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
@@ -31,6 +31,8 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
@@ -47,6 +49,7 @@ import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.TikaPagedText;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.pdf.PDFParser;
import org.apache.tika.renderer.PageBasedRenderResults;
import org.apache.tika.renderer.PageRangeRequest;
import org.apache.tika.renderer.RenderRequest;
@@ -56,7 +59,9 @@ import org.apache.tika.renderer.RenderingTracker;
public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
- Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("pdf"));
+ Set<MediaType> SUPPORTED_TYPES = Collections.singleton(PDFParser.MEDIA_TYPE);
+
+ private static final Logger LOG = LoggerFactory.getLogger(PDFBoxRenderer.class);
/**
* This is the amount of time it takes for PDFBox to render the page
@@ -138,8 +143,7 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
try {
m.set(TikaPagedText.PAGE_NUMBER, i);
m.set(TikaPagedText.PAGE_ROTATION, (double)pdDocument.getPage(i - 1).getRotation());
- Path imagePath = renderPage(renderer, id, i, m);
- results.add(new RenderResult(RenderResult.STATUS.SUCCESS, id, imagePath, m));
+ results.add(renderPage(renderer, id, i, m));
} catch (IOException e) {
EmbeddedDocumentUtil.recordException(e, m);
results.add(new RenderResult(RenderResult.STATUS.EXCEPTION, id, null, m));
@@ -148,7 +152,8 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
}
- private Path renderPage(PDFRenderer renderer, int id, int pageNumber, Metadata metadata)
+ protected RenderResult renderPage(PDFRenderer renderer, int id, int pageNumber,
+ Metadata metadata)
throws IOException {
Path tmpFile = Files.createTempFile("tika-pdfbox-rendering-",
@@ -169,10 +174,15 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
} catch (SecurityException e) {
//throw SecurityExceptions immediately
throw e;
- } catch (IOException | RuntimeException e) {
+ } catch (Exception e) {
+ try {
+ Files.delete(tmpFile);
+ } catch (IOException ex) {
+ LOG.warn("couldn't delete " + tmpFile, ex);
+ }
throw new IOExceptionWithCause(e);
}
- return tmpFile;
+ return new RenderResult(RenderResult.STATUS.SUCCESS, id, tmpFile, metadata);
}
@Override
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
index e604a8179..08d18b6c1 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
@@ -46,6 +46,11 @@ import org.apache.tika.parser.Parser;
public class PDFRenderingTest extends TikaTest {
+ @Test
+ public void testDefault() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testPDF.pdf");
+ assertEquals(1, metadataList.size());
+ }
@Test
public void testBasic() throws Exception {
@@ -56,7 +61,6 @@ public class PDFRenderingTest extends TikaTest {
Map<Integer, byte[]> embedded =
((RenderCaptureExtractor)parseContext.get(EmbeddedDocumentExtractor.class))
.getEmbedded();
-
assertEquals(1, embedded.size());
assertTrue(embedded.containsKey(0));
//what else can we do to test this? File type == tiff? Run OCR?
@@ -99,9 +103,6 @@ public class PDFRenderingTest extends TikaTest {
private ParseContext configureParseContext() {
ParseContext parseContext = new ParseContext();
parseContext.set(EmbeddedDocumentExtractor.class, new RenderCaptureExtractor(parseContext));
- PDFParserConfig config = new PDFParserConfig();
- config.setImageStrategy(PDFParserConfig.IMAGE_STRATEGY.RENDERED_PAGES);
- parseContext.set(PDFParserConfig.class, config);
return parseContext;
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
index 80a9a4c73..92f351bb9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
@@ -17,9 +17,18 @@
-->
<properties>
<parsers>
- <parser class="org.apache.tika.parser.DefaultParser"/>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+ </parser>
+ <parser class="org.apache.tika.parser.pdf.PDFParser">
+ <params>
+ <param name="imageStrategy" type="string">renderPagesBeforeParse</param>
+ </params>
+ </parser>
</parsers>
+<!--
+ This will be supplied automatically if not specified.
<renderers>
<renderer class="org.apache.tika.renderer.pdf.pdfbox.PDFBoxRenderer"/>
- </renderers>
+ </renderers> -->
</properties>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-per-page-config.xml
similarity index 70%
copy from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
copy to tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-per-page-config.xml
index 80a9a4c73..e3f92df6c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-per-page-config.xml
@@ -17,9 +17,16 @@
-->
<properties>
<parsers>
- <parser class="org.apache.tika.parser.DefaultParser"/>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+ </parser>
+ <parser class="org.apache.tika.parser.pdf.PDFParser">
+ <params>
+ <param name="imageStrategy" type="string">renderPagesAtPageEnd</param>
+ </params>
+ </parser>
</parsers>
- <renderers>
+<!-- <renderers>
<renderer class="org.apache.tika.renderer.pdf.pdfbox.PDFBoxRenderer"/>
- </renderers>
+ </renderers> -->
</properties>