You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/04/29 21:01:10 UTC
[tika] branch TIKA-3571 updated: TIKA-3571 -- initial proof of concept with a unit test.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3571
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-3571 by this push:
new b669741f0 TIKA-3571 -- initial proof of concept with a unit test.
b669741f0 is described below
commit b669741f0b8b797543a2d419ce025cfb9d45a6e5
Author: tallison <ta...@apache.org>
AuthorDate: Fri Apr 29 17:00:55 2022 -0400
TIKA-3571 -- initial proof of concept with a unit test.
---
.../tika/extractor/EmbeddedDocumentUtil.java | 4 +-
.../java/org/apache/tika/metadata/Rendering.java | 2 +-
.../apache/tika/renderer/CompositeRenderer.java | 26 ++--
.../{RenderResult.java => PageRangeRequest.java} | 50 ++++----
.../Rendering.java => renderer/RenderRequest.java} | 14 +--
.../org/apache/tika/renderer/RenderResult.java | 12 +-
.../java/org/apache/tika/renderer/Renderer.java | 15 ++-
.../org/apache/tika/renderer/RenderingState.java | 10 ++
.../org/apache/tika/renderer/RenderingTracker.java | 15 +++
.../tika-parser-pdf-module/pom.xml | 7 ++
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 106 ++++++++++------
.../java/org/apache/tika/parser/pdf/OCR2XHTML.java | 15 +--
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 23 ++--
.../tika/parser/pdf/PDFMarkedContent2XHTML.java | 12 +-
.../java/org/apache/tika/parser/pdf/PDFParser.java | 140 +++++++++++++++++----
.../apache/tika/parser/pdf/PDFParserConfig.java | 87 +++++++------
.../tika/renderer/pdf/PDDocumentRenderer.java | 11 ++
.../apache/tika/renderer/pdf/PDFBoxRenderer.java | 99 +++++++++++----
.../tika/renderer/pdf/PDFRenderingState.java | 29 +++++
.../apache/tika/parser/pdf/PDFRenderingTest.java | 111 ++++++++++++++++
.../tika/parser/pdf/tika-rendering-config.xml | 35 ++++++
21 files changed, 630 insertions(+), 193 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
index 5854aba28..ccac4f1db 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
@@ -92,7 +92,9 @@ public class EmbeddedDocumentUtil implements Serializable {
context.set(Parser.class, new AutoDetectParser(tikaConfig));
}
}
- return new ParsingEmbeddedDocumentExtractor(context);
+ EmbeddedDocumentExtractor ex = new ParsingEmbeddedDocumentExtractor(context);
+ context.set(EmbeddedDocumentExtractor.class, ex);
+ return ex;
}
/**
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Rendering.java b/tika-core/src/main/java/org/apache/tika/metadata/Rendering.java
index 1ff521aa7..73788fef3 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Rendering.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Rendering.java
@@ -23,5 +23,5 @@ public interface Rendering {
Property PAGE_NUMBER = Property.externalInteger(RENDERING_PREFIX + "page_number");
Property RENDERED_BY = Property.externalTextBag(RENDERING_PREFIX + "Rendered-By");
- Property RENDERED_MS = Property.externalInteger(RENDERING_PREFIX + "rendering-time-ms");
+ Property RENDERED_MS = Property.externalReal(RENDERING_PREFIX + "rendering-time-ms");
}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java b/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java
index ff5a52061..b5fb2acbb 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java
@@ -39,14 +39,20 @@ import org.apache.tika.utils.ServiceLoaderUtils;
public class CompositeRenderer implements Renderer, Initializable {
private Map<MediaType, Renderer> rendererMap = new HashMap<>();
- private List<Renderer> renderers;
public CompositeRenderer(ServiceLoader serviceLoader) {
this(getDefaultRenderers(serviceLoader));
}
public CompositeRenderer(List<Renderer> renderers) {
- this.renderers = renderers;
+ Map<MediaType, Renderer> tmp = new ConcurrentHashMap<>();
+ ParseContext empty = new ParseContext();
+ for (Renderer renderer : renderers) {
+ for (MediaType mt : renderer.getSupportedTypes(empty)) {
+ tmp.put(mt, renderer);
+ }
+ }
+ rendererMap = Collections.unmodifiableMap(tmp);
}
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
@@ -54,8 +60,9 @@ public class CompositeRenderer implements Renderer, Initializable {
}
@Override
- public RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext)
- throws IOException, TikaException {
+ public RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext,
+ RenderRequest... requests) throws IOException, TikaException {
+
String mediaTypeString = metadata.get(TikaCoreProperties.TYPE);
if (mediaTypeString == null) {
throw new TikaException("need to specify file type in metadata");
@@ -68,19 +75,12 @@ public class CompositeRenderer implements Renderer, Initializable {
if (renderer == null) {
throw new TikaException("I regret I can't find a renderer for " + mt);
}
- return renderer.render(is, metadata, parseContext);
+ return renderer.render(is, metadata, parseContext, requests);
}
@Override
public void initialize(Map<String, Param> params) throws TikaConfigException {
- Map<MediaType, Renderer> tmp = new ConcurrentHashMap<>();
- ParseContext empty = new ParseContext();
- for (Renderer renderer : renderers) {
- for (MediaType mt : renderer.getSupportedTypes(empty)) {
- tmp.put(mt, renderer);
- }
- }
- rendererMap = Collections.unmodifiableMap(tmp);
+
}
@Override
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java b/tika-core/src/main/java/org/apache/tika/renderer/PageRangeRequest.java
similarity index 50%
copy from tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
copy to tika-core/src/main/java/org/apache/tika/renderer/PageRangeRequest.java
index 9ed61e342..2534d7032 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/PageRangeRequest.java
@@ -16,37 +16,45 @@
*/
package org.apache.tika.renderer;
-import java.nio.file.Path;
+import java.util.Objects;
-import org.apache.tika.metadata.Metadata;
+/**
+ * The range of pages to render. These are 1-based, and "to" is inclusive.
+ */
+public class PageRangeRequest implements RenderRequest {
-public class RenderResult {
+ public static PageRangeRequest RENDER_ALL = new PageRangeRequest(1, -1);
- public enum STATUS {
- SUCCESS,
- EXCEPTION,
- TIMEOUT
- }
- private final STATUS status;
+ private final int from;
+ private final int to;
- private final Path path;
- private final Metadata metadata;
+ public PageRangeRequest(int from, int to) {
+ this.from = from;
+ this.to = to;
+ }
- public RenderResult(STATUS status, Path path, Metadata metadata) {
- this.status = status;
- this.path = path;
- this.metadata = metadata;
+ public int getFrom() {
+ return from;
}
- public Path getPath() {
- return path;
+ public int getTo() {
+ return to;
}
- public Metadata getMetadata() {
- return metadata;
+ @Override
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (o == null || getClass() != o.getClass()) {
+ return false;
+ }
+ PageRangeRequest that = (PageRangeRequest) o;
+ return from == that.from && to == that.to;
}
- public STATUS getStatus() {
- return status;
+ @Override
+ public int hashCode() {
+ return Objects.hash(from, to);
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Rendering.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java
similarity index 67%
copy from tika-core/src/main/java/org/apache/tika/metadata/Rendering.java
copy to tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java
index 1ff521aa7..4e1f2f3cf 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Rendering.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java
@@ -13,15 +13,11 @@
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- *
*/
+package org.apache.tika.renderer;
-package org.apache.tika.metadata;
-
-public interface Rendering {
- String RENDERING_PREFIX = "rendering:";
-
- Property PAGE_NUMBER = Property.externalInteger(RENDERING_PREFIX + "page_number");
- Property RENDERED_BY = Property.externalTextBag(RENDERING_PREFIX + "Rendered-By");
- Property RENDERED_MS = Property.externalInteger(RENDERING_PREFIX + "rendering-time-ms");
+/**
+ * Empty interface for requests to a renderer.
+ */
+public interface RenderRequest {
}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
index 9ed61e342..888b0dd4c 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
@@ -29,11 +29,15 @@ public class RenderResult {
}
private final STATUS status;
+ private final int id;
private final Path path;
+ //TODO: we're relying on metadata to bring in a bunch of info.
+ //Might be cleaner to add specific parameters for page number, embedded path, etc.?
private final Metadata metadata;
- public RenderResult(STATUS status, Path path, Metadata metadata) {
+ public RenderResult(STATUS status, int id, Path path, Metadata metadata) {
this.status = status;
+ this.id = id;
this.path = path;
this.metadata = metadata;
}
@@ -49,4 +53,10 @@ public class RenderResult {
public STATUS getStatus() {
return status;
}
+
+ public int getId() {
+ return id;
+ }
+
+
}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java b/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java
index 5f7cb536b..bc4261f52 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java
@@ -34,6 +34,7 @@ import org.apache.tika.parser.ParseContext;
public interface Renderer extends Serializable {
+
/**
* Returns the set of media types supported by this renderer when used
* with the given parse context.
@@ -44,6 +45,18 @@ public interface Renderer extends Serializable {
*/
Set<MediaType> getSupportedTypes(ParseContext context);
- RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext) throws IOException,
+ RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext,
+ RenderRequest ... requests) throws IOException,
+ TikaException;
+
+ /*
+ At some point, we might need/want to add something like this, where for a given
+ page the requestor or the parser determines that they only want to render e.g. a
+ box within a page.
+
+ RenderResults render(InputStream is, int page, Coordinates coordinates, Metadata metadata,
+ ParseContext parseContext) throws IOException,
TikaException;
+
+ */
}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java
new file mode 100644
index 000000000..1b3baf44e
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java
@@ -0,0 +1,10 @@
+package org.apache.tika.renderer;
+
+/**
+ * This is intended to track state for each file (embedded or otherwise).
+ * This should be reset in the parseContext at the beginning of a parse
+ * and then replaced at the end of the parse.
+ */
+public class RenderingState {
+
+}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java
new file mode 100644
index 000000000..992b86f28
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java
@@ -0,0 +1,15 @@
+package org.apache.tika.renderer;
+
+/**
+ * Use this in the ParseContext to keep track of unique ids for rendered
+ * images in embedded docs. This should be used for the full parse of
+ * a main document and its embedded documents.
+ */
+public class RenderingTracker {
+
+ private int id = 0;
+
+ public synchronized int getNextId() {
+ return ++id;
+ }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml
index 0175563b4..5e73120fd 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/pom.xml
@@ -74,6 +74,13 @@
<artifactId>jaxb-runtime</artifactId>
<version>${jaxb.version}</version>
</dependency>
+ <!-- incompatible with Apache license, only use in testing -->
+ <dependency>
+ <groupId>com.github.jai-imageio</groupId>
+ <artifactId>jai-imageio-core</artifactId>
+ <version>1.3.1</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
<build>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 02449d53d..cb40569ff 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -105,11 +105,16 @@ import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Font;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PDF;
+import org.apache.tika.metadata.Rendering;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.apache.tika.renderer.PageRangeRequest;
+import org.apache.tika.renderer.RenderResult;
import org.apache.tika.renderer.RenderResults;
+import org.apache.tika.renderer.RenderingState;
+import org.apache.tika.renderer.pdf.PDFRenderingState;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -141,8 +146,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
final EmbeddedDocumentExtractor embeddedDocumentExtractor;
final PDFParserConfig config;
final Parser ocrParser;
-
- final RenderResults renderResults;
/**
* Format used for signature dates
* TODO Make this thread-safe
@@ -159,13 +162,12 @@ class AbstractPDF2XHTML extends PDFTextStripper {
int unmappedUnicodeCharsPerPage = 0;
int totalCharsPerPage = 0;
- AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler, ParseContext context,
- Metadata metadata, RenderResults renderResults, PDFParserConfig config) throws IOException {
+ AbstractPDF2XHTML(PDDocument pdDocument, XHTMLContentHandler xhtml, ParseContext context,
+ Metadata metadata, PDFParserConfig config) throws IOException {
this.pdDocument = pdDocument;
- this.xhtml = new XHTMLContentHandler(handler, metadata);
+ this.xhtml = xhtml;
this.context = context;
this.metadata = metadata;
- this.renderResults = renderResults;
this.config = config;
embeddedDocumentExtractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
if (config.getOcrStrategy() == NO_OCR) {
@@ -481,19 +483,19 @@ class AbstractPDF2XHTML extends PDFTextStripper {
"Please set the OCR_STRATEGY to NO_OCR or configure your" +
"OCR parser correctly");
} else if (ocrStrategy == AUTO) {
- //silently skip
+ //silently skip if there's no parser to run ocr
return;
}
}
try (TemporaryResources tmp = new TemporaryResources()) {
- Path tmpFile = renderPage(tmp);
-
- try (InputStream is = TikaInputStream.get(tmpFile)) {
- metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
+ RenderResult renderResult = renderCurrentPage(context, tmp);
+ Metadata renderMetadata = renderResult.getMetadata();
+ try (InputStream is = TikaInputStream.get(renderResult.getPath())) {
+ renderMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
ocrImageMediaType.toString());
ocrParser.parse(is, new EmbeddedContentHandler(new BodyContentHandler(xhtml)),
- metadata, context);
+ renderMetadata, context);
}
} catch (IOException e) {
handleCatchableIOE(e);
@@ -502,34 +504,68 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
- private Path renderPage(TemporaryResources tmpResources) {
+ private RenderResult renderCurrentPage(ParseContext parseContext,
+ TemporaryResources tmpResources)
+ throws IOException, TikaException {
+ PDFRenderingState renderingState = parseContext.get(PDFRenderingState.class);
+ if (renderingState == null) {
+ noContextRenderCurrentPage(parseContext, tmpResources);
+ }
+ //if the full document has already been rendered, then reuse that file
+ RenderResults results = renderingState.getRenderResults();
+ if (results != null) {
+ for (RenderResult result : results.getResults()) {
+ int pageNo = result.getMetadata().getInt(Rendering.PAGE_NUMBER);
+ if (getCurrentPageNo() == pageNo) {
+ return result;
+ }
+ }
+ }
+ //use the regular renderer if it isn't "no_text"
+ if (config.getOcrRenderingStrategy() != PDFParserConfig.OCR_RENDERING_STRATEGY.NO_TEXT) {
+ PageRangeRequest pageRangeRequest =
+ new PageRangeRequest(getCurrentPageNo(), getCurrentPageNo());
+ try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
+ tis.setOpenContainer(pdDocument);
+ return config.getRenderer().render(tis, metadata, parseContext, pageRangeRequest)
+ .getResults().get(0);
+ }
+ } else {
+ return noContextRenderCurrentPage(parseContext, tmpResources);
+ }
+ }
+
+
+ private RenderResult noContextRenderCurrentPage(ParseContext parseContext,
+ TemporaryResources tmpResources)
+ throws IOException, TikaException {
PDFRenderer renderer =
config.getOcrRenderingStrategy() == PDFParserConfig.OCR_RENDERING_STRATEGY.NO_TEXT ?
new NoTextPDFRenderer(pdDocument) : new PDFRenderer(pdDocument);
- try (TemporaryResources tmp = new TemporaryResources()) {
- int dpi = config.getOcrDPI();
- Path tmpFile = null;
- try {
- BufferedImage image =
- renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType());
- tmpFile = tmp.createTempFile();
- try (OutputStream os = Files.newOutputStream(tmpFile)) {
- //TODO: get output format from TesseractConfig
- ImageIOUtil.writeImage(image, config.getOcrImageFormatName(), os, dpi,
- config.getOcrImageQuality());
- }
- } catch (SecurityException e) {
- //throw SecurityExceptions immediately
- throw e;
- } catch (IOException | RuntimeException e) {
- //image rendering can throw a variety of runtime exceptions, not just
- // IOExceptions...
- //need to have a wide catch
- metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM,
- ExceptionUtils.getStackTrace(e));
- return;
+ int dpi = config.getOcrDPI();
+ Path tmpFile = null;
+ try {
+ BufferedImage image =
+ renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType());
+ tmpFile = tmpResources.createTempFile();
+ try (OutputStream os = Files.newOutputStream(tmpFile)) {
+ //TODO: get output format from TesseractConfig
+ ImageIOUtil.writeImage(image, config.getOcrImageFormatName(), os, dpi,
+ config.getOcrImageQuality());
}
+ } catch (SecurityException e) {
+ //throw SecurityExceptions immediately
+ throw e;
+ } catch (IOException | RuntimeException e) {
+ //image rendering can throw a variety of runtime exceptions, not just
+ // IOExceptions...
+ //need to have a wide catch
+ metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM,
+ ExceptionUtils.getStackTrace(e));
+ return null;
+ }
+ return null;
}
@Override
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
index 1930a1ae9..534eea324 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
@@ -30,6 +30,7 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.renderer.RenderResults;
+import org.apache.tika.sax.XHTMLContentHandler;
/**
@@ -38,9 +39,9 @@ import org.apache.tika.renderer.RenderResults;
*/
class OCR2XHTML extends AbstractPDF2XHTML {
- private OCR2XHTML(PDDocument document, ContentHandler handler, ParseContext context,
- Metadata metadata, RenderResults renderResults, PDFParserConfig config) throws IOException {
- super(document, handler, context, metadata, config);
+ private OCR2XHTML(PDDocument document, XHTMLContentHandler xhtml, ParseContext context,
+ Metadata metadata, PDFParserConfig config) throws IOException {
+ super(document, xhtml, context, metadata, config);
}
/**
@@ -48,18 +49,18 @@ class OCR2XHTML extends AbstractPDF2XHTML {
* of XHTML SAX events sent to the given content handler.
*
* @param document PDF document
- * @param handler SAX content handler
+ * @param xhtml SAX content handler
* @param metadata PDF metadata
* @throws SAXException if the content handler fails to process SAX events
* @throws TikaException if there was an exception outside of per page processing
*/
- public static void process(PDDocument document, ContentHandler handler, ParseContext context,
- Metadata metadata, RenderResults renderResults,
+ public static void process(PDDocument document, XHTMLContentHandler xhtml, ParseContext context,
+ Metadata metadata,
PDFParserConfig config)
throws SAXException, TikaException {
OCR2XHTML ocr2XHTML = null;
try {
- ocr2XHTML = new OCR2XHTML(document, handler, context, metadata, renderResults, config);
+ ocr2XHTML = new OCR2XHTML(document, xhtml, context, metadata, config);
ocr2XHTML.writeText(document, new Writer() {
@Override
public void write(char[] cbuf, int off, int len) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 2b4f1368a..e493ea3ff 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -41,6 +41,7 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.renderer.RenderResults;
+import org.apache.tika.sax.XHTMLContentHandler;
/**
* Utility class that overrides the {@link PDFTextStripper} functionality
@@ -64,9 +65,9 @@ class PDF2XHTML extends AbstractPDF2XHTML {
private Map<COSStream, Integer> processedInlineImages = new HashMap<>();
private AtomicInteger inlineImageCounter = new AtomicInteger(0);
- PDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
- RenderResults renderResults, PDFParserConfig config) throws IOException {
- super(document, handler, context, metadata, renderResults, config);
+ PDF2XHTML(PDDocument document, XHTMLContentHandler xhtml, ParseContext context, Metadata metadata,
+ PDFParserConfig config) throws IOException {
+ super(document, xhtml, context, metadata, config);
}
/**
@@ -74,14 +75,13 @@ class PDF2XHTML extends AbstractPDF2XHTML {
* of XHTML SAX events sent to the given content handler.
*
* @param document PDF document
- * @param handler SAX content handler
+ * @param xhtml SAX content handler
* @param metadata PDF metadata
* @throws SAXException if the content handler fails to process SAX events
* @throws TikaException if there was an exception outside of per page processing
*/
- public static void process(PDDocument document, ContentHandler handler, ParseContext context,
- Metadata metadata, RenderResults renderResults,
- PDFParserConfig config)
+ public static void process(PDDocument document, XHTMLContentHandler xhtml, ParseContext context,
+ Metadata metadata, PDFParserConfig config)
throws SAXException, TikaException {
PDF2XHTML pdf2XHTML = null;
try {
@@ -90,10 +90,9 @@ class PDF2XHTML extends AbstractPDF2XHTML {
// handler.
if (config.isDetectAngles()) {
pdf2XHTML =
- new AngleDetectingPDF2XHTML(document, handler, context, metadata, config);
+ new AngleDetectingPDF2XHTML(document, xhtml, context, metadata, config);
} else {
- pdf2XHTML = new PDF2XHTML(document, handler, context, metadata, renderResults,
- config);
+ pdf2XHTML = new PDF2XHTML(document, xhtml, context, metadata, config);
}
config.configure(pdf2XHTML);
@@ -228,10 +227,10 @@ class PDF2XHTML extends AbstractPDF2XHTML {
private static class AngleDetectingPDF2XHTML extends PDF2XHTML {
- private AngleDetectingPDF2XHTML(PDDocument document, ContentHandler handler,
+ private AngleDetectingPDF2XHTML(PDDocument document, XHTMLContentHandler xhtml,
ParseContext context, Metadata metadata,
PDFParserConfig config) throws IOException {
- super(document, handler, context, metadata, config);
+ super(document, xhtml, context, metadata, config);
}
@Override
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
index 5ddf581d9..5c5ec6c03 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
@@ -47,6 +47,7 @@ import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
/**
* <p>This was added in Tika 1.24 as an alpha version of a text extractor
@@ -88,10 +89,10 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
//this stores state as we recurse through the structure tag tree
private State state = new State();
- private PDFMarkedContent2XHTML(PDDocument document, ContentHandler handler,
+ private PDFMarkedContent2XHTML(PDDocument document, XHTMLContentHandler xhtml,
ParseContext context, Metadata metadata, PDFParserConfig config)
throws IOException {
- super(document, handler, context, metadata, config);
+ super(document, xhtml, context, metadata, config);
}
/**
@@ -99,19 +100,20 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
* of XHTML SAX events sent to the given content handler.
*
* @param pdDocument PDF document
- * @param handler SAX content handler
+ * @param xhtml SAX content handler
* @param metadata PDF metadata
* @throws SAXException if the content handler fails to process SAX events
* @throws TikaException if there was an exception outside of per page processing
*/
- public static void process(PDDocument pdDocument, ContentHandler handler, ParseContext context,
+ public static void process(PDDocument pdDocument, XHTMLContentHandler xhtml,
+ ParseContext context,
Metadata metadata, PDFParserConfig config)
throws SAXException, TikaException {
PDFMarkedContent2XHTML pdfMarkedContent2XHTML = null;
try {
pdfMarkedContent2XHTML =
- new PDFMarkedContent2XHTML(pdDocument, handler, context, metadata, config);
+ new PDFMarkedContent2XHTML(pdDocument, xhtml, context, metadata, config);
} catch (IOException e) {
throw new TikaException("couldn't initialize PDFMarkedContent2XHTML", e);
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 79b79f14f..a16381437 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -54,6 +54,7 @@ import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.AccessPermissions;
import org.apache.tika.metadata.Metadata;
@@ -65,8 +66,13 @@ import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.RenderingParser;
+import org.apache.tika.renderer.PageRangeRequest;
+import org.apache.tika.renderer.RenderResult;
import org.apache.tika.renderer.RenderResults;
import org.apache.tika.renderer.Renderer;
+import org.apache.tika.renderer.pdf.PDDocumentRenderer;
+import org.apache.tika.renderer.pdf.PDFBoxRenderer;
+import org.apache.tika.renderer.pdf.PDFRenderingState;
import org.apache.tika.sax.XHTMLContentHandler;
/**
@@ -131,12 +137,20 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
if (localConfig.isSetKCMS()) {
System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider");
}
-
+ initRenderer(localConfig);
PDDocument pdfDocument = null;
String password = "";
+ PDFRenderingState incomingRenderingState = context.get(PDFRenderingState.class);
try {
- TikaInputStream tstream = TikaInputStream.cast(stream);
+ TikaInputStream tstream;
+ if (shouldSpool(localConfig)) {
+ tstream = TikaInputStream.get(stream);
+ tstream.getPath();
+ context.set(PDFRenderingState.class, new PDFRenderingState(tstream));
+ } else {
+ tstream = TikaInputStream.cast(stream);
+ }
password = getPassword(metadata, context);
MemoryUsageSetting memoryUsageSetting = MemoryUsageSetting.setupMainMemoryOnly();
if (localConfig.getMaxMainMemoryBytes() >= 0) {
@@ -152,36 +166,26 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
pdfDocument = getPDDocument(new CloseShieldInputStream(stream), password,
memoryUsageSetting, metadata, context);
}
- tstream.setOpenContainer(pdfDocument);
- metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(pdfDocument.isEncrypted()));
-
- metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
+ boolean hasXFA = hasXFA(pdfDocument, metadata);
+ boolean hasMarkedContent = hasMarkedContent(pdfDocument, metadata);
extractMetadata(pdfDocument, metadata, context);
AccessChecker checker = localConfig.getAccessChecker();
checker.check(metadata);
- RenderResults renderResults = null;
- if (localConfig.getRenderer().getSupportedTypes(context).contains(MEDIA_TYPE)) {
- renderResults = renderPDF(tstream, context, localConfig);
- }
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ tstream.setOpenContainer(pdfDocument);
+ handleRendering(pdfDocument, tstream, xhtml, metadata, context, localConfig);
if (handler != null) {
- boolean hasXFA = hasXFA(pdfDocument);
- metadata.set(PDF.HAS_XFA, Boolean.toString(hasXFA));
- boolean hasMarkedContent = hasMarkedContent(pdfDocument);
- metadata.set(PDF.HAS_MARKED_CONTENT, Boolean.toString(hasMarkedContent));
- boolean hasCollection = hasCollection(pdfDocument);
- metadata.set(PDF.HAS_COLLECTION, Boolean.toString(hasCollection));
if (shouldHandleXFAOnly(hasXFA, localConfig)) {
- handleXFAOnly(pdfDocument, handler, metadata, context);
+ handleXFAOnly(pdfDocument, xhtml, metadata, context);
} else if (localConfig.getOcrStrategy()
.equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) {
- OCR2XHTML.process(pdfDocument, handler, context, metadata, renderResults,
- localConfig);
+ OCR2XHTML.process(pdfDocument, xhtml, context, metadata, localConfig);
} else if (hasMarkedContent && localConfig.isExtractMarkedContent()) {
PDFMarkedContent2XHTML
- .process(pdfDocument, handler, context, metadata, renderResults,
+ .process(pdfDocument, xhtml, context, metadata,
localConfig);
} else {
- PDF2XHTML.process(pdfDocument, handler, context, metadata, renderResults,
+ PDF2XHTML.process(pdfDocument, xhtml, context, metadata,
localConfig);
}
}
@@ -189,18 +193,69 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
metadata.set(PDF.IS_ENCRYPTED, "true");
throw new EncryptedDocumentException(e);
} finally {
+ //replace the one that was here
+ context.set(PDFRenderingState.class, incomingRenderingState);
if (pdfDocument != null) {
pdfDocument.close();
}
}
}
+    /**
+     * Decides whether the caller should take the spooling branch for the incoming stream.
+     * The answer is yes when pages are to be rendered, and otherwise whenever the OCR
+     * strategy is anything other than {@code NO_OCR}.
+     *
+     * @param localConfig the configuration in effect for this parse
+     * @return {@code true} if the stream should be spooled
+     */
+    private boolean shouldSpool(PDFParserConfig localConfig) {
+        boolean rendersPages =
+                localConfig.getImageStrategy() == PDFParserConfig.IMAGE_STRATEGY.RENDERED_PAGES;
+        //TODO: test that this is not AUTO with no OCR parser installed
+        return rendersPages
+                || localConfig.getOcrStrategy() != PDFParserConfig.OCR_STRATEGY.NO_OCR;
+    }
+
+    /**
+     * Renders the PDF and hands each successfully rendered page to the
+     * {@link EmbeddedDocumentExtractor} as an embedded document. This is a no-op unless
+     * the image strategy is {@code RENDERED_PAGES}.
+     * <p>
+     * A {@link SecurityException} is always rethrown. Any other exception, whether thrown
+     * while rendering or while parsing a rendered page, is recorded on
+     * {@code parentMetadata}; a rendering failure additionally ends this method early.
+     *
+     * @param pdDocument     the open PDF; not currently used by this method
+     * @param tstream        the stream handed to the renderer
+     * @param xhtml          handler passed to the embedded parse of each rendered page
+     * @param parentMetadata metadata of the containing PDF, where exceptions are recorded
+     * @param context        parse context; must already hold a {@link PDFRenderingState}
+     * @param config         the configuration in effect for this parse
+     */
+    private void handleRendering(PDDocument pdDocument, TikaInputStream tstream,
+                                 ContentHandler xhtml, Metadata parentMetadata,
+                                 ParseContext context,
+                                 PDFParserConfig config) {
+        if (config.getImageStrategy() != PDFParserConfig.IMAGE_STRATEGY.RENDERED_PAGES) {
+            return;
+        }
+        RenderResults renderResults;
+        try {
+            renderResults = renderPDF(tstream, context, config);
+        } catch (SecurityException e) {
+            throw e;
+        } catch (Exception e) {
+            EmbeddedDocumentUtil.recordException(e, parentMetadata);
+            return;
+        }
+        //keep the results on the rendering state that is stored in the parse context
+        context.get(PDFRenderingState.class).setRenderResults(renderResults);
+        EmbeddedDocumentExtractor embeddedDocumentExtractor =
+                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+
+        for (RenderResult result : renderResults.getResults()) {
+            if (result.getStatus() != RenderResult.STATUS.SUCCESS) {
+                continue;
+            }
+            if (!embeddedDocumentExtractor.shouldParseEmbedded(result.getMetadata())) {
+                continue;
+            }
+            try (InputStream is = TikaInputStream.get(result.getPath())) {
+                embeddedDocumentExtractor.parseEmbedded(is, xhtml, result.getMetadata(),
+                        false);
+            } catch (SecurityException e) {
+                throw e;
+            } catch (Exception e) {
+                //one bad page should not stop the remaining pages from being handled
+                EmbeddedDocumentUtil.recordException(e, parentMetadata);
+            }
+        }
+    }
+
private RenderResults renderPDF(TikaInputStream tstream,
ParseContext parseContext, PDFParserConfig localConfig)
throws IOException, TikaException {
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.TYPE, MEDIA_TYPE.toString());
- return localConfig.getRenderer().render(tstream, metadata, parseContext);
+ return localConfig.getRenderer().render(
+ tstream, metadata, parseContext, PageRangeRequest.RENDER_ALL);
}
@@ -216,7 +271,14 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
return PDDocument.load(path.toFile(), password, memoryUsageSetting);
}
+    /**
+     * Determines whether the document has marked content and records the answer in the
+     * metadata under {@code PDF.HAS_MARKED_CONTENT}.
+     *
+     * @param pdDocument the document to inspect
+     * @param metadata   the metadata to update
+     * @return the value that was written to the metadata
+     */
+    private boolean hasMarkedContent(PDDocument pdDocument, Metadata metadata) {
+        boolean marked = hasMarkedContent(pdDocument);
+        metadata.set(PDF.HAS_MARKED_CONTENT, marked);
+        return marked;
+    }
+
private boolean hasMarkedContent(PDDocument pdDocument) {
+ boolean hasMarkedContent;
PDStructureTreeRoot root = pdDocument.getDocumentCatalog().getStructureTreeRoot();
if (root == null) {
return false;
@@ -238,6 +300,12 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
return false;
}
+    /**
+     * Determines whether the document has a collection and records the answer in the
+     * metadata under {@code PDF.HAS_COLLECTION}.
+     *
+     * @param pdDocument the document to inspect
+     * @param metadata   the metadata to update
+     * @return the value that was written to the metadata
+     */
+    private boolean hasCollection(PDDocument pdDocument, Metadata metadata) {
+        boolean collectionPresent = hasCollection(pdDocument);
+        metadata.set(PDF.HAS_COLLECTION, collectionPresent);
+        return collectionPresent;
+    }
+
private boolean hasCollection(PDDocument pdfDocument) {
COSDictionary cosDict = pdfDocument.getDocumentCatalog().getCOSObject();
if (cosDict.containsKey(COSName.COLLECTION)) {
@@ -270,6 +338,7 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
private void extractMetadata(PDDocument document, Metadata metadata, ParseContext context)
throws TikaException {
+ metadata.set(Metadata.CONTENT_TYPE, MEDIA_TYPE.toString());
//first extract AccessPermissions
AccessPermission ap = document.getCurrentAccessPermission();
@@ -284,6 +353,8 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
Boolean.toString(ap.canModifyAnnotations()));
metadata.set(AccessPermissions.CAN_PRINT, Boolean.toString(ap.canPrint()));
metadata.set(AccessPermissions.CAN_PRINT_DEGRADED, Boolean.toString(ap.canPrintDegraded()));
+ hasCollection(document, metadata);
+ metadata.set(PDF.IS_ENCRYPTED, Boolean.toString(document.isEncrypted()));
if (document.getDocumentCatalog().getLanguage() != null) {
metadata.set(TikaCoreProperties.LANGUAGE, document.getDocumentCatalog().getLanguage());
@@ -390,21 +461,22 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
}
- private boolean hasXFA(PDDocument pdDocument) {
- return pdDocument.getDocumentCatalog() != null &&
+ private boolean hasXFA(PDDocument pdDocument, Metadata metadata) {
+ boolean hasXFA = pdDocument.getDocumentCatalog() != null &&
pdDocument.getDocumentCatalog().getAcroForm(null) != null &&
pdDocument.getDocumentCatalog().getAcroForm(null).hasXFA();
+ metadata.set(PDF.HAS_XFA, Boolean.toString(hasXFA));
+ return hasXFA;
}
private boolean shouldHandleXFAOnly(boolean hasXFA, PDFParserConfig config) {
return config.isIfXFAExtractOnlyXFA() && hasXFA;
}
- private void handleXFAOnly(PDDocument pdDocument, ContentHandler handler, Metadata metadata,
+ private void handleXFAOnly(PDDocument pdDocument, XHTMLContentHandler xhtml, Metadata metadata,
ParseContext context)
throws SAXException, IOException, TikaException {
XFAExtractor ex = new XFAExtractor();
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
try (InputStream is = new ByteArrayInputStream(
pdDocument.getDocumentCatalog().getAcroForm(null).getXFA().getBytes())) {
@@ -641,11 +713,27 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
//no-op
}
+    /**
+     * Installs a {@link PDFBoxRenderer} on the given config when it has no renderer yet;
+     * a config that already has a renderer is left untouched.
+     * <p>
+     * NOTE(review): the DPI, image type and image format name are read from
+     * {@code defaultConfig}, not from the {@code config} argument -- confirm this is
+     * intended when the two differ.
+     *
+     * @param config the configuration to complete; modified in place
+     */
+    private void initRenderer(PDFParserConfig config) {
+        if (config.getRenderer() == null) {
+            //set a default renderer if nothing was defined
+            PDFBoxRenderer fallbackRenderer = new PDFBoxRenderer();
+            fallbackRenderer.setDPI(defaultConfig.getOcrDPI());
+            fallbackRenderer.setImageType(defaultConfig.getOcrImageType());
+            fallbackRenderer.setImageFormatName(defaultConfig.getOcrImageFormatName());
+            config.setRenderer(fallbackRenderer);
+        }
+    }
+
@Override
public void setRenderer(Renderer renderer) {
defaultConfig.setRenderer(renderer);
}
+/**
+ * Sets the image strategy on this parser's default configuration.
+ *
+ * @param imageStrategy name of the strategy; forwarded unchanged to the default config
+ */
+public void setImageStrategy(String imageStrategy) {
+defaultConfig.setImageStrategy(imageStrategy);
+}
+
/**
* Copied from AcroformDefaultFixup minus generation of appearances and handling of orphan
* widgets, which we don't need.
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index 3b80ac061..72cc0e050 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -113,6 +113,10 @@ public class PDFParserConfig implements Serializable {
private String ocrImageFormatName = "png";
private float ocrImageQuality = 1.0f;
+ /**
+ * Should the entire document be rendered?
+ */
+ private IMAGE_STRATEGY imageStrategy = IMAGE_STRATEGY.NONE;
private AccessChecker accessChecker = new AccessChecker();
//The PDFParser can throw IOExceptions if there is a problem
@@ -868,48 +872,21 @@ public class PDFParserConfig implements Serializable {
return renderer;
}
- @Override
- public int hashCode() {
- int result = (isEnableAutoSpace() ? 1 : 0);
- result = 31 * result + (isSuppressDuplicateOverlappingText() ? 1 : 0);
- result = 31 * result + (isExtractAnnotationText() ? 1 : 0);
- result = 31 * result + (isSortByPosition() ? 1 : 0);
- result = 31 * result + (isExtractAcroFormContent() ? 1 : 0);
- result = 31 * result + (isExtractBookmarksText() ? 1 : 0);
- result = 31 * result + (isExtractInlineImages() ? 1 : 0);
- result = 31 * result + (isExtractUniqueInlineImagesOnly() ? 1 : 0);
- result = 31 * result + getAverageCharTolerance().hashCode();
- result = 31 * result + getSpacingTolerance().hashCode();
- result = 31 * result + getDropThreshold().hashCode();
- result = 31 * result + (isIfXFAExtractOnlyXFA() ? 1 : 0);
- result = 31 * result + ocrStrategy.hashCode();
- result = 31 * result + getOcrDPI();
- result = 31 * result + getOcrImageType().hashCode();
- result = 31 * result + getOcrImageFormatName().hashCode();
- result = 31 * result + getAccessChecker().hashCode();
- result = 31 * result + (isCatchIntermediateIOExceptions() ? 1 : 0);
- result = 31 * result + (isExtractActions() ? 1 : 0);
- result = 31 * result + Long.valueOf(getMaxMainMemoryBytes()).hashCode();
- return result;
+ public void setImageStrategy(String imageStrategy) {
+ setImageStrategy(PDFParserConfig.IMAGE_STRATEGY.parse(imageStrategy));
}
- @Override
- public String toString() {
- return "PDFParserConfig{" + "enableAutoSpace=" + enableAutoSpace +
- ", suppressDuplicateOverlappingText=" + suppressDuplicateOverlappingText +
- ", extractAnnotationText=" + extractAnnotationText + ", sortByPosition=" +
- sortByPosition + ", extractAcroFormContent=" + extractAcroFormContent +
- ", extractBookmarksText=" + extractBookmarksText + ", extractInlineImages=" +
- extractInlineImages + ", extractUniqueInlineImagesOnly=" +
- extractUniqueInlineImagesOnly + ", averageCharTolerance=" + averageCharTolerance +
- ", spacingTolerance=" + spacingTolerance + ", dropThreshold=" + dropThreshold +
- ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA + ", ocrStrategy=" + ocrStrategy +
- ", ocrDPI=" + ocrDPI + ", ocrImageType=" + ocrImageType + ", ocrImageFormatName='" +
- ocrImageFormatName + '\'' + ", accessChecker=" + accessChecker +
- ", extractActions=" + extractActions + ", catchIntermediateIOExceptions=" +
- catchIntermediateIOExceptions + ", maxMainMemoryBytes=" + maxMainMemoryBytes + '}';
+ public void setImageStrategy(IMAGE_STRATEGY imageStrategy) {
+ this.imageStrategy = imageStrategy;
+ userConfigured.add("imageStrategy");
+ }
+
+ public IMAGE_STRATEGY getImageStrategy() {
+ return imageStrategy;
}
+
+
public enum OCR_STRATEGY {
AUTO, NO_OCR, OCR_ONLY, OCR_AND_TEXT_EXTRACTION;
@@ -972,8 +949,8 @@ public class PDFParserConfig implements Serializable {
public enum OCR_RENDERING_STRATEGY {
NO_TEXT, ALL; //AUTO?
- // Would TEXT_ONLY be useful in instances where the unicode mappings
- // are corrupt/non-existent?
+// TODO: would TEXT_ONLY be useful in instances where the unicode mappings are
+// corrupt/non-existent?
private static OCR_RENDERING_STRATEGY parse(String s) {
if (s == null) {
@@ -997,4 +974,34 @@ public class PDFParserConfig implements Serializable {
throw new IllegalArgumentException(sb.toString());
}
}
+
+    /**
+     * The image strategies that can be selected on this config.
+     */
+    public enum IMAGE_STRATEGY {
+        NONE, RAW_IMAGES, RENDERED_PAGES;//TODO: add LOGICAL_IMAGES
+
+        /**
+         * Parses a strategy name, ignoring case. Both the camel-case spellings
+         * ("none", "rawImages", "renderedPages") and the enum constant names listed in
+         * the error message ("RAW_IMAGES", "RENDERED_PAGES") are accepted.
+         *
+         * @param s the name to parse
+         * @return the matching strategy
+         * @throws IllegalArgumentException if {@code s} is null or not a known strategy
+         */
+        private static IMAGE_STRATEGY parse(String s) {
+            //the input is lower-cased, so every case label below must be lower case too
+            String lc = s == null ? "" : s.toLowerCase(Locale.US);
+            switch (lc) {
+                case "rawimages":
+                case "raw_images":
+                    return RAW_IMAGES;
+                case "renderedpages":
+                case "rendered_pages":
+                    return RENDERED_PAGES;
+                case "none":
+                    return NONE;
+                default:
+                    //fall through to exception
+                    break;
+            }
+            StringBuilder sb = new StringBuilder();
+            sb.append("I regret that I don't recognize '").append(s);
+            sb.append("' as an IMAGE_STRATEGY. I only recognize:");
+            int i = 0;
+            for (IMAGE_STRATEGY strategy : IMAGE_STRATEGY.values()) {
+                if (i++ > 0) {
+                    sb.append(", ");
+                }
+                sb.append(strategy.toString());
+            }
+            throw new IllegalArgumentException(sb.toString());
+        }
+    }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDDocumentRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDDocumentRenderer.java
new file mode 100644
index 000000000..2c19d57c2
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDDocumentRenderer.java
@@ -0,0 +1,11 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer.pdf;
+
+import org.apache.tika.renderer.Renderer;
+
+/**
+ * Stub (marker) interface that declares no methods of its own. It lets the PDFParser
+ * figure out whether it needs to pass on the PDDocument or create a temp file to be
+ * used by a file-based renderer down the road.
+ */
+public interface PDDocumentRenderer extends Renderer {
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFBoxRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFBoxRenderer.java
index 4f7a1d149..31e5a9047 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFBoxRenderer.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFBoxRenderer.java
@@ -23,6 +23,7 @@ import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Collections;
+import java.util.Map;
import java.util.Set;
import org.apache.commons.io.IOExceptionWithCause;
@@ -31,6 +32,10 @@ import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TemporaryResources;
@@ -41,11 +46,14 @@ import org.apache.tika.metadata.Rendering;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.renderer.PageRangeRequest;
+import org.apache.tika.renderer.RenderRequest;
import org.apache.tika.renderer.RenderResult;
import org.apache.tika.renderer.RenderResults;
import org.apache.tika.renderer.Renderer;
+import org.apache.tika.renderer.RenderingTracker;
-public class PDFBoxRenderer implements Renderer {
+public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("pdf"));
@@ -72,9 +80,10 @@ public class PDFBoxRenderer implements Renderer {
private ImageType imageType = ImageType.GRAY;
private String imageFormatName = "tiff";
+
@Override
- public RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext) throws IOException,
- TikaException {
+ public RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext,
+ RenderRequest... requests) throws IOException, TikaException {
PDDocument pdDocument;
@@ -88,21 +97,8 @@ public class PDFBoxRenderer implements Renderer {
}
RenderResults results = new RenderResults(new TemporaryResources());
try {
-
- PDFRenderer renderer = new PDFRenderer(pdDocument);
-
- for (int i = 0; i < pdDocument.getNumberOfPages(); i++) {
- Metadata m = new Metadata();
- m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
- TikaCoreProperties.EmbeddedResourceType.RENDERING.name());
- try {
- m.set(Rendering.PAGE_NUMBER, i + 1);
- Path imagePath = renderPage(renderer, i, m);
- results.add(new RenderResult(RenderResult.STATUS.SUCCESS, imagePath, m));
- } catch (IOException e) {
- EmbeddedDocumentUtil.recordException(e, m);
- results.add(new RenderResult(RenderResult.STATUS.EXCEPTION, null, m));
- }
+ for (RenderRequest renderRequest : requests) {
+ processRequest(renderRequest, pdDocument, metadata, parseContext, results);
}
} finally {
if (mustClose) {
@@ -112,14 +108,52 @@ public class PDFBoxRenderer implements Renderer {
return results;
}
- private Path renderPage(PDFRenderer renderer, int pageIndex, Metadata metadata)
+    /**
+     * Dispatches a single render request. {@code PageRangeRequest.RENDER_ALL} renders
+     * pages 1 through the last page of the document; any other
+     * {@link PageRangeRequest} renders its own from/to range. Requests of any other
+     * type are ignored.
+     *
+     * @param renderRequest the request to process
+     * @param pdDocument    the document to render
+     * @param metadata      passed through to the range renderer
+     * @param parseContext  passed through to the range renderer
+     * @param results       collector that receives one result per rendered page
+     */
+    private void processRequest(RenderRequest renderRequest, PDDocument pdDocument,
+                                Metadata metadata, ParseContext parseContext,
+                                RenderResults results) {
+        boolean renderAll = renderRequest == PageRangeRequest.RENDER_ALL
+                || renderRequest.equals(PageRangeRequest.RENDER_ALL);
+        if (renderAll) {
+            renderRange(pdDocument, 1, pdDocument.getNumberOfPages(),
+                    metadata, parseContext, results);
+            return;
+        }
+        if (!(renderRequest instanceof PageRangeRequest)) {
+            //no other request type is handled by this renderer
+            return;
+        }
+        PageRangeRequest range = (PageRangeRequest) renderRequest;
+        int firstPage = range.getFrom();
+        int lastPageInclusive = range.getTo();
+        renderRange(pdDocument, firstPage, lastPageInclusive, metadata, parseContext, results);
+    }
+
+/**
+ * Renders the pages from start to endInclusive and adds one RenderResult per page to
+ * results. Page numbers are 1-based: the page number itself is stored under
+ * Rendering.PAGE_NUMBER and renderPage converts it to a 0-based index.
+ * <p>
+ * An IOException on a page is recorded on that page's own metadata and yields a result
+ * with status EXCEPTION and no path; the remaining pages are still attempted.
+ *
+ * @param pdDocument the document to render
+ * @param start first page to render, 1-based
+ * @param endInclusive last page to render, 1-based and inclusive
+ * @param metadata not used by this method
+ * @param parseContext source of the RenderingTracker; one is created and stored if absent
+ * @param results collector for the per-page results
+ */
+private void renderRange(PDDocument pdDocument, int start, int endInclusive, Metadata metadata,
+ParseContext parseContext, RenderResults results) {
+PDFRenderer renderer = new PDFRenderer(pdDocument);
+// reuse the tracker from the parse context if there is one; otherwise create one and
+// store it there (presumably so that ids stay unique across render calls -- confirm)
+RenderingTracker tracker = parseContext.get(RenderingTracker.class);
+if (tracker == null) {
+tracker = new RenderingTracker();
+parseContext.set(RenderingTracker.class, tracker);
+}
+for (int i = start; i <= endInclusive; i++) {
+// the id is taken before rendering, so a failed page still consumes an id
+int id = tracker.getNextId();
+Metadata m = new Metadata();
+m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
+TikaCoreProperties.EmbeddedResourceType.RENDERING.name());
+try {
+m.set(Rendering.PAGE_NUMBER, i);
+Path imagePath = renderPage(renderer, id, i, m);
+results.add(new RenderResult(RenderResult.STATUS.SUCCESS, id, imagePath, m));
+} catch (IOException e) {
+// record the failure on the page's metadata and keep going with the next page
+EmbeddedDocumentUtil.recordException(e, m);
+results.add(new RenderResult(RenderResult.STATUS.EXCEPTION, id, null, m));
+}
+}
+}
+
+
+ private Path renderPage(PDFRenderer renderer, int id, int pageNumber, Metadata metadata)
throws IOException {
Path tmpFile = Files.createTempFile("tika-pdfbox-rendering-",
- "-" + (pageIndex + 1) + "." + imageFormatName);
+ "-" + id + "-" + pageNumber + "." + imageFormatName);
try {
long start = System.currentTimeMillis();
- BufferedImage image = renderer.renderImageWithDPI(pageIndex, dpi, imageType);
+ BufferedImage image = renderer.renderImageWithDPI(pageNumber - 1, dpi, imageType);
long renderingElapsed = System.currentTimeMillis() - start;
metadata.set(PDFBOX_RENDERING_TIME_MS, renderingElapsed);
start = System.currentTimeMillis();
@@ -138,4 +172,27 @@ public class PDFBoxRenderer implements Renderer {
return tmpFile;
}
+/**
+ * Currently a no-op. The comment in the body marks where validation of the image
+ * format name is expected to be added.
+ *
+ * @param params ignored for now
+ */
+@Override
+public void initialize(Map<String, Param> params) throws TikaConfigException {
+//check file format names
+}
+
+/**
+ * Currently a no-op; no initialization problems are reported.
+ *
+ * @param problemHandler ignored for now
+ */
+@Override
+public void checkInitialization(InitializableProblemHandler problemHandler)
+throws TikaConfigException {
+
+}
+
+/**
+ * Sets the resolution that is passed to the PDFBox renderer for each page.
+ *
+ * @param dpi the resolution in dots per inch
+ */
+public void setDPI(int dpi) {
+this.dpi = dpi;
+}
+
+
+/**
+ * Sets the image type that is passed to the PDFBox renderer for each page.
+ *
+ * @param imageType the PDFBox image type
+ */
+public void setImageType(ImageType imageType) {
+this.imageType = imageType;
+}
+
+/**
+ * Sets the image format name. It is used as the file extension of the temp file that
+ * holds each rendered page.
+ *
+ * @param imageFormatName the format name, e.g. "tiff"
+ */
+public void setImageFormatName(String imageFormatName) {
+this.imageFormatName = imageFormatName;
+}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFRenderingState.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFRenderingState.java
new file mode 100644
index 000000000..2de00115b
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFRenderingState.java
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.renderer.pdf;
+
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.renderer.RenderResults;
+import org.apache.tika.renderer.RenderingState;
+
+/**
+ * Rendering state for a single PDF: the stream the PDF is read from and, once they
+ * have been set, the results of rendering it.
+ */
+public class PDFRenderingState extends RenderingState {
+
+    //assigned once in the constructor and never replaced
+    private final TikaInputStream tis;
+
+    //null until setRenderResults is called
+    private RenderResults renderResults;
+
+    /**
+     * @param tis the stream of the PDF this state belongs to
+     */
+    public PDFRenderingState(TikaInputStream tis) {
+        this.tis = tis;
+    }
+
+    /**
+     * @return the stream that was passed to the constructor
+     */
+    public TikaInputStream getTikaInputStream() {
+        return tis;
+    }
+
+    /**
+     * @param renderResults the results to keep on this state
+     */
+    public void setRenderResults(RenderResults renderResults) {
+        this.renderResults = renderResults;
+    }
+
+    /**
+     * @return the results last passed to {@link #setRenderResults(RenderResults)}, or
+     *         {@code null} if none have been set
+     */
+    public RenderResults getRenderResults() {
+        return renderResults;
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
new file mode 100644
index 000000000..326d625e7
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.pdf;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.OpenOption;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardOpenOption;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.junit.jupiter.api.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.extractor.EmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Rendering;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+
+/**
+ * Checks that a PDF parsed with the RENDERED_PAGES image strategy yields its rendered
+ * page as an embedded document.
+ */
+public class PDFRenderingTest extends TikaTest {
+
+    @Test
+    public void testBasic() throws Exception {
+        ParseContext parseContext = configureParseContext();
+        TikaConfig config = getConfig("tika-rendering-config.xml");
+        Parser p = new AutoDetectParser(config);
+        List<Metadata> metadataList = getRecursiveMetadata("testPDF.pdf", p, parseContext);
+        Map<Integer, byte[]> embedded =
+                ((RenderCaptureExtractor) parseContext.get(EmbeddedDocumentExtractor.class))
+                        .getEmbedded();
+
+        //exactly one rendered page is captured, under the first counter value
+        assertEquals(1, embedded.size());
+        assertTrue(embedded.containsKey(0));
+
+        //the container document plus the rendered page
+        assertEquals(2, metadataList.size());
+        Metadata tiffMetadata = metadataList.get(1);
+        assertEquals("RENDERING", tiffMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+        assertEquals(1, tiffMetadata.getInt(Rendering.PAGE_NUMBER));
+    }
+
+    private TikaConfig getConfig(String path) throws TikaException, IOException, SAXException {
+        try (InputStream is = PDFRenderingTest.class.getResourceAsStream(path)) {
+            return new TikaConfig(is);
+        }
+    }
+
+    private ParseContext configureParseContext() {
+        ParseContext parseContext = new ParseContext();
+        parseContext.set(EmbeddedDocumentExtractor.class, new RenderCaptureExtractor(parseContext));
+        PDFParserConfig config = new PDFParserConfig();
+        config.setImageStrategy(PDFParserConfig.IMAGE_STRATEGY.RENDERED_PAGES);
+        parseContext.set(PDFParserConfig.class, config);
+        return parseContext;
+    }
+
+    /**
+     * Extractor that keeps a copy of the bytes of every embedded document it is handed,
+     * keyed by arrival order, before delegating to the regular embedded parse.
+     * Declared static because it never touches the enclosing test instance.
+     */
+    private static class RenderCaptureExtractor extends ParsingEmbeddedDocumentExtractor {
+        private int count = 0;
+        private final Map<Integer, byte[]> embedded = new HashMap<>();
+
+        public RenderCaptureExtractor(ParseContext context) {
+            super(context);
+        }
+
+        @Override
+        public void parseEmbedded(InputStream stream, ContentHandler handler, Metadata metadata,
+                                  boolean outputHtml) throws SAXException, IOException {
+            TikaInputStream tstream = TikaInputStream.get(stream);
+            byte[] bytes = Files.readAllBytes(tstream.getPath());
+            embedded.put(count++, bytes);
+            //parse from a fresh stream over the same file
+            try (InputStream is = Files.newInputStream(tstream.getPath())) {
+                super.parseEmbedded(is, handler, metadata, outputHtml);
+            }
+        }
+
+        public Map<Integer, byte[]> getEmbedded() {
+            return embedded;
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
new file mode 100644
index 000000000..13f946781
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ <parser class="org.apache.tika.parser.ocr.TesseractOCRParser">
+ <params>
+ <param name="maxFileSizeToOcr" type="long">100</param>
+ </params>
+ </parser>
+ <parser class="org.apache.tika.parser.pdf.PDFParser">
+ <params>
+ <param name="extractInlineImages" type="bool">false</param>
+ </params>
+ </parser>
+ </parsers>
+ <renderers>
+ <renderer class="org.apache.tika.renderer.pdf.PDFBoxRenderer"/>
+ </renderers>
+</properties>