You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/09 17:15:28 UTC
[tika] branch main updated: TIKA-3571 -- cleanup
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 678898a9d TIKA-3571 -- cleanup
678898a9d is described below
commit 678898a9d2a0f56b7639c245f0cd9b1842ccddd2
Author: tallison <ta...@apache.org>
AuthorDate: Mon May 9 13:15:07 2022 -0400
TIKA-3571 -- cleanup
---
.../java/org/apache/tika/metadata/PagedText.java | 2 +
.../java/org/apache/tika/metadata/Rendering.java | 1 -
.../{PagedText.java => TikaPagedText.java} | 21 ++++----
.../tika/renderer/PageBasedRenderResults.java | 4 +-
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 56 ++++++++++++---------
.../tika/parser/pdf/ImageGraphicsEngine.java | 29 ++++++++---
.../java/org/apache/tika/parser/pdf/OCR2XHTML.java | 15 +++---
.../java/org/apache/tika/parser/pdf/PDF2XHTML.java | 18 +++----
.../tika/parser/pdf/PDFMarkedContent2XHTML.java | 12 ++---
.../java/org/apache/tika/parser/pdf/PDFParser.java | 19 +++----
.../renderer/pdf/{ => mutool}/MuPDFRenderer.java | 6 +--
.../pdf/pdfbox}/NoTextPDFRenderer.java | 2 +-
.../pdf/{ => pdfbox}/PDDocumentRenderer.java | 2 +-
.../renderer/pdf/{ => pdfbox}/PDFBoxRenderer.java | 7 ++-
.../pdf/{ => pdfbox}/PDFRenderingState.java | 2 +-
.../pdf/pdfbox}/TextOnlyPDFRenderer.java | 2 +-
.../pdf/pdfbox}/VectorGraphicsOnlyPDFRenderer.java | 2 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 9 ++++
.../apache/tika/parser/pdf/PDFRenderingTest.java | 26 +++++++++-
.../tika/parser/pdf/tika-rendering-config.xml | 2 +-
.../resources/test-documents/testPDF_rotated.pdf | Bin 0 -> 38309 bytes
21 files changed, 148 insertions(+), 89 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java b/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java
index bafd972c9..4ba79090e 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java
@@ -33,4 +33,6 @@ public interface PagedText {
*/
Property N_PAGES = Property.internalInteger("xmpTPg:NPages");
+ //TODO MaxPageSize, Fonts, Colorants, PlateNames
+
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Rendering.java b/tika-core/src/main/java/org/apache/tika/metadata/Rendering.java
index 73788fef3..31732c97a 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Rendering.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Rendering.java
@@ -21,7 +21,6 @@ package org.apache.tika.metadata;
public interface Rendering {
String RENDERING_PREFIX = "rendering:";
- Property PAGE_NUMBER = Property.externalInteger(RENDERING_PREFIX + "page_number");
Property RENDERED_BY = Property.externalTextBag(RENDERING_PREFIX + "Rendered-By");
Property RENDERED_MS = Property.externalReal(RENDERING_PREFIX + "rendering-time-ms");
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaPagedText.java
similarity index 61%
copy from tika-core/src/main/java/org/apache/tika/metadata/PagedText.java
copy to tika-core/src/main/java/org/apache/tika/metadata/TikaPagedText.java
index bafd972c9..e4bf1454e 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaPagedText.java
@@ -17,20 +17,19 @@
package org.apache.tika.metadata;
/**
- * XMP Paged-text schema. This is a collection of
- * {@link Property property definition} constants for the paged text
- * properties defined in the XMP standard.
+ * Metadata properties for paged text, metadata appropriate
+ * for an individual page (useful for embedded document handlers
+ * called on individual pages).
*
- * @see <a href="http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/xmp/pdfs/cc-201306/XMPSpecificationPart2.pdf"
- * >XMP Specification, Part 2: Standard Schemas</a>
- * @since Apache Tika 0.8
+ * Use {@link PagedText} where possible
*/
-public interface PagedText {
-
+public interface TikaPagedText {
+ String TIKA_PAGED_TEXT_PREFIX = "tika_pg:";
/**
- * "The number of pages in the document (including any in contained
- * documents)."
+ * 1-based page number for a specific page
*/
- Property N_PAGES = Property.internalInteger("xmpTPg:NPages");
+ Property PAGE_NUMBER = Property.internalInteger(TIKA_PAGED_TEXT_PREFIX + "page_number");
+
+ Property PAGE_ROTATION = Property.internalRational(TIKA_PAGED_TEXT_PREFIX + "page_rotation");
}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java b/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java
index 0c238b60d..d80ff7c5c 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java
@@ -22,7 +22,7 @@ import java.util.List;
import java.util.Map;
import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.metadata.Rendering;
+import org.apache.tika.metadata.TikaPagedText;
public class PageBasedRenderResults extends RenderResults {
@@ -32,7 +32,7 @@ public class PageBasedRenderResults extends RenderResults {
super(tmp);
}
public void add(RenderResult result) {
- Integer page = result.getMetadata().getInt(Rendering.PAGE_NUMBER);
+ Integer page = result.getMetadata().getInt(TikaPagedText.PAGE_NUMBER);
if (page != null) {
List<RenderResult> pageResults = results.get(page);
if (pageResults == null) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index f473e6f01..6a46f1339 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -92,6 +92,7 @@ import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.tools.imageio.ImageIOUtil;
import org.apache.pdfbox.util.Matrix;
import org.apache.pdfbox.util.Vector;
+import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
@@ -104,8 +105,8 @@ import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Font;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.PDF;
-import org.apache.tika.metadata.Rendering;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.TikaPagedText;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -115,8 +116,11 @@ import org.apache.tika.renderer.PageRangeRequest;
import org.apache.tika.renderer.RenderResult;
import org.apache.tika.renderer.Renderer;
import org.apache.tika.renderer.RenderingTracker;
-import org.apache.tika.renderer.pdf.PDDocumentRenderer;
-import org.apache.tika.renderer.pdf.PDFRenderingState;
+import org.apache.tika.renderer.pdf.pdfbox.NoTextPDFRenderer;
+import org.apache.tika.renderer.pdf.pdfbox.PDDocumentRenderer;
+import org.apache.tika.renderer.pdf.pdfbox.PDFRenderingState;
+import org.apache.tika.renderer.pdf.pdfbox.TextOnlyPDFRenderer;
+import org.apache.tika.renderer.pdf.pdfbox.VectorGraphicsOnlyPDFRenderer;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -164,10 +168,10 @@ class AbstractPDF2XHTML extends PDFTextStripper {
int unmappedUnicodeCharsPerPage = 0;
int totalCharsPerPage = 0;
- AbstractPDF2XHTML(PDDocument pdDocument, XHTMLContentHandler xhtml, ParseContext context,
+ AbstractPDF2XHTML(PDDocument pdDocument, ContentHandler handler, ParseContext context,
Metadata metadata, PDFParserConfig config) throws IOException {
this.pdDocument = pdDocument;
- this.xhtml = xhtml;
+ this.xhtml = new XHTMLContentHandler(handler, metadata);
this.context = context;
this.metadata = metadata;
this.config = config;
@@ -470,7 +474,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
- void doOCROnCurrentPage(PDFParserConfig.OCR_STRATEGY ocrStrategy)
+ void doOCROnCurrentPage(PDPage pdPage, PDFParserConfig.OCR_STRATEGY ocrStrategy)
throws IOException, TikaException, SAXException {
if (ocrStrategy.equals(NO_OCR)) {
return;
@@ -490,7 +494,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
try (TemporaryResources tmp = new TemporaryResources()) {
- RenderResult renderResult = renderCurrentPage(context, tmp);
+ RenderResult renderResult = renderCurrentPage(pdPage, context, tmp);
Metadata renderMetadata = renderResult.getMetadata();
try (InputStream is = TikaInputStream.get(renderResult.getPath())) {
renderMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
@@ -505,12 +509,13 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
- private RenderResult renderCurrentPage(ParseContext parseContext,
+ private RenderResult renderCurrentPage(PDPage pdPage, ParseContext parseContext,
TemporaryResources tmpResources)
throws IOException, TikaException {
PDFRenderingState renderingState = parseContext.get(PDFRenderingState.class);
if (renderingState == null) {
- noContextRenderCurrentPage(parseContext, tmpResources);
+ Metadata pageMetadata = getCurrentPageMetadata(pdPage);
+ noContextRenderCurrentPage(pageMetadata, parseContext, tmpResources);
}
//if the full document has already been rendered, then reuse that file
//TODO: we need to prevent this if only a portion of the page or portions
@@ -525,6 +530,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
return pageResults.get(0);
}
}
+ Metadata pageMetadata = getCurrentPageMetadata(pdPage);
Renderer thisRenderer = getPDFRenderer(config.getRenderer());
//if there's a configured renderer and if the rendering strategy is "all"
if (thisRenderer != null &&
@@ -533,25 +539,21 @@ class AbstractPDF2XHTML extends PDFTextStripper {
new PageRangeRequest(getCurrentPageNo(), getCurrentPageNo());
if (thisRenderer instanceof PDDocumentRenderer) {
try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
- Metadata m = new Metadata();
- m.set(TikaCoreProperties.TYPE, PDFParser.MEDIA_TYPE.toString());
tis.setOpenContainer(pdDocument);
- return thisRenderer.render(tis, m, parseContext, pageRangeRequest)
+ return thisRenderer.render(tis, pageMetadata, parseContext, pageRangeRequest)
.getResults().get(0);
}
} else {
- Metadata m = new Metadata();
- m.set(TikaCoreProperties.TYPE, PDFParser.MEDIA_TYPE.toString());
PDFRenderingState state = context.get(PDFRenderingState.class);
if (state == null) {
throw new IllegalArgumentException("RenderingState must not be null");
}
return thisRenderer
- .render(state.getTikaInputStream(), m, parseContext, pageRangeRequest)
+ .render(state.getTikaInputStream(), pageMetadata, parseContext, pageRangeRequest)
.getResults().get(0);
}
} else {
- return noContextRenderCurrentPage(parseContext, tmpResources);
+ return noContextRenderCurrentPage(pageMetadata, parseContext, tmpResources);
}
}
@@ -568,7 +570,16 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
- private RenderResult noContextRenderCurrentPage(ParseContext parseContext,
+ private Metadata getCurrentPageMetadata(PDPage pdPage) {
+ Metadata pageMetadata = new Metadata();
+ pageMetadata.set(TikaCoreProperties.TYPE, PDFParser.MEDIA_TYPE.toString());
+ pageMetadata.set(TikaPagedText.PAGE_NUMBER, getCurrentPageNo());
+ pageMetadata.set(TikaPagedText.PAGE_ROTATION, (float)pdPage.getRotation());
+ return pageMetadata;
+ }
+
+ private RenderResult noContextRenderCurrentPage(Metadata pageMetadata,
+ ParseContext parseContext,
TemporaryResources tmpResources)
throws IOException, TikaException {
PDFRenderer renderer = null;
@@ -589,8 +600,6 @@ class AbstractPDF2XHTML extends PDFTextStripper {
int dpi = config.getOcrDPI();
Path tmpFile = null;
- Metadata m = new Metadata();
- m.set(Rendering.PAGE_NUMBER, pageIndex + 1);
RenderingTracker renderingTracker = parseContext.get(RenderingTracker.class);
if (renderingTracker == null) {
@@ -602,6 +611,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
try {
BufferedImage image =
renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType());
+
tmpFile = tmpResources.createTempFile();
try (OutputStream os = Files.newOutputStream(tmpFile)) {
//TODO: get output format from TesseractConfig
@@ -618,9 +628,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM,
ExceptionUtils.getStackTrace(e));
- return new RenderResult(RenderResult.STATUS.EXCEPTION, id, null, m);
+ return new RenderResult(RenderResult.STATUS.EXCEPTION, id, null, pageMetadata);
}
- return new RenderResult(RenderResult.STATUS.SUCCESS, id, tmpFile, m);
+ return new RenderResult(RenderResult.STATUS.SUCCESS, id, tmpFile, pageMetadata);
}
@Override
@@ -713,7 +723,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
if (config.getOcrStrategy() == PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION) {
- doOCROnCurrentPage(OCR_AND_TEXT_EXTRACTION);
+ doOCROnCurrentPage(page, OCR_AND_TEXT_EXTRACTION);
} else if (config.getOcrStrategy() == PDFParserConfig.OCR_STRATEGY.AUTO) {
boolean unmappedExceedsLimit = false;
if (totalCharsPerPage > config.getOcrStrategyAuto().getTotalCharsPerPage()) {
@@ -728,7 +738,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
if (totalCharsPerPage <= config.getOcrStrategyAuto().getTotalCharsPerPage() ||
unmappedExceedsLimit) {
- doOCROnCurrentPage(AUTO);
+ doOCROnCurrentPage(page, AUTO);
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
index 63c382558..c600c81ec 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
@@ -114,7 +114,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
}
//nearly directly copied from PDFBox ExtractImages
- private static void writeToBuffer(PDImage pdImage, String suffix, boolean directJPEG,
+ protected BufferedImage writeToBuffer(PDImage pdImage, String suffix, boolean directJPEG,
OutputStream out) throws IOException, TikaException {
if ("jpg".equals(suffix)) {
@@ -129,12 +129,14 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
} finally {
IOUtils.closeQuietly(data);
}
+ return null;
} else {
BufferedImage image = pdImage.getImage();
if (image != null) {
// for CMYK and other "unusual" colorspaces, the JPEG will be converted
ImageIOUtil.writeImage(image, suffix, out);
}
+ return image;
}
} else if ("jp2".equals(suffix)) {
String colorSpaceName = pdImage.getColorSpace().getName();
@@ -148,6 +150,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
} finally {
IOUtils.closeQuietly(data);
}
+ return null;
} else {
// for CMYK and other "unusual" colorspaces, the image will be converted
BufferedImage image = pdImage.getImage();
@@ -155,11 +158,13 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
// for CMYK and other "unusual" colorspaces, the JPEG will be converted
ImageIOUtil.writeImage(image, "jpeg2000", out);
}
+ return image;
}
} else if ("tif".equals(suffix) && pdImage.getColorSpace().equals(PDDeviceGray.INSTANCE)) {
BufferedImage image = pdImage.getImage();
+ //TODO: log or otherwise report
if (image == null) {
- return;
+ return null;
}
// CCITT compressed images can have a different colorspace, but this one is B/W
// This is a bitonal image, so copy to TYPE_BYTE_BINARY
@@ -174,6 +179,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
}
}
ImageIOUtil.writeImage(bitonalImage, suffix, out);
+ return image;
} else if ("jb2".equals(suffix)) {
InputStream data = pdImage.createInputStream(JB2);
try {
@@ -184,12 +190,14 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
} else {
BufferedImage image = pdImage.getImage();
if (image == null) {
- return;
+ return null;
}
ImageIOUtil.writeImage(image, suffix, out);
+ return image;
}
out.flush();
+ return null;
}
private static void copyUpToMaxLength(InputStream is, OutputStream os)
@@ -254,6 +262,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
}
PDImageXObject xobject = (PDImageXObject) pdImage;
+ //TODO: handle image metadata: xobject.getMetadata()
Integer cachedNumber = processedInlineImages.get(xobject.getCOSObject());
if (cachedNumber != null && pdfParserConfig.isExtractUniqueInlineImagesOnly()) {
// skip duplicate image
@@ -365,7 +374,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
}
}
- private void processImage(PDImage pdImage, int imageNumber)
+ protected void processImage(PDImage pdImage, int imageNumber)
throws IOException, TikaException, SAXException {
//this is the metadata for this particular image
Metadata metadata = new Metadata();
@@ -392,12 +401,13 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
if (pdImage instanceof PDImageXObject) {
+ //extract the metadata contained outside of the image
PDMetadataExtractor
.extract(((PDImageXObject) pdImage).getMetadata(), metadata, parseContext);
}
- //extract the metadata contained outside of the image
+ BufferedImage bufferedImage = null;
try {
- writeToBuffer(pdImage, suffix, useDirectJPEG, buffer);
+ bufferedImage = writeToBuffer(pdImage, suffix, useDirectJPEG, buffer);
} catch (MissingImageReaderException e) {
EmbeddedDocumentUtil.recordException(e, parentMetadata);
return;
@@ -405,9 +415,12 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
return;
}
- try (InputStream embeddedIs = TikaInputStream.get(buffer.toByteArray())) {
+ try (TikaInputStream tis = TikaInputStream.get(buffer.toByteArray())) {
+ if (bufferedImage != null) {
+ tis.setOpenContainer(bufferedImage);
+ }
embeddedDocumentExtractor
- .parseEmbedded(embeddedIs, new EmbeddedContentHandler(xhtml), metadata,
+ .parseEmbedded(tis, new EmbeddedContentHandler(xhtml), metadata,
false);
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
index 2658a484a..0adafe7fc 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/OCR2XHTML.java
@@ -23,12 +23,12 @@ import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
+import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
/**
@@ -37,9 +37,9 @@ import org.apache.tika.sax.XHTMLContentHandler;
*/
class OCR2XHTML extends AbstractPDF2XHTML {
- private OCR2XHTML(PDDocument document, XHTMLContentHandler xhtml, ParseContext context,
+ private OCR2XHTML(PDDocument document, ContentHandler handler, ParseContext context,
Metadata metadata, PDFParserConfig config) throws IOException {
- super(document, xhtml, context, metadata, config);
+ super(document, handler, context, metadata, config);
}
/**
@@ -47,18 +47,19 @@ class OCR2XHTML extends AbstractPDF2XHTML {
* of XHTML SAX events sent to the given content handler.
*
* @param document PDF document
- * @param xhtml SAX content handler
+ * @param handler SAX content handler
* @param metadata PDF metadata
* @throws SAXException if the content handler fails to process SAX events
* @throws TikaException if there was an exception outside of per page processing
*/
- public static void process(PDDocument document, XHTMLContentHandler xhtml, ParseContext context,
+ public static void process(PDDocument document, ContentHandler handler, ParseContext context,
Metadata metadata,
PDFParserConfig config)
throws SAXException, TikaException {
OCR2XHTML ocr2XHTML = null;
+
try {
- ocr2XHTML = new OCR2XHTML(document, xhtml, context, metadata, config);
+ ocr2XHTML = new OCR2XHTML(document, handler, context, metadata, config);
ocr2XHTML.writeText(document, new Writer() {
@Override
public void write(char[] cbuf, int off, int len) {
@@ -90,7 +91,7 @@ class OCR2XHTML extends AbstractPDF2XHTML {
public void processPage(PDPage pdPage) throws IOException {
try {
startPage(pdPage);
- doOCROnCurrentPage(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
+ doOCROnCurrentPage(pdPage, PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
endPage(pdPage);
} catch (TikaException | SAXException e) {
throw new IOException(e);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
index 602a8823e..93d1b7e81 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDF2XHTML.java
@@ -34,12 +34,12 @@ import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
+import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
/**
* Utility class that overrides the {@link PDFTextStripper} functionality
@@ -63,9 +63,9 @@ class PDF2XHTML extends AbstractPDF2XHTML {
private Map<COSStream, Integer> processedInlineImages = new HashMap<>();
private AtomicInteger inlineImageCounter = new AtomicInteger(0);
- PDF2XHTML(PDDocument document, XHTMLContentHandler xhtml, ParseContext context, Metadata metadata,
+ PDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata,
PDFParserConfig config) throws IOException {
- super(document, xhtml, context, metadata, config);
+ super(document, handler, context, metadata, config);
}
/**
@@ -73,12 +73,12 @@ class PDF2XHTML extends AbstractPDF2XHTML {
* of XHTML SAX events sent to the given content handler.
*
* @param document PDF document
- * @param xhtml SAX content handler
+ * @param handler SAX content handler
* @param metadata PDF metadata
* @throws SAXException if the content handler fails to process SAX events
* @throws TikaException if there was an exception outside of per page processing
*/
- public static void process(PDDocument document, XHTMLContentHandler xhtml, ParseContext context,
+ public static void process(PDDocument document, ContentHandler handler, ParseContext context,
Metadata metadata, PDFParserConfig config)
throws SAXException, TikaException {
PDF2XHTML pdf2XHTML = null;
@@ -88,9 +88,9 @@ class PDF2XHTML extends AbstractPDF2XHTML {
// handler.
if (config.isDetectAngles()) {
pdf2XHTML =
- new AngleDetectingPDF2XHTML(document, xhtml, context, metadata, config);
+ new AngleDetectingPDF2XHTML(document, handler, context, metadata, config);
} else {
- pdf2XHTML = new PDF2XHTML(document, xhtml, context, metadata, config);
+ pdf2XHTML = new PDF2XHTML(document, handler, context, metadata, config);
}
config.configure(pdf2XHTML);
@@ -225,10 +225,10 @@ class PDF2XHTML extends AbstractPDF2XHTML {
private static class AngleDetectingPDF2XHTML extends PDF2XHTML {
- private AngleDetectingPDF2XHTML(PDDocument document, XHTMLContentHandler xhtml,
+ private AngleDetectingPDF2XHTML(PDDocument document, ContentHandler handler,
ParseContext context, Metadata metadata,
PDFParserConfig config) throws IOException {
- super(document, xhtml, context, metadata, config);
+ super(document, handler, context, metadata, config);
}
@Override
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
index 3e4e1bf64..a3a49a367 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.java
@@ -41,12 +41,12 @@ import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructur
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.text.PDFMarkedContentExtractor;
import org.apache.pdfbox.text.TextPosition;
+import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
/**
* <p>This was added in Tika 1.24 as an alpha version of a text extractor
@@ -88,10 +88,10 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
//this stores state as we recurse through the structure tag tree
private State state = new State();
- private PDFMarkedContent2XHTML(PDDocument document, XHTMLContentHandler xhtml,
+ private PDFMarkedContent2XHTML(PDDocument document, ContentHandler handler,
ParseContext context, Metadata metadata, PDFParserConfig config)
throws IOException {
- super(document, xhtml, context, metadata, config);
+ super(document, handler, context, metadata, config);
}
/**
@@ -99,12 +99,12 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
* of XHTML SAX events sent to the given content handler.
*
* @param pdDocument PDF document
- * @param xhtml SAX content handler
+ * @param handler SAX content handler
* @param metadata PDF metadata
* @throws SAXException if the content handler fails to process SAX events
* @throws TikaException if there was an exception outside of per page processing
*/
- public static void process(PDDocument pdDocument, XHTMLContentHandler xhtml,
+ public static void process(PDDocument pdDocument, ContentHandler handler,
ParseContext context,
Metadata metadata, PDFParserConfig config)
throws SAXException, TikaException {
@@ -112,7 +112,7 @@ public class PDFMarkedContent2XHTML extends PDF2XHTML {
PDFMarkedContent2XHTML pdfMarkedContent2XHTML = null;
try {
pdfMarkedContent2XHTML =
- new PDFMarkedContent2XHTML(pdDocument, xhtml, context, metadata, config);
+ new PDFMarkedContent2XHTML(pdDocument, handler, context, metadata, config);
} catch (IOException e) {
throw new TikaException("couldn't initialize PDFMarkedContent2XHTML", e);
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 28f796157..1cd99b927 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -70,8 +70,8 @@ import org.apache.tika.renderer.PageRangeRequest;
import org.apache.tika.renderer.RenderResult;
import org.apache.tika.renderer.RenderResults;
import org.apache.tika.renderer.Renderer;
-import org.apache.tika.renderer.pdf.PDFBoxRenderer;
-import org.apache.tika.renderer.pdf.PDFRenderingState;
+import org.apache.tika.renderer.pdf.pdfbox.PDFBoxRenderer;
+import org.apache.tika.renderer.pdf.pdfbox.PDFRenderingState;
import org.apache.tika.sax.XHTMLContentHandler;
/**
@@ -170,21 +170,21 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
extractMetadata(pdfDocument, metadata, context);
AccessChecker checker = localConfig.getAccessChecker();
checker.check(metadata);
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
tstream.setOpenContainer(pdfDocument);
- handleRendering(pdfDocument, tstream, xhtml, metadata, context, localConfig);
+ handleRendering(pdfDocument, tstream, handler, metadata, context, localConfig);
if (handler != null) {
if (shouldHandleXFAOnly(hasXFA, localConfig)) {
- handleXFAOnly(pdfDocument, xhtml, metadata, context);
+ handleXFAOnly(pdfDocument, handler, metadata, context);
} else if (localConfig.getOcrStrategy()
.equals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY)) {
- OCR2XHTML.process(pdfDocument, xhtml, context, metadata, localConfig);
+ OCR2XHTML.process(pdfDocument, handler, context, metadata,
+ localConfig);
} else if (hasMarkedContent && localConfig.isExtractMarkedContent()) {
PDFMarkedContent2XHTML
- .process(pdfDocument, xhtml, context, metadata,
+ .process(pdfDocument, handler, context, metadata,
localConfig);
} else {
- PDF2XHTML.process(pdfDocument, xhtml, context, metadata,
+ PDF2XHTML.process(pdfDocument, handler, context, metadata,
localConfig);
}
}
@@ -478,10 +478,11 @@ public class PDFParser extends AbstractParser implements RenderingParser, Initia
return config.isIfXFAExtractOnlyXFA() && hasXFA;
}
- private void handleXFAOnly(PDDocument pdDocument, XHTMLContentHandler xhtml, Metadata metadata,
+ private void handleXFAOnly(PDDocument pdDocument, ContentHandler handler, Metadata metadata,
ParseContext context)
throws SAXException, IOException, TikaException {
XFAExtractor ex = new XFAExtractor();
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
try (InputStream is = new ByteArrayInputStream(
pdDocument.getDocumentCatalog().getAcroForm(null).getXFA().getBytes())) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/MuPDFRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java
similarity index 97%
rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/MuPDFRenderer.java
rename to tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java
index 983934677..dcf00279b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/MuPDFRenderer.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/mutool/MuPDFRenderer.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.renderer.pdf;
+package org.apache.tika.renderer.pdf.mutool;
import java.io.Closeable;
import java.io.File;
@@ -33,8 +33,8 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Rendering;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.TikaPagedText;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.renderer.PageBasedRenderResults;
@@ -111,7 +111,7 @@ public class MuPDFRenderer implements Renderer {
if (m.reset(n).find()) {
int pageIndex = Integer.parseInt(m.group(1));
Metadata renderMetadata = new Metadata();
- renderMetadata.set(Rendering.PAGE_NUMBER, pageIndex);
+ renderMetadata.set(TikaPagedText.PAGE_NUMBER, pageIndex);
renderMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.RENDERING.name());
results.add(new RenderResult(RenderResult.STATUS.SUCCESS, tracker.getNextId(),
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/NoTextPDFRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/NoTextPDFRenderer.java
similarity index 98%
rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/NoTextPDFRenderer.java
rename to tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/NoTextPDFRenderer.java
index c7874f4a0..8539b64ff 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/NoTextPDFRenderer.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/NoTextPDFRenderer.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.pdf;
+package org.apache.tika.renderer.pdf.pdfbox;
import java.io.IOException;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDDocumentRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDDocumentRenderer.java
similarity index 96%
rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDDocumentRenderer.java
rename to tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDDocumentRenderer.java
index 7cecd9a23..f6fd292f6 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDDocumentRenderer.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDDocumentRenderer.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.renderer.pdf;
+package org.apache.tika.renderer.pdf.pdfbox;
import org.apache.tika.renderer.Renderer;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFBoxRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
similarity index 96%
rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFBoxRenderer.java
rename to tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
index e5c5d8973..4313a1084 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFBoxRenderer.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.renderer.pdf;
+package org.apache.tika.renderer.pdf.pdfbox;
import java.awt.image.BufferedImage;
import java.io.IOException;
@@ -44,6 +44,7 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.Rendering;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.TikaPagedText;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.renderer.PageBasedRenderResults;
@@ -135,7 +136,8 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.RENDERING.name());
try {
- m.set(Rendering.PAGE_NUMBER, i);
+ m.set(TikaPagedText.PAGE_NUMBER, i);
+ m.set(TikaPagedText.PAGE_ROTATION, (double)pdDocument.getPage(i - 1).getRotation());
Path imagePath = renderPage(renderer, id, i, m);
results.add(new RenderResult(RenderResult.STATUS.SUCCESS, id, imagePath, m));
} catch (IOException e) {
@@ -153,6 +155,7 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
"-" + id + "-" + pageNumber + "." + imageFormatName);
try {
long start = System.currentTimeMillis();
+ //TODO: parameterize whether or not to un-rotate page?
BufferedImage image = renderer.renderImageWithDPI(pageNumber - 1, dpi, imageType);
long renderingElapsed = System.currentTimeMillis() - start;
metadata.set(PDFBOX_RENDERING_TIME_MS, renderingElapsed);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFRenderingState.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFRenderingState.java
similarity index 96%
rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFRenderingState.java
rename to tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFRenderingState.java
index 51ea0ae96..d38c502f6 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/PDFRenderingState.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFRenderingState.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.renderer.pdf;
+package org.apache.tika.renderer.pdf.pdfbox;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.renderer.RenderResults;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/TextOnlyPDFRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/TextOnlyPDFRenderer.java
similarity index 98%
rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/TextOnlyPDFRenderer.java
rename to tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/TextOnlyPDFRenderer.java
index f282d124c..fd4ec5571 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/TextOnlyPDFRenderer.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/TextOnlyPDFRenderer.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.pdf;
+package org.apache.tika.renderer.pdf.pdfbox;
import java.awt.Graphics2D;
import java.awt.geom.Point2D;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/VectorGraphicsOnlyPDFRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/VectorGraphicsOnlyPDFRenderer.java
similarity index 99%
rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/VectorGraphicsOnlyPDFRenderer.java
rename to tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/VectorGraphicsOnlyPDFRenderer.java
index acd4b9485..4dca0241c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/VectorGraphicsOnlyPDFRenderer.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/VectorGraphicsOnlyPDFRenderer.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.pdf;
+package org.apache.tika.renderer.pdf.pdfbox;
import java.awt.Graphics2D;
import java.io.IOException;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 48238267e..88caa8175 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1210,6 +1210,15 @@ public class PDFParserTest extends TikaTest {
assertContains("transport mined materials", xml);
}
+ @Test
+ public void testAnglesOnPageRotation() throws Exception {
+ PDFParserConfig pdfParserConfig = new PDFParserConfig();
+ pdfParserConfig.setDetectAngles(true);
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(PDFParserConfig.class, pdfParserConfig);
+ String xml = getXML("testPDF_rotated.pdf", parseContext).xml;
+ assertContains("until a further review indicates that the infrastructure", xml);
+ }
@Test
public void testUnmappedUnicodeStats() throws Exception {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
index ed61e2a02..e604a8179 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFRenderingTest.java
@@ -38,8 +38,8 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Rendering;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.TikaPagedText;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -65,7 +65,29 @@ public class PDFRenderingTest extends TikaTest {
assertEquals(2, metadataList.size());
Metadata tiffMetadata = metadataList.get(1);
assertEquals("RENDERING", tiffMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
- assertEquals(1, tiffMetadata.getInt(Rendering.PAGE_NUMBER));
+ assertEquals(1, tiffMetadata.getInt(TikaPagedText.PAGE_NUMBER));
+ }
+
+ @Test
+ public void testRotated() throws Exception {
+ ParseContext parseContext = configureParseContext();
+ TikaConfig config = getConfig("tika-rendering-config.xml");
+ Parser p = new AutoDetectParser(config);
+ List<Metadata> metadataList = getRecursiveMetadata("testPDF_rotated.pdf", p, parseContext);
+ Map<Integer, byte[]> embedded =
+ ((RenderCaptureExtractor)parseContext.get(EmbeddedDocumentExtractor.class))
+ .getEmbedded();
+
+ assertEquals(1, embedded.size());
+ assertTrue(embedded.containsKey(0));
+ //what else can we do to test this? File type == tiff? Run OCR?
+ assertTrue(embedded.get(0).length > 1000);
+
+ assertEquals(2, metadataList.size());
+ Metadata tiffMetadata = metadataList.get(1);
+ assertEquals("RENDERING", tiffMetadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ assertEquals(1, tiffMetadata.getInt(TikaPagedText.PAGE_NUMBER));
+ assertEquals(90.0, Double.parseDouble(tiffMetadata.get(TikaPagedText.PAGE_ROTATION)), 0.1);
}
private TikaConfig getConfig(String path) throws TikaException, IOException, SAXException {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
index 5b1351662..80a9a4c73 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/org/apache/tika/parser/pdf/tika-rendering-config.xml
@@ -20,6 +20,6 @@
<parser class="org.apache.tika.parser.DefaultParser"/>
</parsers>
<renderers>
- <renderer class="org.apache.tika.renderer.pdf.PDFBoxRenderer"/>
+ <renderer class="org.apache.tika.renderer.pdf.pdfbox.PDFBoxRenderer"/>
</renderers>
</properties>
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_rotated.pdf b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_rotated.pdf
new file mode 100644
index 000000000..9ca438c32
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/resources/test-documents/testPDF_rotated.pdf differ