You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/06/29 14:44:46 UTC
[tika] branch main updated: TIKA-3804 -- improve configurability of PDFBox based renderers.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new faf284d20 TIKA-3804 -- improve configurability of PDFBox based renderers.
faf284d20 is described below
commit faf284d209c76e21284f62529ca36e4e56974c7d
Author: tallison <ta...@apache.org>
AuthorDate: Wed Jun 29 10:44:27 2022 -0400
TIKA-3804 -- improve configurability of PDFBox based renderers.
---
.../tika/renderer/pdf/pdfbox/PDFBoxRenderer.java | 57 +++++--
.../org/apache/tika/server/core/CXFTestBase.java | 51 ++++++
.../tika/server/standard/UnpackerResourceTest.java | 57 +------
.../standard/UnpackerResourceWithConfigTest.java | 182 +++++++++++++++++++++
.../test/resources/config/tika-config-unpacker.xml | 32 ++++
5 files changed, 316 insertions(+), 63 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
index 347c72e00..3f146ab15 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
@@ -50,6 +50,7 @@ import org.apache.tika.metadata.TikaPagedText;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.pdf.PDFParser;
+import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.renderer.PageBasedRenderResults;
import org.apache.tika.renderer.PageRangeRequest;
import org.apache.tika.renderer.RenderRequest;
@@ -61,10 +62,11 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
Set<MediaType> SUPPORTED_TYPES = Collections.singleton(PDFParser.MEDIA_TYPE);
- private static final Logger LOG = LoggerFactory.getLogger(PDFBoxRenderer.class);
+ protected static final Logger LOG = LoggerFactory.getLogger(PDFBoxRenderer.class);
/**
* This is the amount of time it takes for PDFBox to render the page
+ * to a BufferedImage
*/
public static Property PDFBOX_RENDERING_TIME_MS =
Property.externalReal(Rendering.RENDERING_PREFIX + "pdfbox-rendering-ms");
@@ -82,9 +84,9 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
return SUPPORTED_TYPES;
}
- private int dpi = 300;
- private ImageType imageType = ImageType.GRAY;
- private String imageFormatName = "tiff";
+ private int defaultDPI = 300;
+ private ImageType defaultImageType = ImageType.GRAY;
+ private String defaultImageFormatName = "png";
@Override
@@ -128,7 +130,7 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
}
private void renderRange(PDDocument pdDocument, int start, int endInclusive, Metadata metadata,
- ParseContext parseContext, PageBasedRenderResults results) {
+ ParseContext parseContext, PageBasedRenderResults results) {
PDFRenderer renderer = new PDFRenderer(pdDocument);
RenderingTracker tracker = parseContext.get(RenderingTracker.class);
if (tracker == null) {
@@ -143,7 +145,7 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
try {
m.set(TikaPagedText.PAGE_NUMBER, i);
m.set(TikaPagedText.PAGE_ROTATION, (double)pdDocument.getPage(i - 1).getRotation());
- results.add(renderPage(renderer, id, i, m));
+ results.add(renderPage(renderer, id, i, m, parseContext));
} catch (IOException e) {
EmbeddedDocumentUtil.recordException(e, m);
results.add(new RenderResult(RenderResult.STATUS.EXCEPTION, id, null, m));
@@ -151,22 +153,24 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
}
}
-
protected RenderResult renderPage(PDFRenderer renderer, int id, int pageNumber,
- Metadata metadata)
+ Metadata metadata, ParseContext parseContext)
throws IOException {
Path tmpFile = Files.createTempFile("tika-pdfbox-rendering-",
- "-" + id + "-" + pageNumber + "." + imageFormatName);
+ "-" + id + "-" + pageNumber + "." + getImageFormatName(parseContext));
try {
long start = System.currentTimeMillis();
//TODO: parameterize whether or not to un-rotate page?
- BufferedImage image = renderer.renderImageWithDPI(pageNumber - 1, dpi, imageType);
+ BufferedImage image = renderer.renderImageWithDPI(
+ pageNumber - 1,
+ getDPI(parseContext),
+ getImageType(parseContext));
long renderingElapsed = System.currentTimeMillis() - start;
metadata.set(PDFBOX_RENDERING_TIME_MS, renderingElapsed);
start = System.currentTimeMillis();
try (OutputStream os = Files.newOutputStream(tmpFile)) {
- ImageIOUtil.writeImage(image, imageFormatName, os, dpi);
+ ImageIOUtil.writeImage(image, getImageFormatName(parseContext), os, getDPI(parseContext));
}
long elapsedWrite = System.currentTimeMillis() - start;
metadata.set(PDFBOX_IMAGE_WRITING_TIME_MS, elapsedWrite);
@@ -197,15 +201,38 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
}
public void setDPI(int dpi) {
- this.dpi = dpi;
+ this.defaultDPI = dpi;
}
-
public void setImageType(ImageType imageType) {
- this.imageType = imageType;
+ this.defaultImageType = imageType;
}
public void setImageFormatName(String imageFormatName) {
- this.imageFormatName = imageFormatName;
+ this.defaultImageFormatName = imageFormatName;
+ }
+
+ protected int getDPI(ParseContext parseContext) {
+ PDFParserConfig pdfParserConfig = parseContext.get(PDFParserConfig.class);
+ if (pdfParserConfig == null) {
+ return defaultDPI;
+ }
+ return pdfParserConfig.getOcrDPI();
+ }
+
+ protected ImageType getImageType(ParseContext parseContext) {
+ PDFParserConfig pdfParserConfig = parseContext.get(PDFParserConfig.class);
+ if (pdfParserConfig == null) {
+ return defaultImageType;
+ }
+ return pdfParserConfig.getOcrImageType();
+ }
+
+ protected String getImageFormatName(ParseContext parseContext) {
+ PDFParserConfig pdfParserConfig = parseContext.get(PDFParserConfig.class);
+ if (pdfParserConfig == null) {
+ return defaultImageFormatName;
+ }
+ return pdfParserConfig.getOcrImageFormatName();
}
}
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
index 8bfc02023..387b085a0 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
@@ -21,6 +21,7 @@ import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
+import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
@@ -260,4 +261,54 @@ public abstract class CXFTestBase {
return tmp;
}
+ protected static AverageColor getAverageColor(BufferedImage image, int minX, int maxX, int minY,
+ int maxY) {
+ long totalRed = 0;
+ long totalGreen = 0;
+ long totalBlue = 0;
+ int pixels = 0;
+ for (int x = minX; x < maxX; x++) {
+ for (int y = minY; y < maxY; y++) {
+ int clr = image.getRGB(x, y);
+ int red = (clr & 0x00ff0000) >> 16;
+ int green = (clr & 0x0000ff00) >> 8;
+ int blue = clr & 0x000000ff;
+ totalRed += red;
+ totalGreen += green;
+ totalBlue += blue;
+ pixels++;
+ }
+ }
+ return new AverageColor((double) totalRed / (double) pixels,
+ (double) totalGreen / (double) pixels, (double) totalBlue / (double) pixels);
+ }
+
+ public static class AverageColor {
+ double red;
+ double green;
+ double blue;
+
+ public AverageColor(double averageRed, double averageGreen, double averageBlue) {
+ this.red = averageRed;
+ this.green = averageGreen;
+ this.blue = averageBlue;
+ }
+
+ public double getRed() {
+ return red;
+ }
+
+ public double getGreen() {
+ return green;
+ }
+
+ public double getBlue() {
+ return blue;
+ }
+
+ @Override
+ public String toString() {
+ return "AverageColor{" + "red=" + red + ", green=" + green + ", blue=" + blue + '}';
+ }
+ }
}
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
index e7b1bf86c..cea91fb9b 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
@@ -266,18 +266,18 @@ public class UnpackerResourceTest extends CXFTestBase {
//top left
AverageColor averageColor =
getAverageColor(image, 0, image.getWidth() / 5, 0, image.getHeight() / 10);
- assertTrue(averageColor.red > 250);
- assertTrue(averageColor.green < 1);
- assertTrue(averageColor.blue < 1);
+ assertTrue(averageColor.getRed() > 250);
+ assertTrue(averageColor.getGreen() < 1);
+ assertTrue(averageColor.getBlue() < 1);
//bottom left = green
averageColor = getAverageColor(image, 0, image.getWidth() / 5,
image.getHeight() / 2 + image.getHeight() / 10,
image.getHeight() / 2 + 2 * image.getHeight() / 10);
- assertTrue(averageColor.red < 1);
- assertTrue(averageColor.green > 250);
- assertTrue(averageColor.blue < 1);
+ assertTrue(averageColor.getRed() < 1);
+ assertTrue(averageColor.getGreen() > 250);
+ assertTrue(averageColor.getBlue() < 1);
//bottom right = blue
averageColor = getAverageColor(image, image.getWidth() / 2 + image.getWidth() / 10,
@@ -285,48 +285,9 @@ public class UnpackerResourceTest extends CXFTestBase {
image.getHeight() / 2 + image.getHeight() / 10,
image.getHeight() / 2 + 2 * image.getHeight() / 10);
- assertTrue(averageColor.red < 1);
- assertTrue(averageColor.green < 1);
- assertTrue(averageColor.blue > 250);
- }
- }
-
- private static AverageColor getAverageColor(BufferedImage image, int minX, int maxX, int minY,
- int maxY) {
- long totalRed = 0;
- long totalGreen = 0;
- long totalBlue = 0;
- int pixels = 0;
- for (int x = minX; x < maxX; x++) {
- for (int y = minY; y < maxY; y++) {
- int clr = image.getRGB(x, y);
- int red = (clr & 0x00ff0000) >> 16;
- int green = (clr & 0x0000ff00) >> 8;
- int blue = clr & 0x000000ff;
- totalRed += red;
- totalGreen += green;
- totalBlue += blue;
- pixels++;
- }
- }
- return new AverageColor((double) totalRed / (double) pixels,
- (double) totalGreen / (double) pixels, (double) totalBlue / (double) pixels);
- }
-
- public static class AverageColor {
- double red;
- double green;
- double blue;
-
- public AverageColor(double averageRed, double averageGreen, double averageBlue) {
- this.red = averageRed;
- this.green = averageGreen;
- this.blue = averageBlue;
- }
-
- @Override
- public String toString() {
- return "AverageColor{" + "red=" + red + ", green=" + green + ", blue=" + blue + '}';
+ assertTrue(averageColor.getRed() < 1);
+ assertTrue(averageColor.getGreen() < 1);
+ assertTrue(averageColor.getBlue() > 250);
}
}
}
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java
new file mode 100644
index 000000000..c43630467
--- /dev/null
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server.standard;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import javax.imageio.ImageIO;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.server.core.CXFTestBase;
+import org.apache.tika.server.core.TikaServerParseExceptionMapper;
+import org.apache.tika.server.core.resource.UnpackerResource;
+import org.apache.tika.server.core.writer.TarWriter;
+import org.apache.tika.server.core.writer.ZipWriter;
+import org.apache.tika.server.standard.config.PDFServerConfig;
+
+public class UnpackerResourceWithConfigTest extends CXFTestBase {
+ private static final String BASE_PATH = "/unpack";
+ private static final String ALL_PATH = BASE_PATH + "/all";
+
+ @Override
+ protected void setUpResources(JAXRSServerFactoryBean sf) {
+ sf.setResourceClasses(UnpackerResource.class);
+ sf.setResourceProvider(UnpackerResource.class,
+ new SingletonResourceProvider(new UnpackerResource()));
+ }
+
+ @Override
+ protected void setUpProviders(JAXRSServerFactoryBean sf) {
+ List<Object> providers = new ArrayList<>();
+ providers.add(new TarWriter());
+ providers.add(new ZipWriter());
+ providers.add(new TikaServerParseExceptionMapper(false));
+ sf.setProviders(providers);
+ }
+
+ @Override
+ protected InputStream getTikaConfigInputStream() throws IOException {
+ return this.getClass().getResourceAsStream("/config/tika-config-unpacker.xml");
+ }
+
+ //Test that the PDFParser's renderer can be configured at parse time
+ //when specified in tika-config.xml
+ @Test
+ public void testPDFPerPageRenderColor() throws Exception {
+
+ //default is gray scale png; change to rgb and tiff
+ Response response = WebClient.create(CXFTestBase.endPoint + ALL_PATH)
+ .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "imageStrategy",
+ "RenderPagesAtPageEnd")
+ .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "ocrImageType", "rgb")
+ .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "ocrImageFormatName", "tiff")
+ .accept("application/zip").put(ClassLoader.getSystemResourceAsStream(
+ "test-documents/testColorRendering.pdf"));
+ Map<String, byte[]> results = readZipArchiveBytes((InputStream) response.getEntity());
+ byte[] renderedImage = null;
+ for (Map.Entry<String, byte[]> e : results.entrySet()) {
+ if (e.getKey().startsWith("tika-pdfbox-rendering")) {
+ renderedImage = e.getValue();
+ break;
+ }
+ }
+ assertEquals("image/tiff",
+ TikaConfig.getDefaultConfig().getDetector()
+ .detect(new ByteArrayInputStream(renderedImage), new Metadata()).toString()
+ );
+
+ try (InputStream is = new ByteArrayInputStream(renderedImage)) {
+ BufferedImage image = ImageIO.read(is);
+ //top left = red
+ AverageColor averageColor =
+ getAverageColor(image, 0, image.getWidth() / 5, 0, image.getHeight() / 10);
+ assertTrue(averageColor.getRed() > 250);
+ assertTrue(averageColor.getGreen() < 1);
+ assertTrue(averageColor.getBlue() < 1);
+
+ //bottom left = green
+ averageColor = getAverageColor(image, 0, image.getWidth() / 5,
+ image.getHeight() / 2 + image.getHeight() / 10,
+ image.getHeight() / 2 + 2 * image.getHeight() / 10);
+
+ assertTrue(averageColor.getRed() < 1);
+ assertTrue(averageColor.getGreen() > 250);
+ assertTrue(averageColor.getBlue() < 1);
+
+ //bottom right = blue
+ averageColor = getAverageColor(image, image.getWidth() / 2 + image.getWidth() / 10,
+ image.getWidth() / 2 + 2 * image.getWidth() / 10,
+ image.getHeight() / 2 + image.getHeight() / 10,
+ image.getHeight() / 2 + 2 * image.getHeight() / 10);
+
+ assertTrue(averageColor.getRed() < 1);
+ assertTrue(averageColor.getGreen() < 1);
+ assertTrue(averageColor.getBlue() > 250);
+ }
+ }
+
+ @Test
+ public void testPDFPerPageRenderGray() throws Exception {
+
+
+ Response response = WebClient.create(CXFTestBase.endPoint + ALL_PATH)
+ .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "imageStrategy",
+ "RenderPagesAtPageEnd")
+ .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "ocrImageType", "gray")
+ .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "ocrImageFormatName", "jpeg")
+ .accept("application/zip").put(ClassLoader.getSystemResourceAsStream(
+ "test-documents/testColorRendering.pdf"));
+ Map<String, byte[]> results = readZipArchiveBytes((InputStream) response.getEntity());
+ byte[] renderedImage = null;
+ for (Map.Entry<String, byte[]> e : results.entrySet()) {
+ if (e.getKey().startsWith("tika-pdfbox-rendering")) {
+ renderedImage = e.getValue();
+ break;
+ }
+ }
+ assertEquals("image/jpeg",
+ TikaConfig.getDefaultConfig().getDetector()
+ .detect(new ByteArrayInputStream(renderedImage), new Metadata()).toString()
+ );
+
+ try (InputStream is = new ByteArrayInputStream(renderedImage)) {
+ BufferedImage image = ImageIO.read(is);
+ //top left = red
+ AverageColor averageColor =
+ getAverageColor(image, 0, image.getWidth() / 5, 0, image.getHeight() / 10);
+
+ assertTrue(averageColor.getRed() > 140 && averageColor.getRed() < 160);
+ assertTrue(averageColor.getGreen() > 140 && averageColor.getGreen() < 160);
+ assertTrue(averageColor.getBlue() > 140 && averageColor.getBlue() < 160);
+
+ //bottom left = green
+ averageColor = getAverageColor(image, 0, image.getWidth() / 5,
+ image.getHeight() / 2 + image.getHeight() / 10,
+ image.getHeight() / 2 + 2 * image.getHeight() / 10);
+
+ assertTrue(averageColor.getRed() < 210 && averageColor.getRed() > 190);
+ assertTrue(averageColor.getGreen() < 210 && averageColor.getGreen() > 190);
+ assertTrue(averageColor.getBlue() < 210 && averageColor.getBlue() > 190);
+
+ //bottom right = blue
+ averageColor = getAverageColor(image, image.getWidth() / 2 + image.getWidth() / 10,
+ image.getWidth() / 2 + 2 * image.getWidth() / 10,
+ image.getHeight() / 2 + image.getHeight() / 10,
+ image.getHeight() / 2 + 2 * image.getHeight() / 10);
+ assertTrue(averageColor.getRed() < 100 && averageColor.getRed() > 90);
+ assertTrue(averageColor.getGreen() < 100 && averageColor.getGreen() > 90);
+ assertTrue(averageColor.getBlue() < 100 && averageColor.getBlue() > 90);
+ }
+ }
+
+}
diff --git a/tika-server/tika-server-standard/src/test/resources/config/tika-config-unpacker.xml b/tika-server/tika-server-standard/src/test/resources/config/tika-config-unpacker.xml
new file mode 100644
index 000000000..0e3c103b8
--- /dev/null
+++ b/tika-server/tika-server-standard/src/test/resources/config/tika-config-unpacker.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+ </parser>
+ <parser class="org.apache.tika.parser.pdf.PDFParser">
+ <params>
+ <param name="sortByPosition" type="bool">true</param>
+ </params>
+ </parser>
+ </parsers>
+ <renderers>
+ <renderer class="org.apache.tika.renderer.pdf.pdfbox.PDFBoxRenderer"/>
+ </renderers>
+</properties>
\ No newline at end of file