You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/06/29 14:44:46 UTC

[tika] branch main updated: TIKA-3804 -- improve configurability of PDFBox based renderers.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new faf284d20 TIKA-3804 -- improve configurability of PDFBox based renderers.
faf284d20 is described below

commit faf284d209c76e21284f62529ca36e4e56974c7d
Author: tallison <ta...@apache.org>
AuthorDate: Wed Jun 29 10:44:27 2022 -0400

    TIKA-3804 -- improve configurability of PDFBox based renderers.
---
 .../tika/renderer/pdf/pdfbox/PDFBoxRenderer.java   |  57 +++++--
 .../org/apache/tika/server/core/CXFTestBase.java   |  51 ++++++
 .../tika/server/standard/UnpackerResourceTest.java |  57 +------
 .../standard/UnpackerResourceWithConfigTest.java   | 182 +++++++++++++++++++++
 .../test/resources/config/tika-config-unpacker.xml |  32 ++++
 5 files changed, 316 insertions(+), 63 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
index 347c72e00..3f146ab15 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/renderer/pdf/pdfbox/PDFBoxRenderer.java
@@ -50,6 +50,7 @@ import org.apache.tika.metadata.TikaPagedText;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.pdf.PDFParser;
+import org.apache.tika.parser.pdf.PDFParserConfig;
 import org.apache.tika.renderer.PageBasedRenderResults;
 import org.apache.tika.renderer.PageRangeRequest;
 import org.apache.tika.renderer.RenderRequest;
@@ -61,10 +62,11 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
 
     Set<MediaType> SUPPORTED_TYPES = Collections.singleton(PDFParser.MEDIA_TYPE);
 
-    private static final Logger LOG = LoggerFactory.getLogger(PDFBoxRenderer.class);
+    protected static final Logger LOG = LoggerFactory.getLogger(PDFBoxRenderer.class);
 
     /**
      * This is the amount of time it takes for PDFBox to render the page
+     * to a BufferedImage
      */
     public static Property PDFBOX_RENDERING_TIME_MS =
             Property.externalReal(Rendering.RENDERING_PREFIX + "pdfbox-rendering-ms");
@@ -82,9 +84,9 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
         return SUPPORTED_TYPES;
     }
 
-    private int dpi = 300;
-    private ImageType imageType = ImageType.GRAY;
-    private String imageFormatName = "tiff";
+    private int defaultDPI = 300;
+    private ImageType defaultImageType = ImageType.GRAY;
+    private String defaultImageFormatName = "png";
 
 
     @Override
@@ -128,7 +130,7 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
     }
 
     private void renderRange(PDDocument pdDocument, int start, int endInclusive, Metadata metadata,
-                                    ParseContext parseContext, PageBasedRenderResults results) {
+                             ParseContext parseContext, PageBasedRenderResults results) {
         PDFRenderer renderer = new PDFRenderer(pdDocument);
         RenderingTracker tracker = parseContext.get(RenderingTracker.class);
         if (tracker == null) {
@@ -143,7 +145,7 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
             try {
                 m.set(TikaPagedText.PAGE_NUMBER, i);
                 m.set(TikaPagedText.PAGE_ROTATION, (double)pdDocument.getPage(i - 1).getRotation());
-                results.add(renderPage(renderer, id, i, m));
+                results.add(renderPage(renderer, id, i, m, parseContext));
             } catch (IOException e) {
                 EmbeddedDocumentUtil.recordException(e, m);
                 results.add(new RenderResult(RenderResult.STATUS.EXCEPTION, id, null, m));
@@ -151,22 +153,24 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
         }
     }
 
-
     protected RenderResult renderPage(PDFRenderer renderer, int id, int pageNumber,
-                                     Metadata metadata)
+                                      Metadata metadata, ParseContext parseContext)
             throws IOException {
 
         Path tmpFile = Files.createTempFile("tika-pdfbox-rendering-",
-                "-" + id + "-" + pageNumber + "." + imageFormatName);
+                "-" + id + "-" + pageNumber + "." + getImageFormatName(parseContext));
         try {
             long start = System.currentTimeMillis();
             //TODO: parameterize whether or not to un-rotate page?
-            BufferedImage image = renderer.renderImageWithDPI(pageNumber - 1, dpi, imageType);
+            BufferedImage image = renderer.renderImageWithDPI(
+                    pageNumber - 1,
+                    getDPI(parseContext),
+                    getImageType(parseContext));
             long renderingElapsed = System.currentTimeMillis() - start;
             metadata.set(PDFBOX_RENDERING_TIME_MS, renderingElapsed);
             start = System.currentTimeMillis();
             try (OutputStream os = Files.newOutputStream(tmpFile)) {
-                ImageIOUtil.writeImage(image, imageFormatName, os, dpi);
+                ImageIOUtil.writeImage(image, getImageFormatName(parseContext), os, getDPI(parseContext));
             }
             long elapsedWrite = System.currentTimeMillis() - start;
             metadata.set(PDFBOX_IMAGE_WRITING_TIME_MS, elapsedWrite);
@@ -197,15 +201,38 @@ public class PDFBoxRenderer implements PDDocumentRenderer, Initializable {
     }
 
     public void setDPI(int dpi) {
-        this.dpi = dpi;
+        this.defaultDPI = dpi;
     }
 
-
     public void setImageType(ImageType imageType) {
-        this.imageType = imageType;
+        this.defaultImageType = imageType;
     }
 
     public void setImageFormatName(String imageFormatName) {
-        this.imageFormatName = imageFormatName;
+        this.defaultImageFormatName = imageFormatName;
+    }
+
+    protected int getDPI(ParseContext parseContext) {
+        PDFParserConfig pdfParserConfig = parseContext.get(PDFParserConfig.class);
+        if (pdfParserConfig == null) {
+            return defaultDPI;
+        }
+        return pdfParserConfig.getOcrDPI();
+    }
+
+    protected ImageType getImageType(ParseContext parseContext) {
+        PDFParserConfig pdfParserConfig = parseContext.get(PDFParserConfig.class);
+        if (pdfParserConfig == null) {
+            return defaultImageType;
+        }
+        return pdfParserConfig.getOcrImageType();
+    }
+
+    protected String getImageFormatName(ParseContext parseContext) {
+        PDFParserConfig pdfParserConfig = parseContext.get(PDFParserConfig.class);
+        if (pdfParserConfig == null) {
+            return defaultImageFormatName;
+        }
+        return pdfParserConfig.getOcrImageFormatName();
     }
 }
diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
index 8bfc02023..387b085a0 100644
--- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
+++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/CXFTestBase.java
@@ -21,6 +21,7 @@ import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
+import java.awt.image.BufferedImage;
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
@@ -260,4 +261,54 @@ public abstract class CXFTestBase {
         return tmp;
     }
 
+    protected static AverageColor getAverageColor(BufferedImage image, int minX, int maxX, int minY,
+                                                  int maxY) {
+        long totalRed = 0;
+        long totalGreen = 0;
+        long totalBlue = 0;
+        int pixels = 0;
+        for (int x = minX; x < maxX; x++) {
+            for (int y = minY; y < maxY; y++) {
+                int clr = image.getRGB(x, y);
+                int red = (clr & 0x00ff0000) >> 16;
+                int green = (clr & 0x0000ff00) >> 8;
+                int blue = clr & 0x000000ff;
+                totalRed += red;
+                totalGreen += green;
+                totalBlue += blue;
+                pixels++;
+            }
+        }
+        return new AverageColor((double) totalRed / (double) pixels,
+                (double) totalGreen / (double) pixels, (double) totalBlue / (double) pixels);
+    }
+
+    public static class AverageColor {
+        double red;
+        double green;
+        double blue;
+
+        public AverageColor(double averageRed, double averageGreen, double averageBlue) {
+            this.red = averageRed;
+            this.green = averageGreen;
+            this.blue = averageBlue;
+        }
+
+        public double getRed() {
+            return red;
+        }
+
+        public double getGreen() {
+            return green;
+        }
+
+        public double getBlue() {
+            return blue;
+        }
+
+        @Override
+        public String toString() {
+            return "AverageColor{" + "red=" + red + ", green=" + green + ", blue=" + blue + '}';
+        }
+    }
 }
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
index e7b1bf86c..cea91fb9b 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceTest.java
@@ -266,18 +266,18 @@ public class UnpackerResourceTest extends CXFTestBase {
             //top left
             AverageColor averageColor =
                     getAverageColor(image, 0, image.getWidth() / 5, 0, image.getHeight() / 10);
-            assertTrue(averageColor.red > 250);
-            assertTrue(averageColor.green < 1);
-            assertTrue(averageColor.blue < 1);
+            assertTrue(averageColor.getRed() > 250);
+            assertTrue(averageColor.getGreen() < 1);
+            assertTrue(averageColor.getBlue() < 1);
 
             //bottom left = green
             averageColor = getAverageColor(image, 0, image.getWidth() / 5,
                     image.getHeight() / 2 + image.getHeight() / 10,
                     image.getHeight() / 2 + 2 * image.getHeight() / 10);
 
-            assertTrue(averageColor.red < 1);
-            assertTrue(averageColor.green > 250);
-            assertTrue(averageColor.blue < 1);
+            assertTrue(averageColor.getRed() < 1);
+            assertTrue(averageColor.getGreen() > 250);
+            assertTrue(averageColor.getBlue() < 1);
 
             //bottom right = blue
             averageColor = getAverageColor(image, image.getWidth() / 2 + image.getWidth() / 10,
@@ -285,48 +285,9 @@ public class UnpackerResourceTest extends CXFTestBase {
                     image.getHeight() / 2 + image.getHeight() / 10,
                     image.getHeight() / 2 + 2 * image.getHeight() / 10);
 
-            assertTrue(averageColor.red < 1);
-            assertTrue(averageColor.green < 1);
-            assertTrue(averageColor.blue > 250);
-        }
-    }
-
-    private static AverageColor getAverageColor(BufferedImage image, int minX, int maxX, int minY,
-                                                int maxY) {
-        long totalRed = 0;
-        long totalGreen = 0;
-        long totalBlue = 0;
-        int pixels = 0;
-        for (int x = minX; x < maxX; x++) {
-            for (int y = minY; y < maxY; y++) {
-                int clr = image.getRGB(x, y);
-                int red = (clr & 0x00ff0000) >> 16;
-                int green = (clr & 0x0000ff00) >> 8;
-                int blue = clr & 0x000000ff;
-                totalRed += red;
-                totalGreen += green;
-                totalBlue += blue;
-                pixels++;
-            }
-        }
-        return new AverageColor((double) totalRed / (double) pixels,
-                (double) totalGreen / (double) pixels, (double) totalBlue / (double) pixels);
-    }
-
-    public static class AverageColor {
-        double red;
-        double green;
-        double blue;
-
-        public AverageColor(double averageRed, double averageGreen, double averageBlue) {
-            this.red = averageRed;
-            this.green = averageGreen;
-            this.blue = averageBlue;
-        }
-
-        @Override
-        public String toString() {
-            return "AverageColor{" + "red=" + red + ", green=" + green + ", blue=" + blue + '}';
+            assertTrue(averageColor.getRed() < 1);
+            assertTrue(averageColor.getGreen() < 1);
+            assertTrue(averageColor.getBlue() > 250);
         }
     }
 }
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java
new file mode 100644
index 000000000..c43630467
--- /dev/null
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/UnpackerResourceWithConfigTest.java
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server.standard;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import javax.imageio.ImageIO;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.server.core.CXFTestBase;
+import org.apache.tika.server.core.TikaServerParseExceptionMapper;
+import org.apache.tika.server.core.resource.UnpackerResource;
+import org.apache.tika.server.core.writer.TarWriter;
+import org.apache.tika.server.core.writer.ZipWriter;
+import org.apache.tika.server.standard.config.PDFServerConfig;
+
+public class UnpackerResourceWithConfigTest extends CXFTestBase {
+    private static final String BASE_PATH = "/unpack";
+    private static final String ALL_PATH = BASE_PATH + "/all";
+
+    @Override
+    protected void setUpResources(JAXRSServerFactoryBean sf) {
+        sf.setResourceClasses(UnpackerResource.class);
+        sf.setResourceProvider(UnpackerResource.class,
+                new SingletonResourceProvider(new UnpackerResource()));
+    }
+
+    @Override
+    protected void setUpProviders(JAXRSServerFactoryBean sf) {
+        List<Object> providers = new ArrayList<>();
+        providers.add(new TarWriter());
+        providers.add(new ZipWriter());
+        providers.add(new TikaServerParseExceptionMapper(false));
+        sf.setProviders(providers);
+    }
+
+    @Override
+    protected InputStream getTikaConfigInputStream() throws IOException {
+        return this.getClass().getResourceAsStream("/config/tika-config-unpacker.xml");
+    }
+
+    //Test that the PDFParser's renderer can be configured at parse time
+    //when specified in tika-config.xml
+    @Test
+    public void testPDFPerPageRenderColor() throws Exception {
+
+        //default is gray scale png; change to rgb and tiff
+        Response response = WebClient.create(CXFTestBase.endPoint + ALL_PATH)
+                .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "imageStrategy",
+                        "RenderPagesAtPageEnd")
+                .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "ocrImageType", "rgb")
+                .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "ocrImageFormatName", "tiff")
+                .accept("application/zip").put(ClassLoader.getSystemResourceAsStream(
+                        "test-documents/testColorRendering.pdf"));
+        Map<String, byte[]> results = readZipArchiveBytes((InputStream) response.getEntity());
+        byte[] renderedImage = null;
+        for (Map.Entry<String, byte[]> e : results.entrySet()) {
+            if (e.getKey().startsWith("tika-pdfbox-rendering")) {
+                renderedImage = e.getValue();
+                break;
+            }
+        }
+        assertEquals("image/tiff",
+                TikaConfig.getDefaultConfig().getDetector()
+                        .detect(new ByteArrayInputStream(renderedImage), new Metadata()).toString()
+        );
+
+        try (InputStream is = new ByteArrayInputStream(renderedImage)) {
+            BufferedImage image = ImageIO.read(is);
+            //top left = red
+            AverageColor averageColor =
+                    getAverageColor(image, 0, image.getWidth() / 5, 0, image.getHeight() / 10);
+            assertTrue(averageColor.getRed() > 250);
+            assertTrue(averageColor.getGreen() < 1);
+            assertTrue(averageColor.getBlue() < 1);
+
+            //bottom left = green
+            averageColor = getAverageColor(image, 0, image.getWidth() / 5,
+                    image.getHeight() / 2 + image.getHeight() / 10,
+                    image.getHeight() / 2 + 2 * image.getHeight() / 10);
+
+            assertTrue(averageColor.getRed() < 1);
+            assertTrue(averageColor.getGreen() > 250);
+            assertTrue(averageColor.getBlue() < 1);
+
+            //bottom right = blue
+            averageColor = getAverageColor(image, image.getWidth() / 2 + image.getWidth() / 10,
+                    image.getWidth() / 2 + 2 * image.getWidth() / 10,
+                    image.getHeight() / 2 + image.getHeight() / 10,
+                    image.getHeight() / 2 + 2 * image.getHeight() / 10);
+
+            assertTrue(averageColor.getRed() < 1);
+            assertTrue(averageColor.getGreen() < 1);
+            assertTrue(averageColor.getBlue() > 250);
+        }
+    }
+
+    @Test
+    public void testPDFPerPageRenderGray() throws Exception {
+        //keep the gray scale image type but change the format to jpeg
+
+        Response response = WebClient.create(CXFTestBase.endPoint + ALL_PATH)
+                .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "imageStrategy",
+                        "RenderPagesAtPageEnd")
+                .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "ocrImageType", "gray")
+                .header(PDFServerConfig.X_TIKA_PDF_HEADER_PREFIX + "ocrImageFormatName", "jpeg")
+                .accept("application/zip").put(ClassLoader.getSystemResourceAsStream(
+                        "test-documents/testColorRendering.pdf"));
+        Map<String, byte[]> results = readZipArchiveBytes((InputStream) response.getEntity());
+        byte[] renderedImage = null;
+        for (Map.Entry<String, byte[]> e : results.entrySet()) {
+            if (e.getKey().startsWith("tika-pdfbox-rendering")) {
+                renderedImage = e.getValue();
+                break;
+            }
+        }
+        assertEquals("image/jpeg",
+                TikaConfig.getDefaultConfig().getDetector()
+                        .detect(new ByteArrayInputStream(renderedImage), new Metadata()).toString()
+        );
+
+        try (InputStream is = new ByteArrayInputStream(renderedImage)) {
+            BufferedImage image = ImageIO.read(is);
+            //top left = red, rendered as mid gray
+            AverageColor averageColor =
+                    getAverageColor(image, 0, image.getWidth() / 5, 0, image.getHeight() / 10);
+
+            assertTrue(averageColor.getRed() > 140 && averageColor.getRed() < 160);
+            assertTrue(averageColor.getGreen() > 140 && averageColor.getGreen() < 160);
+            assertTrue(averageColor.getBlue() > 140 && averageColor.getBlue() < 160);
+
+            //bottom left = green, rendered as light gray
+            averageColor = getAverageColor(image, 0, image.getWidth() / 5,
+                    image.getHeight() / 2 + image.getHeight() / 10,
+                    image.getHeight() / 2 + 2 * image.getHeight() / 10);
+
+            assertTrue(averageColor.getRed() < 210 && averageColor.getRed() > 190);
+            assertTrue(averageColor.getGreen() < 210 && averageColor.getGreen() > 190);
+            assertTrue(averageColor.getBlue() < 210 && averageColor.getBlue() > 190);
+
+            //bottom right = blue, rendered as dark gray
+            averageColor = getAverageColor(image, image.getWidth() / 2 + image.getWidth() / 10,
+                    image.getWidth() / 2 + 2 * image.getWidth() / 10,
+                    image.getHeight() / 2 + image.getHeight() / 10,
+                    image.getHeight() / 2 + 2 * image.getHeight() / 10);
+            assertTrue(averageColor.getRed() < 100 && averageColor.getRed() > 90);
+            assertTrue(averageColor.getGreen() < 100 && averageColor.getGreen() > 90);
+            assertTrue(averageColor.getBlue() < 100 && averageColor.getBlue() > 90);
+        }
+    }
+
+}
diff --git a/tika-server/tika-server-standard/src/test/resources/config/tika-config-unpacker.xml b/tika-server/tika-server-standard/src/test/resources/config/tika-config-unpacker.xml
new file mode 100644
index 000000000..0e3c103b8
--- /dev/null
+++ b/tika-server/tika-server-standard/src/test/resources/config/tika-config-unpacker.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser">
+            <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+        </parser>
+        <parser class="org.apache.tika.parser.pdf.PDFParser">
+            <params>
+                <param name="sortByPosition" type="bool">true</param>
+            </params>
+        </parser>
+    </parsers>
+    <renderers>
+        <renderer class="org.apache.tika.renderer.pdf.pdfbox.PDFBoxRenderer"/>
+    </renderers>
+</properties>
\ No newline at end of file