You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/06 18:14:52 UTC

[tika] 05/05: add VectorGraphicsOnlyPDFRenderer

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 99dc4885d9ec670396b932df4ee3a911a9c262f4
Author: tballison <ta...@apache.org>
AuthorDate: Fri May 6 14:14:33 2022 -0400

    add VectorGraphicsOnlyPDFRenderer
---
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |   3 +
 .../apache/tika/parser/pdf/PDFParserConfig.java    |   9 +-
 .../parser/pdf/VectorGraphicsOnlyPDFRenderer.java  | 133 +++++++++++++++++++++
 3 files changed, 144 insertions(+), 1 deletion(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 344756dd0..f473e6f01 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -579,6 +579,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             case TEXT_ONLY:
                 renderer = new TextOnlyPDFRenderer(pdDocument);
                 break;
+            case VECTOR_GRAPHICS_ONLY:
+                renderer = new VectorGraphicsOnlyPDFRenderer(pdDocument);
+                break;
             case ALL:
                 renderer = new PDFRenderer(pdDocument);
                 break;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index fb8a315ae..12b10e6c1 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -930,7 +930,12 @@ public class PDFParserConfig implements Serializable {
     }
 
     public enum OCR_RENDERING_STRATEGY {
-        NO_TEXT, TEXT_ONLY, ALL; //AUTO?
+
+        NO_TEXT, //includes vector graphics and images
+        TEXT_ONLY, //renders only glyphs
+        VECTOR_GRAPHICS_ONLY, //renders only vector graphics
+        ALL;
+        //TODO: add AUTO?
 
         private static OCR_RENDERING_STRATEGY parse(String s) {
             if (s == null) {
@@ -938,6 +943,8 @@ public class PDFParserConfig implements Serializable {
             }
             String lc = s.toLowerCase(Locale.US);
             switch (lc) {
+                case "vector_graphics_only":
+                    return VECTOR_GRAPHICS_ONLY;
                 case "text_only":
                     return TEXT_ONLY;
                 case "no_text":
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/VectorGraphicsOnlyPDFRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/VectorGraphicsOnlyPDFRenderer.java
new file mode 100644
index 000000000..acd4b9485
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/VectorGraphicsOnlyPDFRenderer.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.awt.Graphics2D;
+import java.io.IOException;
+
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.font.PDFont;
+import org.apache.pdfbox.pdmodel.font.PDType3Font;
+import org.apache.pdfbox.pdmodel.graphics.form.PDTransparencyGroup;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.rendering.PageDrawer;
+import org.apache.pdfbox.rendering.PageDrawerParameters;
+import org.apache.pdfbox.util.Matrix;
+import org.apache.pdfbox.util.Vector;
+
+/**
+ * This class extends the PDFRenderer to render only the vector
+ * graphics, skipping text glyphs and images
+ */
+public class VectorGraphicsOnlyPDFRenderer extends PDFRenderer {
+
+    public VectorGraphicsOnlyPDFRenderer(PDDocument document) {
+        super(document);
+    }
+
+    /**
+     * Returns a new PageDrawer instance, using the given parameters. May be overridden.
+     */
+    protected PageDrawer createPageDrawer(PageDrawerParameters parameters) throws IOException {
+        PageDrawer pageDrawer = new VectorGraphicsOnlyDrawer(parameters);
+        pageDrawer.setAnnotationFilter(getAnnotationsFilter());
+        return pageDrawer;
+    }
+
+    private class VectorGraphicsOnlyDrawer extends PageDrawer {
+        public VectorGraphicsOnlyDrawer(PageDrawerParameters parameters) throws IOException {
+            super(parameters);
+        }
+
+
+        @Override
+        public void beginText() throws IOException {
+        }
+
+        @Override
+        public void endText() throws IOException {
+        }
+
+        @Override
+        protected void showFontGlyph(Matrix textRenderingMatrix, PDFont font, int code,
+                                     Vector displacement) throws IOException {
+        }
+
+        @Override
+        protected void showType3Glyph(Matrix textRenderingMatrix, PDType3Font font, int code,
+                                      Vector displacement) throws IOException {
+        }
+
+        @Override
+        public void drawImage(PDImage pdImage) throws IOException {
+        }
+
+        @Override
+        protected void showTransparencyGroupOnGraphics(PDTransparencyGroup form,
+                                                       Graphics2D graphics) throws IOException {
+        }
+
+        @Override
+        public void beginMarkedContentSequence(COSName tag, COSDictionary properties) {
+        }
+
+        @Override
+        public void endMarkedContentSequence() {
+        }
+
+
+        @Override
+        public void showTextString(byte[] string) throws IOException {
+        }
+
+        @Override
+        public void showTextStrings(COSArray array) throws IOException {
+        }
+
+        @Override
+        protected void applyTextAdjustment(float tx, float ty) throws IOException {
+        }
+
+        @Override
+        protected void showText(byte[] string) throws IOException {
+        }
+
+        @Override
+        protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, String unicode,
+                                 Vector displacement) throws IOException {
+        }
+
+        @Override
+        protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code,
+                                 Vector displacement) throws IOException {
+        }
+
+        @Override
+        protected void showFontGlyph(Matrix textRenderingMatrix, PDFont font, int code,
+                                     String unicode, Vector displacement) throws IOException {
+        }
+
+        @Override
+        protected void showType3Glyph(Matrix textRenderingMatrix, PDType3Font font, int code,
+                                      String unicode, Vector displacement) throws IOException {
+        }
+    }
+}