You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/06 18:14:52 UTC
[tika] 05/05: add VectorGraphicsOnlyPDFRenderer
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 99dc4885d9ec670396b932df4ee3a911a9c262f4
Author: tballison <ta...@apache.org>
AuthorDate: Fri May 6 14:14:33 2022 -0400
add VectorGraphicsOnlyPDFRenderer
---
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 3 +
.../apache/tika/parser/pdf/PDFParserConfig.java | 9 +-
.../parser/pdf/VectorGraphicsOnlyPDFRenderer.java | 133 +++++++++++++++++++++
3 files changed, 144 insertions(+), 1 deletion(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 344756dd0..f473e6f01 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -579,6 +579,9 @@ class AbstractPDF2XHTML extends PDFTextStripper {
case TEXT_ONLY:
renderer = new TextOnlyPDFRenderer(pdDocument);
break;
+ case VECTOR_GRAPHICS_ONLY:
+ renderer = new VectorGraphicsOnlyPDFRenderer(pdDocument);
+ break;
case ALL:
renderer = new PDFRenderer(pdDocument);
break;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index fb8a315ae..12b10e6c1 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -930,7 +930,12 @@ public class PDFParserConfig implements Serializable {
}
public enum OCR_RENDERING_STRATEGY {
- NO_TEXT, TEXT_ONLY, ALL; //AUTO?
+
+ NO_TEXT, //includes vector graphics and image
+ TEXT_ONLY, //renders only glyphs
+ VECTOR_GRAPHICS_ONLY, //renders only vector graphics
+ ALL;
+ //TODO: add AUTO?
private static OCR_RENDERING_STRATEGY parse(String s) {
if (s == null) {
@@ -938,6 +943,8 @@ public class PDFParserConfig implements Serializable {
}
String lc = s.toLowerCase(Locale.US);
switch (lc) {
+ case "vector_graphics_only":
+ return VECTOR_GRAPHICS_ONLY;
case "text_only":
return TEXT_ONLY;
case "no_text":
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/VectorGraphicsOnlyPDFRenderer.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/VectorGraphicsOnlyPDFRenderer.java
new file mode 100644
index 000000000..acd4b9485
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/VectorGraphicsOnlyPDFRenderer.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.pdf;
+
+import java.awt.Graphics2D;
+import java.io.IOException;
+
+import org.apache.pdfbox.cos.COSArray;
+import org.apache.pdfbox.cos.COSDictionary;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.font.PDFont;
+import org.apache.pdfbox.pdmodel.font.PDType3Font;
+import org.apache.pdfbox.pdmodel.graphics.form.PDTransparencyGroup;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.apache.pdfbox.rendering.PageDrawer;
+import org.apache.pdfbox.rendering.PageDrawerParameters;
+import org.apache.pdfbox.util.Matrix;
+import org.apache.pdfbox.util.Vector;
+
+/**
+ * This class extends the PDFRenderer to render only vector graphics:
+ * the text, glyph and image callbacks of its page drawer are no-ops.
+ */
+public class VectorGraphicsOnlyPDFRenderer extends PDFRenderer {
+
+ public VectorGraphicsOnlyPDFRenderer(PDDocument document) {
+ super(document);
+ }
+
+ /** Returns a VectorGraphicsOnlyDrawer that uses this renderer's annotation filter. */
+ @Override
+ protected PageDrawer createPageDrawer(PageDrawerParameters parameters) throws IOException {
+ PageDrawer pageDrawer = new VectorGraphicsOnlyDrawer(parameters);
+ pageDrawer.setAnnotationFilter(getAnnotationsFilter());
+ return pageDrawer;
+ }
+
+ /** Page drawer whose text, image, transparency-group and marked-content callbacks are no-ops. */
+ private static class VectorGraphicsOnlyDrawer extends PageDrawer {
+ public VectorGraphicsOnlyDrawer(PageDrawerParameters parameters) throws IOException {
+ super(parameters);
+ }
+
+ @Override
+ public void beginText() throws IOException {
+ }
+
+ @Override
+ public void endText() throws IOException {
+ }
+
+ @Override
+ protected void showFontGlyph(Matrix textRenderingMatrix, PDFont font, int code,
+ Vector displacement) throws IOException {
+ }
+
+ @Override
+ protected void showType3Glyph(Matrix textRenderingMatrix, PDType3Font font, int code,
+ Vector displacement) throws IOException {
+ }
+
+ @Override
+ public void drawImage(PDImage pdImage) throws IOException {
+ }
+
+ @Override
+ protected void showTransparencyGroupOnGraphics(PDTransparencyGroup form,
+ Graphics2D graphics) throws IOException {
+ // NOTE(review): skipped entirely -- confirm vector paths inside groups should be dropped
+ }
+
+ @Override
+ public void beginMarkedContentSequence(COSName tag, COSDictionary properties) {
+ // NOTE(review): no super call -- confirm PageDrawer's marked-content handling may be bypassed
+ }
+
+ @Override
+ public void endMarkedContentSequence() {
+ }
+
+ @Override
+ public void showTextString(byte[] string) throws IOException {
+ }
+
+ @Override
+ public void showTextStrings(COSArray array) throws IOException {
+ }
+
+ @Override
+ protected void applyTextAdjustment(float tx, float ty) throws IOException {
+ }
+
+ @Override
+ protected void showText(byte[] string) throws IOException {
+ }
+
+ @Override
+ protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, String unicode,
+ Vector displacement) throws IOException {
+ }
+
+ @Override
+ protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code,
+ Vector displacement) throws IOException {
+ }
+
+ @Override
+ protected void showFontGlyph(Matrix textRenderingMatrix, PDFont font, int code,
+ String unicode, Vector displacement) throws IOException {
+ }
+
+ @Override
+ protected void showType3Glyph(Matrix textRenderingMatrix, PDType3Font font, int code,
+ String unicode, Vector displacement) throws IOException {
+ }
+ }
+}