You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/07/16 19:42:48 UTC

[tika] branch main updated: TIKA-3361 Make ocrStrategy=Auto more intelligent (#447)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 484a340  TIKA-3361 Make ocrStrategy=Auto more intelligent (#447)
484a340 is described below

commit 484a340a4643ed2335413ba4feddbe8d64f4e9d8
Author: Peter Kronenberg <pa...@gmail.com>
AuthorDate: Fri Jul 16 15:42:38 2021 -0400

    TIKA-3361 Make ocrStrategy=Auto more intelligent (#447)
    
    Co-authored-by: Peter Kronenberg <pe...@torch.ai>
---
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  | 17 +++--
 .../java/org/apache/tika/parser/pdf/PDFParser.java |  5 ++
 .../apache/tika/parser/pdf/PDFParserConfig.java    | 81 ++++++++++++++++++++++
 3 files changed, 98 insertions(+), 5 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 79a4160..968e97c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -583,12 +583,19 @@ class AbstractPDF2XHTML extends PDFTextStripper {
                     }
                 }
             }
-            if (config.getOcrStrategy()
-                    .equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
+            if (config.getOcrStrategy() == PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION) {
                 doOCROnCurrentPage(OCR_AND_TEXT_EXTRACTION);
-            } else if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.AUTO)) {
-                //TODO add more sophistication
-                if (totalCharsPerPage < 10 || unmappedUnicodeCharsPerPage > 10) {
+            } else if (config.getOcrStrategy() == PDFParserConfig.OCR_STRATEGY.AUTO) {
+                boolean unmappedExceedsLimit = false;
+                if (totalCharsPerPage > config.getOcrStrategyAuto().getTotalCharsPerPage()) {
+                    // There are enough characters to not have to do OCR.  Check number of unmapped characters
+                    final float percentUnmapped = (float) unmappedUnicodeCharsPerPage / totalCharsPerPage;
+                    final float unmappedCharacterLimit = config.getOcrStrategyAuto().getUnmappedUnicodeCharsPerPage();
+                    unmappedExceedsLimit = (unmappedCharacterLimit < 1)
+                            ? percentUnmapped > unmappedCharacterLimit
+                            : unmappedUnicodeCharsPerPage > unmappedCharacterLimit;
+                }
+                if (totalCharsPerPage <= config.getOcrStrategyAuto().getTotalCharsPerPage() || unmappedExceedsLimit) {
                     doOCROnCurrentPage(AUTO);
                 }
             }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 0351da1..0300705 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -478,6 +478,11 @@ public class PDFParser extends AbstractParser implements Initializable {
     }
 
     @Field
+    public void setOcrStrategyAuto(String ocrStrategyAuto) {
+        defaultConfig.setOcrStrategyAuto(ocrStrategyAuto); // raw string is forwarded unchanged to defaultConfig
+    }
+
+    @Field
     public void setOcrRenderingStrategy(String ocrRenderingStrategy) {
         defaultConfig.setOcrRenderingStrategy(ocrRenderingStrategy);
     }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index b665e4c..c74281b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -22,6 +22,8 @@ import java.lang.reflect.Modifier;
 import java.util.HashSet;
 import java.util.Locale;
 import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import org.apache.pdfbox.rendering.ImageType;
 import org.apache.pdfbox.text.PDFTextStripper;
@@ -96,6 +98,13 @@ public class PDFParserConfig implements Serializable {
 
     private OCR_STRATEGY ocrStrategy = OCR_STRATEGY.AUTO;
 
+    // Thresholds used when OCR_Strategy=AUTO; constructor args are (unmapped char limit, total chars per page)
+    private static final OCRStrategyAuto OCR_STRATEGY_AUTO_BETTER = new OCRStrategyAuto(10, 10); // limit of 10 unmapped chars
+    private static final OCRStrategyAuto OCR_STRATEGY_AUTO_FASTER = new OCRStrategyAuto(.1f, 10); // limit < 1, i.e. a 10% fraction
+    private static final int OCR_STRATEGY_AUTO_DEFAULT_CHARS_PER_PAGE = 10; // used when only one number is supplied
+
+    private OCRStrategyAuto ocrStrategyAuto = OCR_STRATEGY_AUTO_BETTER; // default preset
+
     private OCR_RENDERING_STRATEGY ocrRenderingStrategy = OCR_RENDERING_STRATEGY.NO_TEXT;
 
     private int ocrDPI = 300;
@@ -485,6 +494,13 @@ public class PDFParserConfig implements Serializable {
     }
 
     /**
+     * @return the thresholds applied when the OCR strategy is AUTO
+     */
+    public OCRStrategyAuto getOcrStrategyAuto() {
+        return ocrStrategyAuto;
+    }
+
+    /**
      * Which strategy to use for OCR
      *
      * @param ocrStrategy
@@ -494,6 +510,41 @@ public class PDFParserConfig implements Serializable {
         userConfigured.add("ocrStrategy");
     }
 
+
+    /**
+     * Sets the AUTO thresholds from "faster", "better" or "num[%][, num]": the unmapped
+     * character limit (count or percent), then optionally the minimum characters per page.
+     * @throws IllegalArgumentException if the value is malformed or a percent exceeds 100
+     */
+    public void setOcrStrategyAuto(String ocrStrategyAuto) {
+        // The alternation is grouped so the ^\s* and \s*$ anchors apply to both branches.
+        final String regex = "^\\s*(?:(faster|better)|(\\d{1,3})(%)?(?:,\\s*(\\d{1,3}))?)\\s*$";
+        Matcher matcher = Pattern.compile(regex).matcher(ocrStrategyAuto);
+        if (!matcher.matches()) {
+            throw new IllegalArgumentException("Error parsing OCRStrategyAuto - Must be in the form 'num[%], num'");
+        }
+        final String keyword = matcher.group(1);
+        if ("better".equals(keyword)) {
+            this.ocrStrategyAuto = OCR_STRATEGY_AUTO_BETTER;
+        } else if ("faster".equals(keyword)) {
+            this.ocrStrategyAuto = OCR_STRATEGY_AUTO_FASTER;
+        } else {
+            float unmappedUnicodeCharsPerPage = Integer.parseInt(matcher.group(2));
+            if (matcher.group(3) != null) {
+                // "%" given: keep as a fraction. NOTE(review): 100% yields 1.0f, which is not < 1 - confirm intent
+                if (unmappedUnicodeCharsPerPage > 100.0) {
+                    throw new IllegalArgumentException("Error parsing OCRStrategyAuto - Percent cannot exceed 100%");
+                }
+                unmappedUnicodeCharsPerPage = unmappedUnicodeCharsPerPage / 100f;
+            }
+            // The 2nd number is optional.  Default to 10 chars per page
+            int totalCharsPerPage = matcher.group(4) == null
+                    ? OCR_STRATEGY_AUTO_DEFAULT_CHARS_PER_PAGE : Integer.parseInt(matcher.group(4));
+            this.ocrStrategyAuto = new OCRStrategyAuto(unmappedUnicodeCharsPerPage, totalCharsPerPage);
+        }
+        userConfigured.add("ocrStrategyAuto");
+    }
+
     /**
      * Which strategy to use for OCR
      *
@@ -878,6 +929,36 @@ public class PDFParserConfig implements Serializable {
         }
     }
 
+    /**
+     * Encapsulates the two thresholds that drive OCR when the OCR strategy is AUTO.
+     * <p>
+     * A page is OCR'd if its total character count is {@code <=} this.totalCharsPerPage,
+     * or
+     * if its unmapped unicode characters exceed this.unmappedUnicodeCharsPerPage.
+     * <p>
+     * If unmappedUnicodeCharsPerPage is {@code >= 1}, it is compared to the absolute number of
+     * unmapped characters. If it is {@code < 1}, it is treated as a fraction and compared to
+     * the page's ratio of unmapped characters to total characters.
+     * The unmapped check is only evaluated when the page has more than totalCharsPerPage characters.
+     */
+    public static class OCRStrategyAuto implements Serializable {
+        private final float unmappedUnicodeCharsPerPage;
+        private final int totalCharsPerPage;
+
+        public OCRStrategyAuto(float unmappedUnicodeCharsPerPage, int totalCharsPerPage) {
+            this.totalCharsPerPage = totalCharsPerPage;
+            this.unmappedUnicodeCharsPerPage = unmappedUnicodeCharsPerPage;
+        }
+
+        public float getUnmappedUnicodeCharsPerPage() {
+            return unmappedUnicodeCharsPerPage;
+        }
+
+        public int getTotalCharsPerPage() {
+            return totalCharsPerPage;
+        }
+    }
+
     public enum OCR_RENDERING_STRATEGY {
         NO_TEXT, ALL; //AUTO?
         // Would TEXT_ONLY be useful in instances where the unicode mappings