You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/07/16 19:42:48 UTC
[tika] branch main updated: TIKA-3361 Make ocrStrategy=Auto more
intelligent (#447)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 484a340 TIKA-3361 Make ocrStrategy=Auto more intelligent (#447)
484a340 is described below
commit 484a340a4643ed2335413ba4feddbe8d64f4e9d8
Author: Peter Kronenberg <pa...@gmail.com>
AuthorDate: Fri Jul 16 15:42:38 2021 -0400
TIKA-3361 Make ocrStrategy=Auto more intelligent (#447)
Co-authored-by: Peter Kronenberg <pe...@torch.ai>
---
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 17 +++--
.../java/org/apache/tika/parser/pdf/PDFParser.java | 5 ++
.../apache/tika/parser/pdf/PDFParserConfig.java | 81 ++++++++++++++++++++++
3 files changed, 98 insertions(+), 5 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 79a4160..968e97c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -583,12 +583,19 @@ class AbstractPDF2XHTML extends PDFTextStripper {
}
}
}
- if (config.getOcrStrategy()
- .equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
+ if (config.getOcrStrategy() == PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION) {
doOCROnCurrentPage(OCR_AND_TEXT_EXTRACTION);
- } else if (config.getOcrStrategy().equals(PDFParserConfig.OCR_STRATEGY.AUTO)) {
- //TODO add more sophistication
- if (totalCharsPerPage < 10 || unmappedUnicodeCharsPerPage > 10) {
+ } else if (config.getOcrStrategy() == PDFParserConfig.OCR_STRATEGY.AUTO) {
+ boolean unmappedExceedsLimit = false;
+ if (totalCharsPerPage > config.getOcrStrategyAuto().getTotalCharsPerPage()) {
+ // There are enough characters to not have to do OCR. Check number of unmapped characters
+ final float percentUnmapped = (float) unmappedUnicodeCharsPerPage / totalCharsPerPage;
+ final float unmappedCharacterLimit = config.getOcrStrategyAuto().getUnmappedUnicodeCharsPerPage();
+ unmappedExceedsLimit = (unmappedCharacterLimit < 1)
+ ? percentUnmapped > unmappedCharacterLimit
+ : unmappedUnicodeCharsPerPage > unmappedCharacterLimit;
+ }
+ if (totalCharsPerPage <= config.getOcrStrategyAuto().getTotalCharsPerPage() || unmappedExceedsLimit) {
doOCROnCurrentPage(AUTO);
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
index 0351da1..0300705 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParser.java
@@ -478,6 +478,11 @@ public class PDFParser extends AbstractParser implements Initializable {
}
@Field
+ public void setOcrStrategyAuto(String ocrStrategyAuto) {
+ defaultConfig.setOcrStrategyAuto(ocrStrategyAuto);
+ }
+
+ @Field
public void setOcrRenderingStrategy(String ocrRenderingStrategy) {
defaultConfig.setOcrRenderingStrategy(ocrRenderingStrategy);
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
index b665e4c..c74281b 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/PDFParserConfig.java
@@ -22,6 +22,8 @@ import java.lang.reflect.Modifier;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.text.PDFTextStripper;
@@ -96,6 +98,13 @@ public class PDFParserConfig implements Serializable {
private OCR_STRATEGY ocrStrategy = OCR_STRATEGY.AUTO;
+ // If OCR_Strategy=AUTO, then this controls the algorithm used
+ private static final OCRStrategyAuto OCR_STRATEGY_AUTO_BETTER = new OCRStrategyAuto(10, 10);
+ private static final OCRStrategyAuto OCR_STRATEGY_AUTO_FASTER = new OCRStrategyAuto(.1f, 10);
+ private static final int OCR_STRATEGY_AUTO_DEFAULT_CHARS_PER_PAGE = 10;
+
+ private OCRStrategyAuto ocrStrategyAuto = OCR_STRATEGY_AUTO_BETTER;
+
private OCR_RENDERING_STRATEGY ocrRenderingStrategy = OCR_RENDERING_STRATEGY.NO_TEXT;
private int ocrDPI = 300;
@@ -485,6 +494,13 @@ public class PDFParserConfig implements Serializable {
}
/**
+ * @return ocr auto strategy to use when ocr_strategy = Auto
+ */
+ public OCRStrategyAuto getOcrStrategyAuto() {
+ return ocrStrategyAuto;
+ }
+
+ /**
* Which strategy to use for OCR
*
* @param ocrStrategy
@@ -494,6 +510,41 @@ public class PDFParserConfig implements Serializable {
userConfigured.add("ocrStrategy");
}
+
+ public void setOcrStrategyAuto(String ocrStrategyAuto) {
+ final String regex = "^\\s*(faster|better)|(\\d{1,3})(%)?(?:,\\s*(\\d{1,3}))?\\s*$";
+ Pattern pattern = Pattern.compile(regex);
+ Matcher matcher = pattern.matcher(ocrStrategyAuto);
+ if (matcher.matches()) {
+ final String group1 = matcher.group(1);
+
+ if ("better".equals(group1)) {
+ this.ocrStrategyAuto = OCR_STRATEGY_AUTO_BETTER;
+ } else if ("faster".equals(group1)) {
+ this.ocrStrategyAuto = OCR_STRATEGY_AUTO_FASTER;
+ } else {
+ float unmappedUnicodeCharsPerPage = Integer.parseInt(matcher.group(2));
+ if (matcher.group(3) != null) {
+ // If we have the percent sign, then convert
+ if (unmappedUnicodeCharsPerPage > 100.0) {
+ throw new IllegalArgumentException
+ ("Error parsing OCRStrategyAuto - Percent cannot exceed 100%");
+ }
+ unmappedUnicodeCharsPerPage = unmappedUnicodeCharsPerPage / 100f;
+ }
+ // The 2nd number is optional. Default to 10 chars per page
+ int totalCharsPerPage = matcher.group(4) == null
+ ? OCR_STRATEGY_AUTO_DEFAULT_CHARS_PER_PAGE
+ : Integer.parseInt(matcher.group(4));
+ this.ocrStrategyAuto = new OCRStrategyAuto(unmappedUnicodeCharsPerPage, totalCharsPerPage);
+ }
+ userConfigured.add("ocrStrategyAuto");
+
+ } else {
+ throw new IllegalArgumentException("Error parsing OCRStrategyAuto - Must be in the form 'num[%], num'");
+ }
+ }
+
/**
* Which strategy to use for OCR
*
@@ -878,6 +929,36 @@ public class PDFParserConfig implements Serializable {
}
}
+ /**
+ * Encapsulate the numbers used to control OCR Strategy when set to auto
+ * <p>
+ * If the total characters on the page <= this.totalCharsPerPage
+ * or
+ * total unmapped unicode characters on the page > this.unmappedUnicodeCharsPerPage
+ * then we will perform OCR on the page
+ * <p>
+ * If unmappedUnicodeCharsPerPage is an integer > 0, then we compare absolute number of characters.
+ * If it is a float < 1, then we assume it is a percentage and we compare it to the
+ * percentage of unmappedCharactersPerPage/totalCharsPerPage
+ */
+ public static class OCRStrategyAuto implements Serializable {
+ private final float unmappedUnicodeCharsPerPage;
+ private final int totalCharsPerPage;
+
+ public OCRStrategyAuto(float unmappedUnicodeCharsPerPage, int totalCharsPerPage) {
+ this.totalCharsPerPage = totalCharsPerPage;
+ this.unmappedUnicodeCharsPerPage = unmappedUnicodeCharsPerPage;
+ }
+
+ public float getUnmappedUnicodeCharsPerPage() {
+ return unmappedUnicodeCharsPerPage;
+ }
+
+ public int getTotalCharsPerPage() {
+ return totalCharsPerPage;
+ }
+ }
+
public enum OCR_RENDERING_STRATEGY {
NO_TEXT, ALL; //AUTO?
// Would TEXT_ONLY be useful in instances where the unicode mappings