You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/03/29 19:15:21 UTC
[tika] 01/04: Fix for TIKA-2582 contributed by ewanmellor.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit d1526d053f91497ac7bcd4509f1555f4347377d6
Author: Ewan Mellor <co...@ewanmellor.org>
AuthorDate: Wed Feb 21 13:09:53 2018 -0800
Fix for TIKA-2582 contributed by ewanmellor.
Tesseract 4.0 includes a change to use form feed characters to separate
pages by default in its text output. Previous versions used no separator
unless you specified the include_page_breaks option.
This confuses any parser that is not expecting the FF.
ODFParserTest.testOO2Metadata fails, because it is expecting the output of
a blank image to be the empty string, but now the FF is there.
I haven't seen any other failures, but I expect that user code will now see
either FF or U+FFFD where they are not expecting it (SafeContentHandler
replaces the FF with U+FFFD when converting text to XML).
Fix this by setting Tesseract's page_separator option to the empty string.
This will preserve the no-page-breaks behavior with both Tesseract 3.x and
4.0.
Also, add an option TesseractOCRConfig.pageSeparator so that user code can
request the FF or any other separator, if they want it.
---
.../apache/tika/parser/ocr/TesseractOCRConfig.java | 22 ++++++++++++++++++++++
.../apache/tika/parser/ocr/TesseractOCRParser.java | 1 +
2 files changed, 23 insertions(+)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
index afe0a21..4139cd2 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRConfig.java
@@ -91,6 +91,9 @@ public class TesseractOCRConfig implements Serializable {
// factor by which image is to be scaled.
private int resize = 900;
+ // See setPageSeparator.
+ private String pageSeparator = "";
+
// whether or not to preserve interword spacing
private boolean preserveInterwordSpacing = false;
@@ -256,6 +259,25 @@ public class TesseractOCRConfig implements Serializable {
}
/**
+ * @return the configured page separator; see {@link #setPageSeparator(String)}
+ */
+ public String getPageSeparator() {
+ return pageSeparator;
+ }
+
+ /**
+ * The page separator to use in plain text output. This corresponds to Tesseract's page_separator config option.
+ * The default here is the empty string (i.e. no page separators). Note that this is also the default in
+ * Tesseract 3.x, but in Tesseract 4.0 the default is to use the form feed control character. We are overriding
+ * Tesseract 4.0's default here.
+ *
+ * @param pageSeparator the separator passed to Tesseract as page_separator; the empty string means no page separators
+ */
+ public void setPageSeparator(String pageSeparator) {
+ this.pageSeparator = pageSeparator;
+ }
+
+ /**
* Whether or not to maintain interword spacing. Default is <code>false</code>.
*
* @param preserveInterwordSpacing
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 08847fd..3e15c44 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -468,6 +468,7 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
String[] cmd = { config.getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
config.getLanguage(), "-psm", config.getPageSegMode(),
config.getOutputType().name().toLowerCase(Locale.US),
+ "-c", "page_separator=" + config.getPageSeparator(),
"-c",
(config.getPreserveInterwordSpacing())? "preserve_interword_spaces=1" : "preserve_interword_spaces=0"};
ProcessBuilder pb = new ProcessBuilder(cmd);
--
To stop receiving notification emails like this one, please contact
tallison@apache.org.