You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/02/16 20:08:22 UTC
[tika] branch main updated: TIKA-3298 -- allow empty string
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 0fac4f8 TIKA-3298 -- allow empty string
0fac4f8 is described below
commit 0fac4f8efdc9964db3780b6fcfc1010483554cb7
Author: tballison <ta...@apache.org>
AuthorDate: Tue Feb 16 15:08:11 2021 -0500
TIKA-3298 -- allow empty string
---
.../main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java | 9 ++++++---
.../java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java | 3 ++-
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index f49254d..3705f99 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -324,11 +324,14 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
*/
private void doOCR(File input, File output, TesseractOCRConfig config) throws IOException, TikaException {
-
ArrayList<String> cmd = new ArrayList<>(Arrays.asList(
- getTesseractPath().toString() + getTesseractProg(), input.getPath(), output.getPath(), "-l",
- config.getLanguage(), "--psm", config.getPageSegMode()
+ getTesseractPath() + getTesseractProg(), input.getPath(), output.getPath(),
+ "--psm", config.getPageSegMode()
));
+ if (! StringUtils.isBlank(config.getLanguage())) {
+ cmd.add("-l");
+ cmd.add(config.getLanguage());
+ }
for (Map.Entry<String, String> entry : config.getOtherTesseractConfig().entrySet()) {
cmd.add("-c");
cmd.add(entry.getKey() + "=" + entry.getValue());
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
index 02fc149..3aa87d2 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRConfigTest.java
@@ -105,7 +105,8 @@ public class TesseractOCRConfigTest extends TikaTest {
@Test
public void testValidateInvalidLanguage() {
List<String> invalidLanguages = Arrays.asList(
- "", "+", "en", "en+", "eng+fra+", "Arabic", "/script/Arabic", "rm -rf *");
+ //"", allow empty string
+ "+", "en", "en+", "eng+fra+", "Arabic", "/script/Arabic", "rm -rf *");
TesseractOCRConfig config = new TesseractOCRConfig();