You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/02/03 18:19:38 UTC
[tika] branch main updated: TIKA-3286 -- throw TikaException on tesseract exitValue > 0
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new f51e9ba TIKA-3286 -- throw TikaException on tesseract exitValue > 0
f51e9ba is described below
commit f51e9ba3a95edd16246b0a5b69c0a130683cb1ae
Author: tallison <ta...@apache.org>
AuthorDate: Wed Feb 3 13:19:18 2021 -0500
TIKA-3286 -- throw TikaException on tesseract exitValue > 0
---
.../apache/tika/parser/ocr/TesseractOCRParser.java | 24 +++++++++++++++-------
.../tika/parser/ocr/TesseractOCRParserTest.java | 19 +++++++++++++++++
2 files changed, 36 insertions(+), 7 deletions(-)
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index 04d22c6..5a8b0e9 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -67,9 +67,12 @@ import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
+import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.tika.sax.XHTMLContentHandler.XHTML;
@@ -324,9 +327,10 @@ public class TesseractOCRParser extends AbstractParser {
process.getOutputStream().close();
InputStream out = process.getInputStream();
InputStream err = process.getErrorStream();
-
- logStream("OCR MSG", out, input);
- logStream("OCR ERROR", err, input);
+ StringBuilder outBuilder = new StringBuilder();
+ StringBuilder errBuilder = new StringBuilder();
+ logStream("OCR MSG", out, input, outBuilder);
+ logStream("OCR ERROR", err, input, errBuilder);
FutureTask<Integer> waitTask = new FutureTask<>(new Callable<Integer>() {
public Integer call() throws Exception {
@@ -337,8 +341,9 @@ public class TesseractOCRParser extends AbstractParser {
Thread waitThread = new Thread(waitTask);
waitThread.start();
+ int exitValue = Integer.MIN_VALUE;
try {
- waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
+ exitValue = waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
} catch (InterruptedException e) {
waitThread.interrupt();
process.destroy();
@@ -351,6 +356,10 @@ public class TesseractOCRParser extends AbstractParser {
process.destroy();
throw new TikaException("TesseractOCRParser timeout", e);
}
+ if (exitValue > 0) {
+ throw new TikaException("TesseractOCRParser bad exit value " +
+ exitValue + " err msg: "+errBuilder.toString());
+ }
}
/**
@@ -402,15 +411,16 @@ public class TesseractOCRParser extends AbstractParser {
* stream of the given process to not block the process. The stream is closed
* once fully processed.
*/
- private void logStream(final String logType, final InputStream stream, final File file) {
+ private void logStream(final String logType, final InputStream stream,
+ final File file, final StringBuilder out) {
new Thread() {
public void run() {
Reader reader = new InputStreamReader(stream, UTF_8);
- StringBuilder out = new StringBuilder();
char[] buffer = new char[1024];
try {
- for (int n = reader.read(buffer); n != -1; n = reader.read(buffer))
+ for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
out.append(buffer, 0, n);
+ }
} catch (IOException e) {
} finally {
diff --git a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index c120467..64c8453 100644
--- a/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-classic/tika-parsers-classic-modules/tika-parser-ocr-module/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser.ocr;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
@@ -92,6 +93,24 @@ public class TesseractOCRParserTest extends TikaTest {
assertTrue(m.find());
}
+ @Test (expected = TikaException.class)
+ public void testBadLanguageCode() throws Exception {
+ assumeTrue("can run OCR", canRun());
+
+ TesseractOCRConfig tesseractOCRConfigconfig = new TesseractOCRConfig();
+ tesseractOCRConfigconfig.setLanguage("kerplekistanese");
+ ParseContext parseContext = new ParseContext();
+ parseContext.set(TesseractOCRConfig.class, tesseractOCRConfigconfig);
+
+ //with preserve interwordspacing "on"
+ //allow some flexibility in case Tesseract is computing spaces
+ //somewhat differently in different versions/OS's, etc.
+ String xml = getXML("testOCR_spacing.png",
+ getMetadata(MediaType.image("png")),
+ parseContext).xml;
+ System.out.println(xml);
+ }
+
private Metadata getMetadata(MediaType mediaType) {
Metadata metadata = new Metadata();
MediaType ocrMediaType = new MediaType(mediaType.getType(),