You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/06/09 15:37:35 UTC
[tika] 03/04: TIKA-3441 -- improve likelihood that tesseract
processes will be shut down on crash.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit d7fa2cd284a0d400a1ef29f7111018bb16b1cc5d
Author: tallison <ta...@apache.org>
AuthorDate: Wed Jun 9 11:37:02 2021 -0400
TIKA-3441 -- improve likelihood that tesseract processes will be shut down on crash.
---
.../apache/tika/parser/ocr/TesseractOCRParser.java | 91 ++++++++++++----------
1 file changed, 52 insertions(+), 39 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index b2c4496..fa52248 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -530,37 +530,52 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
ProcessBuilder pb = new ProcessBuilder(cmd);
setEnv(config, pb);
- final Process process = pb.start();
+ Process process = null;
+ try {
+ process = pb.start();
+ runOCRProcess(process, config.getTimeout());
+ } finally {
+ if (process != null) {
+ process.destroyForcibly();
+ }
+ }
+ }
+ private void runOCRProcess(Process process, int timeout) throws IOException, TikaException {
process.getOutputStream().close();
InputStream out = process.getInputStream();
InputStream err = process.getErrorStream();
-
- logStream("OCR MSG", out, input);
- logStream("OCR ERROR", err, input);
-
- FutureTask<Integer> waitTask = new FutureTask<>(new Callable<Integer>() {
- public Integer call() throws Exception {
- return process.waitFor();
- }
- });
-
- Thread waitThread = new Thread(waitTask);
- waitThread.start();
-
+ StringBuilder outBuilder = new StringBuilder();
+ StringBuilder errBuilder = new StringBuilder();
+ Thread outThread = logStream(out, outBuilder);
+ Thread errThread = logStream(err, errBuilder);
+ outThread.start();
+ errThread.start();
+
+ int exitValue = Integer.MIN_VALUE;
try {
- waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
+ boolean finished = process.waitFor(timeout, TimeUnit.SECONDS);
+ if (!finished) {
+ throw new TikaException("TesseractOCRParser timeout");
+ }
+ exitValue = process.exitValue();
} catch (InterruptedException e) {
- waitThread.interrupt();
- process.destroy();
Thread.currentThread().interrupt();
throw new TikaException("TesseractOCRParser interrupted", e);
- } catch (ExecutionException e) {
- // should not be thrown
- } catch (TimeoutException e) {
- waitThread.interrupt();
- process.destroy();
- throw new TikaException("TesseractOCRParser timeout", e);
+ } catch (IllegalThreadStateException e) {
+ //this _should_ never be thrown
+ throw new TikaException("TesseractOCRParser timeout");
+ }
+ if (exitValue > 0) {
+ try {
+ //make sure this thread is actually done
+ errThread.join(1000);
+ } catch (InterruptedException e) {
+ //swallow
+ }
+ throw new TikaException(
+ "TesseractOCRParser bad exit value " + exitValue + " err msg: " +
+ errBuilder.toString());
}
}
@@ -607,24 +622,22 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
* stream of the given process to not block the process. The stream is closed
* once fully processed.
*/
- private void logStream(final String logType, final InputStream stream, final File file) {
- new Thread() {
- public void run() {
- Reader reader = new InputStreamReader(stream, UTF_8);
- StringBuilder out = new StringBuilder();
- char[] buffer = new char[1024];
- try {
- for (int n = reader.read(buffer); n != -1; n = reader.read(buffer))
- out.append(buffer, 0, n);
- } catch (IOException e) {
-
- } finally {
- IOUtils.closeQuietly(stream);
+ private Thread logStream(final InputStream stream, final StringBuilder out) {
+ return new Thread(() -> {
+ Reader reader = new InputStreamReader(stream, UTF_8);
+ char[] buffer = new char[1024];
+ try {
+ for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+ out.append(buffer, 0, n);
}
-
- LOG.debug("{}", out);
+ } catch (IOException e) {
+ //swallow
+ } finally {
+ IOUtils.closeQuietly(stream);
}
- }.start();
+
+ LOG.debug("{}", out);
+ });
}
static String getTesseractProg() {