You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/06/09 15:37:35 UTC

[tika] 03/04: TIKA-3441 -- improve likelihood that tesseract processes will be shut down on crash.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d7fa2cd284a0d400a1ef29f7111018bb16b1cc5d
Author: tallison <ta...@apache.org>
AuthorDate: Wed Jun 9 11:37:02 2021 -0400

    TIKA-3441 -- improve likelihood that tesseract processes will be shut down on crash.
---
 .../apache/tika/parser/ocr/TesseractOCRParser.java | 91 ++++++++++++----------
 1 file changed, 52 insertions(+), 39 deletions(-)

diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
index b2c4496..fa52248 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java
@@ -530,37 +530,52 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
         
         ProcessBuilder pb = new ProcessBuilder(cmd);
         setEnv(config, pb);
-        final Process process = pb.start();
+        Process process = null;
+        try {
+            process = pb.start();
+            runOCRProcess(process, config.getTimeout());
+        } finally {
+            if (process != null) {
+                process.destroyForcibly();
+            }
+        }
+    }
 
+    private void runOCRProcess(Process process, int timeout) throws IOException, TikaException {
         process.getOutputStream().close();
         InputStream out = process.getInputStream();
         InputStream err = process.getErrorStream();
-
-        logStream("OCR MSG", out, input);
-        logStream("OCR ERROR", err, input);
-
-        FutureTask<Integer> waitTask = new FutureTask<>(new Callable<Integer>() {
-            public Integer call() throws Exception {
-                return process.waitFor();
-            }
-        });
-
-        Thread waitThread = new Thread(waitTask);
-        waitThread.start();
-
+        StringBuilder outBuilder = new StringBuilder();
+        StringBuilder errBuilder = new StringBuilder();
+        Thread outThread = logStream(out, outBuilder);
+        Thread errThread = logStream(err, errBuilder);
+        outThread.start();
+        errThread.start();
+
+        int exitValue = Integer.MIN_VALUE;
         try {
-            waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
+            boolean finished = process.waitFor(timeout, TimeUnit.SECONDS);
+            if (!finished) {
+                throw new TikaException("TesseractOCRParser timeout");
+            }
+            exitValue = process.exitValue();
         } catch (InterruptedException e) {
-            waitThread.interrupt();
-            process.destroy();
             Thread.currentThread().interrupt();
             throw new TikaException("TesseractOCRParser interrupted", e);
-        } catch (ExecutionException e) {
-            // should not be thrown
-        } catch (TimeoutException e) {
-            waitThread.interrupt();
-            process.destroy();
-            throw new TikaException("TesseractOCRParser timeout", e);
+        } catch (IllegalThreadStateException e) {
+            //this _should_ never be thrown
+            throw new TikaException("TesseractOCRParser timeout");
+        }
+        if (exitValue > 0) {
+            try {
+                //make sure this thread is actually done
+                errThread.join(1000);
+            } catch (InterruptedException e) {
+                //swallow
+            }
+            throw new TikaException(
+                    "TesseractOCRParser bad exit value " + exitValue + " err msg: " +
+                            errBuilder.toString());
         }
     }
 
@@ -607,24 +622,22 @@ public class TesseractOCRParser extends AbstractParser implements Initializable
      * stream of the given process to not block the process. The stream is closed
      * once fully processed.
      */
-    private void logStream(final String logType, final InputStream stream, final File file) {
-        new Thread() {
-            public void run() {
-                Reader reader = new InputStreamReader(stream, UTF_8);
-                StringBuilder out = new StringBuilder();
-                char[] buffer = new char[1024];
-                try {
-                    for (int n = reader.read(buffer); n != -1; n = reader.read(buffer))
-                        out.append(buffer, 0, n);
-                } catch (IOException e) {
-
-                } finally {
-                    IOUtils.closeQuietly(stream);
+    private Thread logStream(final InputStream stream, final StringBuilder out) {
+        return new Thread(() -> {
+            Reader reader = new InputStreamReader(stream, UTF_8);
+            char[] buffer = new char[1024];
+            try {
+                for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+                    out.append(buffer, 0, n);
                 }
-
-                LOG.debug("{}", out);
+            } catch (IOException e) {
+                //swallow
+            } finally {
+                IOUtils.closeQuietly(stream);
             }
-        }.start();
+
+            LOG.debug("{}", out);
+        });
     }
 
     static String getTesseractProg() {