You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/10/07 19:15:03 UTC
[tika] 01/02: WIP do not commit
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 3059bed865e29ce5705d082255ed71034d6ca422
Author: tallison <ta...@apache.org>
AuthorDate: Wed Sep 15 09:38:20 2021 -0400
WIP do not commit
---
.../tika/parser/external/ExternalParser.java | 2 +
.../org/apache/tika/utils/FileProcessResult.java | 101 +++++++++++++
.../java/org/apache/tika/utils/ProcessUtils.java | 159 +++++++++++++++++++++
.../java/org/apache/tika/utils/StreamGobbler.java | 81 +++++++++++
.../java/org/apache/tika/utils/StringUtils.java | 18 +++
5 files changed, 361 insertions(+)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
index 32a179a..9a50c6f 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
@@ -458,4 +458,6 @@ public class ExternalParser extends AbstractParser {
*/
void consume(String line);
}
+
+
}
diff --git a/tika-core/src/main/java/org/apache/tika/utils/FileProcessResult.java b/tika-core/src/main/java/org/apache/tika/utils/FileProcessResult.java
new file mode 100644
index 0000000..f15c080
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/utils/FileProcessResult.java
@@ -0,0 +1,101 @@
+package org.apache.tika.utils;
+
+/**
+ * Mutable holder for the outcome of running an external process: the captured
+ * stdout/stderr text, their lengths and truncation flags, the exit value,
+ * the elapsed time and whether the process timed out.
+ * <p>
+ * Until set, the strings are empty, the numeric values are {@code -1} and the
+ * boolean flags are {@code false}.
+ */
+public class FileProcessResult {
+
+    // Fields are package-private so that classes in this package can assign them directly.
+    String stdout = "";
+    String stderr = "";
+    long stdoutLength = -1;
+    long stderrLength = -1;
+    boolean stdoutTruncated = false;
+    boolean stderrTruncated = false;
+    int exitValue = -1;
+    long processTimeMillis = -1;
+    boolean isTimeout = false;
+
+    /** @return the captured stderr text */
+    public String getStderr() {
+        return this.stderr;
+    }
+
+    /** @param stderr the captured stderr text */
+    public void setStderr(String stderr) {
+        this.stderr = stderr;
+    }
+
+    /** @return the captured stdout text */
+    public String getStdout() {
+        return this.stdout;
+    }
+
+    /** @param stdout the captured stdout text */
+    public void setStdout(String stdout) {
+        this.stdout = stdout;
+    }
+
+    /** @return the exit value recorded for the process */
+    public int getExitValue() {
+        return this.exitValue;
+    }
+
+    /** @param exitValue the exit value to record for the process */
+    public void setExitValue(int exitValue) {
+        this.exitValue = exitValue;
+    }
+
+    /** @return the recorded processing time in milliseconds */
+    public long getProcessTimeMillis() {
+        return this.processTimeMillis;
+    }
+
+    /** @param processTimeMillis the processing time in milliseconds */
+    public void setProcessTimeMillis(long processTimeMillis) {
+        this.processTimeMillis = processTimeMillis;
+    }
+
+    /** @return {@code true} if the process was recorded as timed out */
+    public boolean isTimeout() {
+        return this.isTimeout;
+    }
+
+    /** @param timeout whether the process timed out */
+    public void setTimeout(boolean timeout) {
+        this.isTimeout = timeout;
+    }
+
+    /** @return the recorded length of stdout */
+    public long getStdoutLength() {
+        return this.stdoutLength;
+    }
+
+    /** @param stdoutLength the length of stdout */
+    public void setStdoutLength(long stdoutLength) {
+        this.stdoutLength = stdoutLength;
+    }
+
+    /** @return the recorded length of stderr */
+    public long getStderrLength() {
+        return this.stderrLength;
+    }
+
+    /** @param stderrLength the length of stderr */
+    public void setStderrLength(long stderrLength) {
+        this.stderrLength = stderrLength;
+    }
+
+    /** @return {@code true} if the stored stderr text was cut short */
+    public boolean isStderrTruncated() {
+        return this.stderrTruncated;
+    }
+
+    /** @param stderrTruncated whether the stored stderr text was cut short */
+    public void setStderrTruncated(boolean stderrTruncated) {
+        this.stderrTruncated = stderrTruncated;
+    }
+
+    /** @return {@code true} if the stored stdout text was cut short */
+    public boolean isStdoutTruncated() {
+        return this.stdoutTruncated;
+    }
+
+    /** @param stdoutTruncated whether the stored stdout text was cut short */
+    public void setStdoutTruncated(boolean stdoutTruncated) {
+        this.stdoutTruncated = stdoutTruncated;
+    }
+
+    /** Renders every field, in a fixed order, for logging and debugging. */
+    @Override
+    public String toString() {
+        StringBuilder text = new StringBuilder("FileProcessResult{");
+        text.append("stderr='").append(stderr).append('\'');
+        text.append(", stdout='").append(stdout).append('\'');
+        text.append(", exitValue=").append(exitValue);
+        text.append(", processTimeMillis=").append(processTimeMillis);
+        text.append(", isTimeout=").append(isTimeout);
+        text.append(", stdoutLength=").append(stdoutLength);
+        text.append(", stderrLength=").append(stderrLength);
+        text.append(", stderrTruncated=").append(stderrTruncated);
+        text.append(", stdoutTruncated=").append(stdoutTruncated);
+        text.append('}');
+        return text.toString();
+    }
+}
\ No newline at end of file
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java
index daa92ac..fbe1799 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java
@@ -17,8 +17,34 @@
package org.apache.tika.utils;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.UUID;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.TimeUnit;
+
public class ProcessUtils {
+
+    // Processes that have been registered and not yet released, keyed by a random id.
+    private static final ConcurrentHashMap<String, Process> PROCESS_MAP = new ConcurrentHashMap<>();
+
+    static {
+        // When the JVM shuts down, forcibly destroy every process that is still registered.
+        Runtime.getRuntime().addShutdownHook(
+                new Thread(() -> PROCESS_MAP.forEachValue(1, Process::destroyForcibly)));
+    }
+
+    /**
+     * Adds a process to the registry under a newly generated random id.
+     *
+     * @param p the process to track
+     * @return the id to pass to {@link #release(String)} once the process is done
+     */
+    private static String register(Process p) {
+        String id = UUID.randomUUID().toString();
+        PROCESS_MAP.put(id, p);
+        return id;
+    }
+
+    /**
+     * Removes a process from the registry.
+     *
+     * @param id the id returned by {@link #register(Process)}; may be {@code null}
+     *           when the process was never registered (e.g. it failed to start)
+     * @return the process that was registered under {@code id}, or {@code null}
+     *         if there was none
+     */
+    private static Process release(String id) {
+        // ConcurrentHashMap rejects null keys with a NullPointerException; a caller
+        // cleaning up in a finally block may legitimately have no id yet.
+        if (id == null) {
+            return null;
+        }
+        return PROCESS_MAP.remove(id);
+    }
+
/**
* This should correctly put double-quotes around an argument if
* ProcessBuilder doesn't seem to work (as it doesn't
@@ -47,4 +73,137 @@ public class ProcessUtils {
}
return arg;
}
+
+    /**
+     * Starts the process described by {@code pb}, waits for it to finish and
+     * returns its stdout and stderr in a {@link FileProcessResult}.
+     * <p>
+     * If the process does not finish within {@code timeoutMillis} it is forcibly
+     * destroyed and the result is flagged as a timeout with an exit value of
+     * {@code -1}. If the calling thread is interrupted while waiting, the exit
+     * value is {@code -1000} and the thread's interrupt flag is restored before
+     * returning. The process is always destroyed and unregistered on the way out.
+     *
+     * @param pb              the configured process to start
+     * @param timeoutMillis   maximum time to wait for the process, in milliseconds
+     * @param maxStdoutBuffer maximum number of stdout characters to keep
+     * @param maxStdErrBuffer maximum number of stderr characters to keep
+     * @return the captured output, exit value, elapsed time and timeout/truncation flags
+     * @throws IOException if the process cannot be started
+     */
+    public static FileProcessResult execute(ProcessBuilder pb,
+                                            long timeoutMillis,
+                                            int maxStdoutBuffer, int maxStdErrBuffer)
+            throws IOException {
+        Process p = null;
+        String id = null;
+        try {
+            p = pb.start();
+            id = register(p);
+            long elapsed = -1;
+            long start = System.currentTimeMillis();
+            StreamGobbler outGobbler = new StreamGobbler(p.getInputStream(), maxStdoutBuffer);
+            StreamGobbler errGobbler = new StreamGobbler(p.getErrorStream(), maxStdErrBuffer);
+
+            // Drain both streams on their own threads while this thread waits for the process.
+            Thread outThread = new Thread(outGobbler);
+            outThread.start();
+
+            Thread errThread = new Thread(errGobbler);
+            errThread.start();
+            int exitValue = -1;
+            boolean complete = false;
+            try {
+                complete = p.waitFor(timeoutMillis, TimeUnit.MILLISECONDS);
+                elapsed = System.currentTimeMillis() - start;
+                if (complete) {
+                    exitValue = p.exitValue();
+                } else {
+                    p.destroyForcibly();
+                }
+                // Give the gobblers a bounded amount of time to finish reading.
+                outThread.join(1000);
+                errThread.join(1000);
+            } catch (InterruptedException e) {
+                exitValue = -1000;
+                // Restore the flag so callers further up the stack can see the interruption.
+                Thread.currentThread().interrupt();
+            }
+            FileProcessResult result = new FileProcessResult();
+            result.processTimeMillis = elapsed;
+            result.stderrLength = errGobbler.getStreamLength();
+            result.stdoutLength = outGobbler.getStreamLength();
+            result.isTimeout = !complete;
+            result.exitValue = exitValue;
+            result.stdout = StringUtils.joinWith("\n", outGobbler.getLines());
+            result.stderr = StringUtils.joinWith("\n", errGobbler.getLines());
+            result.stdoutTruncated = outGobbler.getIsTruncated();
+            result.stderrTruncated = errGobbler.getIsTruncated();
+            return result;
+        } finally {
+            if (p != null) {
+                p.destroyForcibly();
+            }
+            if (id != null) {
+                release(id);
+            }
+        }
+    }
+
+    /**
+     * Starts the process described by {@code pb} with its stdout redirected to
+     * the file {@code stdoutRedirect}, waits for it to finish and returns stderr
+     * and bookkeeping data in a {@link FileProcessResult}.
+     * <p>
+     * The parent directory of {@code stdoutRedirect} is created if it does not
+     * exist. The returned result's stdout string is always empty; its stdout
+     * length is the size of the redirect file. If the process does not finish
+     * within {@code timeoutMillis} it is forcibly destroyed and the result is
+     * flagged as a timeout with an exit value of {@code -1}. If the calling
+     * thread is interrupted while waiting, the exit value is {@code -1000} and
+     * the thread's interrupt flag is restored before returning.
+     *
+     * @param pb              the configured process to start; its output redirect is overwritten
+     * @param timeoutMillis   maximum time to wait for the process, in milliseconds
+     * @param stdoutRedirect  file that receives the process's stdout
+     * @param maxStdErrBuffer maximum number of stderr characters to keep
+     * @return stderr, exit value, elapsed time, stdout file size and timeout/truncation flags
+     * @throws IOException if the directory cannot be created, the process cannot
+     *                     be started, or the redirect file's size cannot be read
+     */
+    public static FileProcessResult execute(ProcessBuilder pb,
+                                            long timeoutMillis,
+                                            Path stdoutRedirect, int maxStdErrBuffer) throws IOException {
+
+        // getParent() is null for a bare file name; there is nothing to create in that case.
+        Path parent = stdoutRedirect.getParent();
+        if (parent != null && !Files.isDirectory(parent)) {
+            Files.createDirectories(parent);
+        }
+
+        pb.redirectOutput(stdoutRedirect.toFile());
+        Process p = null;
+        String id = null;
+        try {
+            p = pb.start();
+            id = register(p);
+            long elapsed = -1;
+            long start = System.currentTimeMillis();
+            StreamGobbler errGobbler = new StreamGobbler(p.getErrorStream(), maxStdErrBuffer);
+
+            // Drain stderr on its own thread while this thread waits for the process.
+            Thread errThread = new Thread(errGobbler);
+            errThread.start();
+            int exitValue = -1;
+            boolean complete = false;
+            try {
+                complete = p.waitFor(timeoutMillis, TimeUnit.MILLISECONDS);
+                elapsed = System.currentTimeMillis() - start;
+                if (complete) {
+                    exitValue = p.exitValue();
+                } else {
+                    p.destroyForcibly();
+                }
+                // Give the gobbler a bounded amount of time to finish reading.
+                errThread.join(1000);
+            } catch (InterruptedException e) {
+                exitValue = -1000;
+                // Restore the flag so callers further up the stack can see the interruption.
+                Thread.currentThread().interrupt();
+            }
+            FileProcessResult result = new FileProcessResult();
+            result.processTimeMillis = elapsed;
+            result.stderrLength = errGobbler.getStreamLength();
+            result.stdoutLength = Files.size(stdoutRedirect);
+            result.isTimeout = !complete;
+            result.exitValue = exitValue;
+            result.stdout = "";
+            result.stderr = StringUtils.joinWith("\n", errGobbler.getLines());
+            result.stdoutTruncated = false;
+            result.stderrTruncated = errGobbler.getIsTruncated();
+            return result;
+        } finally {
+            if (p != null) {
+                p.destroyForcibly();
+            }
+            // id is still null when pb.start() failed; skip the registry so the
+            // original IOException is not replaced by a NullPointerException.
+            if (id != null) {
+                release(id);
+            }
+        }
+
+    }
+
}
diff --git a/tika-core/src/main/java/org/apache/tika/utils/StreamGobbler.java b/tika-core/src/main/java/org/apache/tika/utils/StreamGobbler.java
new file mode 100644
index 0000000..7fac035
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/utils/StreamGobbler.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.utils;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Drains an {@link InputStream} line by line, decoding it as UTF-8, keeping at
+ * most {@code maxBufferLength} characters of the text while counting the
+ * characters of every line read. Line terminators are not counted or stored.
+ * <p>
+ * Meant to be run on its own thread. The getters may be called from another
+ * thread while {@link #run()} is still in progress; they then reflect what has
+ * been read so far.
+ */
+public class StreamGobbler implements Runnable {
+
+    private final InputStream is;
+    private final int maxBufferLength;
+    // Guards the contents of lines so a reader can snapshot them while run() is appending.
+    private final Object lock = new Object();
+    List<String> lines = new ArrayList<>();
+    // Written only by the thread executing run(); volatile so other threads see current values.
+    volatile long streamLength = 0;
+    volatile boolean isTruncated = false;
+
+    /**
+     * @param is              the stream to drain; it is closed when {@link #run()} ends
+     * @param maxBufferLength maximum number of characters to keep; a negative
+     *                        value keeps nothing and never reports truncation
+     */
+    public StreamGobbler(InputStream is, int maxBufferLength) {
+        this.is = is;
+        this.maxBufferLength = maxBufferLength;
+    }
+
+    /**
+     * Reads the stream to its end, storing lines until the character budget is
+     * used up and counting the length of every line regardless.
+     */
+    @Override
+    public void run() {
+        try (BufferedReader r = new BufferedReader(
+                new InputStreamReader(is, StandardCharsets.UTF_8))) {
+            String line;
+            while ((line = r.readLine()) != null) {
+                if (maxBufferLength >= 0) {
+                    // long arithmetic: streamLength can grow past Integer.MAX_VALUE
+                    long remaining = maxBufferLength - streamLength;
+                    if (line.length() <= remaining) {
+                        store(line);
+                    } else {
+                        // Flag every shortened or dropped line, including one that
+                        // arrives when the buffer is already exactly full.
+                        isTruncated = true;
+                        if (remaining > 0) {
+                            // remaining < line.length() here, so the cast is safe
+                            store(line.substring(0, (int) remaining));
+                        }
+                    }
+                }
+                streamLength += line.length();
+            }
+        } catch (IOException e) {
+            // Best effort: stop reading and keep whatever was collected so far.
+        }
+    }
+
+    // Appends one line to the stored output under the lock.
+    private void store(String text) {
+        synchronized (lock) {
+            lines.add(text);
+        }
+    }
+
+    /**
+     * @return a snapshot copy of the lines stored so far
+     */
+    public List<String> getLines() {
+        synchronized (lock) {
+            return new ArrayList<>(lines);
+        }
+    }
+
+    /**
+     * @return the total number of characters read so far, excluding line terminators
+     */
+    public long getStreamLength() {
+        return streamLength;
+    }
+
+    /**
+     * @return {@code true} if any line was shortened or dropped because the
+     *         character budget was exhausted
+     */
+    public boolean getIsTruncated() {
+        return isTruncated;
+    }
+}
\ No newline at end of file
diff --git a/tika-core/src/main/java/org/apache/tika/utils/StringUtils.java b/tika-core/src/main/java/org/apache/tika/utils/StringUtils.java
index 53fd47c..462cceb 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/StringUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/StringUtils.java
@@ -16,6 +16,8 @@
*/
package org.apache.tika.utils;
+import java.util.List;
+
public class StringUtils {
/**
@@ -203,4 +205,20 @@ public class StringUtils {
return buf.toString();
}
}
+
+
+    /**
+     * Concatenates the given strings in iteration order, placing
+     * {@code delimiter} between consecutive elements.
+     *
+     * @param delimiter text inserted between elements
+     * @param lines     the strings to join
+     * @return {@code EMPTY} if {@code lines} has no elements, otherwise the joined text
+     */
+    public static String joinWith(String delimiter, List<String> lines) {
+        if (lines.isEmpty()) {
+            return EMPTY;
+        }
+        StringBuilder joined = new StringBuilder();
+        boolean needsDelimiter = false;
+        for (String element : lines) {
+            if (needsDelimiter) {
+                joined.append(delimiter);
+            }
+            joined.append(element);
+            // every element after the first is preceded by the delimiter
+            needsDelimiter = true;
+        }
+        return joined.toString();
+    }
}