You are viewing a plain-text version of this content. The hyperlink to the canonical version was not preserved in this copy.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/09 13:17:10 UTC
[tika] 01/03: add timeout threshold for fileprofiler
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit faaaca7a6334ae53cf512c3b043cfd701793b00e
Author: tallison <ta...@apache.org>
AuthorDate: Tue Mar 9 08:03:18 2021 -0500
add timeout threshold for fileprofiler
---
.../java/org/apache/tika/eval/FileProfiler.java | 23 ++++++++++++++--------
.../resources/tika-eval-file-profiler-config.xml | 1 +
2 files changed, 16 insertions(+), 8 deletions(-)
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
index 65908b7..12c3ef4 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
@@ -75,7 +75,7 @@ public class FileProfiler extends AbstractProfiler {
.addOption("tablePrefix", true, "EXPERT: optional prefix for table names")
.addOption("drop", false, "drop tables if they exist")
.addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
-
+ .addOption("timeoutThresholdMillis", true, "timeout per file in milliseconds")
;
}
@@ -135,11 +135,20 @@ public class FileProfiler extends AbstractProfiler {
try (InputStream is = fileResource.openInputStream()) {
try (TikaInputStream tis = TikaInputStream.get(is)) {
Path path = tis.getPath();
+ long length = -1;
Map<Cols, String> data = new HashMap<>();
+ try {
+ length = Files.size(path);
+ } catch (IOException e) {
+ LOG.warn("problem getting size: "+relPath, e);
+ }
+ long start = System.currentTimeMillis();
int tikaMimeId = writer.getMimeId(detectTika(tis));
+ long elapsed = System.currentTimeMillis()-start;
+ LOG.debug("took "+elapsed+ " ms for tika detect on length "+length);
String fileName = "";
String extension = "";
- long length = -1;
+
try {
fileName = FilenameUtils.getName(relPath);
} catch (IllegalArgumentException e) {
@@ -152,12 +161,6 @@ public class FileProfiler extends AbstractProfiler {
LOG.warn("bad extension: "+relPath, e);
}
- try {
- length = Files.size(path);
- } catch (IOException e) {
- LOG.warn("problem getting size: "+relPath, e);
- }
-
data.put(Cols.FILE_PATH, relPath);
data.put(Cols.FILE_NAME, fileName);
data.put(Cols.FILE_EXTENSION, extension);
@@ -165,7 +168,11 @@ public class FileProfiler extends AbstractProfiler {
data.put(Cols.TIKA_MIME_ID, Integer.toString(tikaMimeId));
data.put(Cols.SHA256, DigestUtils.sha256Hex(tis));
if (HAS_FILE) {
+ start = System.currentTimeMillis();
int fileMimeId = writer.getMimeId(detectFile(tis));
+ elapsed = System.currentTimeMillis()-start;
+ LOG.debug("took "+elapsed+ " ms for file detect on length "+length);
+
data.put(Cols.FILE_MIME_ID, Integer.toString(fileMimeId));
}
writer.writeRow(FILE_PROFILES, data);
diff --git a/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml b/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
index a22523f..c253cbe 100644
--- a/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
@@ -45,6 +45,7 @@
description="EXPERT: prefix for table names"/>
<option opt="drop" hasArg="false" description="drop tables if they exist"/>
<option opt="maxFilesToAdd" hasArg="true" description="maximum number of files to add to the crawler"/>
+ <option opt="timeoutThresholdMillis" hasArg="true" description="timeout per file in milliseconds"/>
</commandline>