You are viewing a plain-text version of this content. The hyperlink to the canonical version was not preserved in this copy.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/09 13:17:10 UTC

[tika] 01/03: add timeout threshold for fileprofiler

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit faaaca7a6334ae53cf512c3b043cfd701793b00e
Author: tallison <ta...@apache.org>
AuthorDate: Tue Mar 9 08:03:18 2021 -0500

    add timeout threshold for fileprofiler
---
 .../java/org/apache/tika/eval/FileProfiler.java    | 23 ++++++++++++++--------
 .../resources/tika-eval-file-profiler-config.xml   |  1 +
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
index 65908b7..12c3ef4 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
@@ -75,7 +75,7 @@ public class FileProfiler extends AbstractProfiler {
                 .addOption("tablePrefix", true, "EXPERT: optional prefix for table names")
                 .addOption("drop", false, "drop tables if they exist")
                 .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
-
+                .addOption("timeoutThresholdMillis", true, "timeout per file in milliseconds")
         ;
 
     }
@@ -135,11 +135,20 @@ public class FileProfiler extends AbstractProfiler {
         try (InputStream is = fileResource.openInputStream()) {
             try (TikaInputStream tis = TikaInputStream.get(is)) {
                 Path path = tis.getPath();
+                long length = -1;
                 Map<Cols, String> data = new HashMap<>();
+                try {
+                    length = Files.size(path);
+                } catch (IOException e) {
+                    LOG.warn("problem getting size: "+relPath, e);
+                }
+                long start = System.currentTimeMillis();
                 int tikaMimeId = writer.getMimeId(detectTika(tis));
+                long elapsed = System.currentTimeMillis()-start;
+                LOG.debug("took "+elapsed+ " ms for tika detect on length "+length);
                 String fileName = "";
                 String extension = "";
-                long length = -1;
+
                 try {
                     fileName = FilenameUtils.getName(relPath);
                 } catch (IllegalArgumentException e) {
@@ -152,12 +161,6 @@ public class FileProfiler extends AbstractProfiler {
                     LOG.warn("bad extension: "+relPath, e);
                 }
 
-                try {
-                    length = Files.size(path);
-                } catch (IOException e) {
-                    LOG.warn("problem getting size: "+relPath, e);
-                }
-
                 data.put(Cols.FILE_PATH, relPath);
                 data.put(Cols.FILE_NAME, fileName);
                 data.put(Cols.FILE_EXTENSION, extension);
@@ -165,7 +168,11 @@ public class FileProfiler extends AbstractProfiler {
                 data.put(Cols.TIKA_MIME_ID, Integer.toString(tikaMimeId));
                 data.put(Cols.SHA256, DigestUtils.sha256Hex(tis));
                 if (HAS_FILE) {
+                    start = System.currentTimeMillis();
                     int fileMimeId = writer.getMimeId(detectFile(tis));
+                    elapsed = System.currentTimeMillis()-start;
+                    LOG.debug("took "+elapsed+ " ms for file detect on length "+length);
+
                     data.put(Cols.FILE_MIME_ID, Integer.toString(fileMimeId));
                 }
                 writer.writeRow(FILE_PROFILES, data);
diff --git a/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml b/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
index a22523f..c253cbe 100644
--- a/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
@@ -45,6 +45,7 @@
                 description="EXPERT: prefix for table names"/>
         <option opt="drop" hasArg="false" description="drop tables if they exist"/>
         <option opt="maxFilesToAdd" hasArg="true" description="maximum number of files to add to the crawler"/>
+        <option opt="timeoutThresholdMillis" hasArg="true" description="timeout per file in milliseconds"/>
 
     </commandline>