You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2021/03/09 13:17:09 UTC

[tika] branch branch_1x updated (d786d95 -> 9f059bd)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from d786d95  rollback for 1.26-SNAPSHOT development
     new faaaca7  add timeout threshold for fileprofiler
     new 2475d07  TIKA-3314
     new 9f059bd  TIKA-3315 add xerces to tika-eval

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 tika-eval/pom.xml                                  |  5 +++++
 .../java/org/apache/tika/eval/FileProfiler.java    | 23 ++++++++++++++--------
 .../src/main/resources/lucene-char-mapping.txt     |  3 ++-
 .../resources/tika-eval-file-profiler-config.xml   |  1 +
 4 files changed, 23 insertions(+), 9 deletions(-)


[tika] 01/03: add timeout threshold for fileprofiler

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit faaaca7a6334ae53cf512c3b043cfd701793b00e
Author: tallison <ta...@apache.org>
AuthorDate: Tue Mar 9 08:03:18 2021 -0500

    add timeout threshold for fileprofiler
---
 .../java/org/apache/tika/eval/FileProfiler.java    | 23 ++++++++++++++--------
 .../resources/tika-eval-file-profiler-config.xml   |  1 +
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
index 65908b7..12c3ef4 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
@@ -75,7 +75,7 @@ public class FileProfiler extends AbstractProfiler {
                 .addOption("tablePrefix", true, "EXPERT: optional prefix for table names")
                 .addOption("drop", false, "drop tables if they exist")
                 .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
-
+                .addOption("timeoutThresholdMillis", true, "timeout per file in milliseconds")
         ;
 
     }
@@ -135,11 +135,20 @@ public class FileProfiler extends AbstractProfiler {
         try (InputStream is = fileResource.openInputStream()) {
             try (TikaInputStream tis = TikaInputStream.get(is)) {
                 Path path = tis.getPath();
+                long length = -1;
                 Map<Cols, String> data = new HashMap<>();
+                try {
+                    length = Files.size(path);
+                } catch (IOException e) {
+                    LOG.warn("problem getting size: "+relPath, e);
+                }
+                long start = System.currentTimeMillis();
                 int tikaMimeId = writer.getMimeId(detectTika(tis));
+                long elapsed = System.currentTimeMillis()-start;
+                LOG.debug("took "+elapsed+ " ms for tika detect on length "+length);
                 String fileName = "";
                 String extension = "";
-                long length = -1;
+
                 try {
                     fileName = FilenameUtils.getName(relPath);
                 } catch (IllegalArgumentException e) {
@@ -152,12 +161,6 @@ public class FileProfiler extends AbstractProfiler {
                     LOG.warn("bad extension: "+relPath, e);
                 }
 
-                try {
-                    length = Files.size(path);
-                } catch (IOException e) {
-                    LOG.warn("problem getting size: "+relPath, e);
-                }
-
                 data.put(Cols.FILE_PATH, relPath);
                 data.put(Cols.FILE_NAME, fileName);
                 data.put(Cols.FILE_EXTENSION, extension);
@@ -165,7 +168,11 @@ public class FileProfiler extends AbstractProfiler {
                 data.put(Cols.TIKA_MIME_ID, Integer.toString(tikaMimeId));
                 data.put(Cols.SHA256, DigestUtils.sha256Hex(tis));
                 if (HAS_FILE) {
+                    start = System.currentTimeMillis();
                     int fileMimeId = writer.getMimeId(detectFile(tis));
+                    elapsed = System.currentTimeMillis()-start;
+                    LOG.debug("took "+elapsed+ " ms for file detect on length "+length);
+
                     data.put(Cols.FILE_MIME_ID, Integer.toString(fileMimeId));
                 }
                 writer.writeRow(FILE_PROFILES, data);
diff --git a/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml b/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
index a22523f..c253cbe 100644
--- a/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
@@ -45,6 +45,7 @@
                 description="EXPERT: prefix for table names"/>
         <option opt="drop" hasArg="false" description="drop tables if they exist"/>
         <option opt="maxFilesToAdd" hasArg="true" description="maximum number of files to add to the crawler"/>
+        <option opt="timeoutThresholdMillis" hasArg="true" description="timeout per file in milliseconds"/>
 
     </commandline>
 


[tika] 02/03: TIKA-3314

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 2475d075cf22993aea8f85524830ce922fd5b208
Author: tallison <ta...@apache.org>
AuthorDate: Tue Mar 9 08:04:45 2021 -0500

    TIKA-3314
---
 tika-eval/src/main/resources/lucene-char-mapping.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tika-eval/src/main/resources/lucene-char-mapping.txt b/tika-eval/src/main/resources/lucene-char-mapping.txt
index 9cd7787..b082df6 100644
--- a/tika-eval/src/main/resources/lucene-char-mapping.txt
+++ b/tika-eval/src/main/resources/lucene-char-mapping.txt
@@ -13,4 +13,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 "\u2018" => "'"
-"\u2019" => "'"
\ No newline at end of file
+"\u2019" => "'"
+"\u00AD" => "-"
\ No newline at end of file


[tika] 03/03: TIKA-3315 add xerces to tika-eval

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 9f059bd031da242b019b2db20c5104ce08430f73
Author: tallison <ta...@apache.org>
AuthorDate: Tue Mar 9 08:16:39 2021 -0500

    TIKA-3315 add xerces to tika-eval
---
 tika-eval/pom.xml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
index 110bff3..f73451a 100644
--- a/tika-eval/pom.xml
+++ b/tika-eval/pom.xml
@@ -40,6 +40,11 @@
             <version>${project.version}</version>
         </dependency>
         <dependency>
+            <groupId>xerces</groupId>
+            <artifactId>xercesImpl</artifactId>
+            <version>2.12.1</version>
+        </dependency>
+        <dependency>
             <groupId>${project.groupId}</groupId>
             <artifactId>tika-batch</artifactId>
             <version>${project.version}</version>