You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/11/17 18:21:52 UTC

[tika] branch main updated: TIKA-3228 -- add file name/extension to FileProfiler and make mime table name distinct.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new d27f148  TIKA-3228 -- add file name/extension to FileProfiler and make mime table name distinct.
d27f148 is described below

commit d27f148ef720559f17bdd16be456de904159ce7e
Author: tallison <ta...@apache.org>
AuthorDate: Tue Nov 17 13:21:17 2020 -0500

    TIKA-3228 -- add file name/extension to FileProfiler and
    make mime table name distinct.
---
 .../org/apache/tika/batch/FileResourceCrawler.java | 18 +++++-----
 .../apache/tika/detect/FileCommandDetector.java    | 14 +++++++-
 .../java/org/apache/tika/eval/FileProfiler.java    | 39 +++++++++++++++++++++-
 .../java/org/apache/tika/eval/TikaEvalCLI.java     |  4 +--
 .../tika/eval/batch/EvalConsumerBuilder.java       |  5 ++-
 .../tika/eval/batch/ExtractComparerBuilder.java    |  5 +++
 .../tika/eval/batch/ExtractProfilerBuilder.java    |  5 +++
 .../tika/eval/batch/FileProfilerBuilder.java       | 10 ++++--
 .../java/org/apache/tika/eval/db/MimeBuffer.java   |  4 +--
 .../resources/tika-eval-file-profiler-config.xml   |  2 +-
 .../main/resources/tika-eval-profiler-config.xml   |  2 +-
 11 files changed, 88 insertions(+), 20 deletions(-)

diff --git a/tika-batch/src/main/java/org/apache/tika/batch/FileResourceCrawler.java b/tika-batch/src/main/java/org/apache/tika/batch/FileResourceCrawler.java
index 7f6057b..3456fc3 100644
--- a/tika-batch/src/main/java/org/apache/tika/batch/FileResourceCrawler.java
+++ b/tika-batch/src/main/java/org/apache/tika/batch/FileResourceCrawler.java
@@ -113,16 +113,18 @@ public abstract class FileResourceCrawler implements Callable<IFileProcessorFutu
 
         boolean isAdded = false;
         if (select(fileResource.getMetadata())) {
-            long totalConsecutiveWait = 0;
-            while (queue.offer(fileResource, 1L, TimeUnit.SECONDS) == false) {
+            long start = System.currentTimeMillis();
+            while (queue.offer(fileResource, PAUSE_INCREMENT_MILLIS, TimeUnit.MILLISECONDS) == false) {
+                long elapsed = System.currentTimeMillis() - start;
+                LOG.info("FileResourceCrawler is pausing. Queue is full: {} after {} ms",
+                        queue.size(), elapsed);
 
-                LOG.info("FileResourceCrawler is pausing. Queue is full: {}", queue.size());
-                Thread.sleep(PAUSE_INCREMENT_MILLIS);
-                totalConsecutiveWait += PAUSE_INCREMENT_MILLIS;
-                if (maxConsecWaitInMillis > -1 && totalConsecutiveWait > maxConsecWaitInMillis) {
+                if (maxConsecWaitInMillis > -1 && elapsed > maxConsecWaitInMillis) {
                     timedOut = true;
-                    LOG.error("Crawler had to wait longer than max consecutive wait time.");
-                    throw new InterruptedException("FileResourceCrawler had to wait longer than max consecutive wait time.");
+                    String msg = "FileResourceCrawler had to wait longer (" +
+                            elapsed + " ms) than allowed ("+maxConsecWaitInMillis+" ms)";
+                    LOG.error(msg);
+                    throw new InterruptedException(msg);
                 }
                 if (Thread.currentThread().isInterrupted()) {
                     LOG.info("FileResourceCrawler shutting down because of interrupted thread.");
diff --git a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
index fd851c7..5d7a203 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
@@ -78,6 +78,13 @@ public class FileCommandDetector implements Detector {
         return ExternalParser.check(commandline);
     }
 
+    /**
+     * Detects the document's media type using the file command.
+     * @param input document input stream, or <code>null</code>
+     * @param metadata input metadata for the document
+     * @return mime as identified by the file command or application/octet-stream otherwise
+     * @throws IOException
+     */
     @Override
     public MediaType detect(InputStream input, Metadata metadata) throws IOException {
         if (hasFileCommand == null) {
@@ -143,7 +150,12 @@ public class FileCommandDetector implements Detector {
             outThread.join();
         } catch (InterruptedException e) {
         }
-        return MediaType.parse(outGobbler.toString().trim());
+        MediaType mt = MediaType.parse(outGobbler.toString().trim());
+        if (mt == null) {
+            return MediaType.OCTET_STREAM;
+        } else {
+            return mt;
+        }
     }
 
     @Field
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
index dcd1751..65908b7 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
@@ -20,6 +20,7 @@ import org.apache.commons.cli.HelpFormatter;
 import org.apache.commons.cli.Option;
 import org.apache.commons.cli.Options;
 import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.commons.io.FilenameUtils;
 import org.apache.tika.Tika;
 import org.apache.tika.batch.FileResource;
 import org.apache.tika.batch.fs.FSProperties;
@@ -30,6 +31,8 @@ import org.apache.tika.eval.db.TableInfo;
 import org.apache.tika.eval.io.IDBWriter;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.io.InputStream;
@@ -53,6 +56,8 @@ public class FileProfiler extends AbstractProfiler {
 //TODO: we should allow users to select digest type/encoding and file detector(s).
 
     private static final boolean HAS_FILE = FileCommandDetector.checkHasFile();
+    private static final Logger LOG = LoggerFactory.getLogger(FileProfiler.class);
+
     static Options OPTIONS;
     static {
 
@@ -90,6 +95,8 @@ public class FileProfiler extends AbstractProfiler {
     public static TableInfo FILE_PROFILES = HAS_FILE ?
             new TableInfo("file_profiles",
                 new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY KEY"),
+                new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 2048),
+                new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 24),
                 new ColInfo(Cols.LENGTH, Types.BIGINT),
                 new ColInfo(Cols.SHA256, Types.VARCHAR, 64),
                 new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER),
@@ -97,11 +104,18 @@ public class FileProfiler extends AbstractProfiler {
             :
             new TableInfo("file_profiles",
                     new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY KEY"),
+                    new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 2048),
+                    new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 24),
                     new ColInfo(Cols.LENGTH, Types.BIGINT),
                     new ColInfo(Cols.SHA256, Types.VARCHAR, 64),
                     new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER));
 
 
+    public static TableInfo FILE_MIME_TABLE = new TableInfo("file_mimes",
+            new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"),
+            new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
+            new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12)
+    );
 
     public static final String DETECT_EXCEPTION = "detect-exception";
     private static final Tika TIKA = new Tika();
@@ -123,9 +137,32 @@ public class FileProfiler extends AbstractProfiler {
                 Path path = tis.getPath();
                 Map<Cols, String> data = new HashMap<>();
                 int tikaMimeId = writer.getMimeId(detectTika(tis));
+                String fileName = "";
+                String extension = "";
+                long length = -1;
+                try {
+                    fileName = FilenameUtils.getName(relPath);
+                } catch (IllegalArgumentException e) {
+                    LOG.warn("bad file name: "+relPath, e);
+                }
+
+                try {
+                    extension = FilenameUtils.getExtension(relPath);
+                } catch (IllegalArgumentException e) {
+                    LOG.warn("bad extension: "+relPath, e);
+                }
+
+                try {
+                    length = Files.size(path);
+                } catch (IOException e) {
+                    LOG.warn("problem getting size: "+relPath, e);
+                }
+
                 data.put(Cols.FILE_PATH, relPath);
+                data.put(Cols.FILE_NAME, fileName);
+                data.put(Cols.FILE_EXTENSION, extension);
+                data.put(Cols.LENGTH, Long.toString(length));
                 data.put(Cols.TIKA_MIME_ID, Integer.toString(tikaMimeId));
-                data.put(Cols.LENGTH, Long.toString(Files.size(path)));
                 data.put(Cols.SHA256, DigestUtils.sha256Hex(tis));
                 if (HAS_FILE) {
                     int fileMimeId = writer.getMimeId(detectFile(tis));
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
index 3910878..d678c2b 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
@@ -16,7 +16,6 @@
  */
 package org.apache.tika.eval;
 
-import java.io.IOException;
 import java.io.InputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -34,7 +33,8 @@ import org.apache.tika.eval.reports.ResultsReporter;
 import org.h2.tools.Console;
 
 public class TikaEvalCLI {
-    static final String[] tools = {"Profile", "Compare", "Report", "StartDB"};
+    static final String[] tools = {"Profile", "FileProfile",
+            "Compare", "Report", "StartDB"};
 
     private static String specifyTools() {
         StringBuilder sb = new StringBuilder();
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
index 6f407f6..9953973 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
@@ -76,7 +76,8 @@ public abstract class EvalConsumerBuilder {
         dbUtil.createTables(getRefTableInfos(), createRefTable);
 
         //step 3. create mime buffer
-        this.mimeBuffer = new MimeBuffer(dbUtil.getConnection(), TikaConfig.getDefaultConfig());
+        this.mimeBuffer = new MimeBuffer(dbUtil.getConnection(), getMimeTable(),
+                TikaConfig.getDefaultConfig());
 
         //step 4. populate the reference tables
         populateRefTables();
@@ -100,6 +101,8 @@ public abstract class EvalConsumerBuilder {
      */
     protected abstract List<TableInfo> getNonRefTableInfos();
 
+    protected abstract TableInfo getMimeTable();
+
     protected abstract void addErrorLogTablePairs(DBConsumersManager manager);
 
     public void populateRefTables() throws IOException, SQLException {
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
index 909032c..0ae893f 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
@@ -143,6 +143,11 @@ public class ExtractComparerBuilder extends EvalConsumerBuilder {
     }
 
     @Override
+    protected TableInfo getMimeTable() {
+        return AbstractProfiler.MIME_TABLE;
+    }
+
+    @Override
     protected void addErrorLogTablePairs(DBConsumersManager manager) {
         Path errorLogA = PropsUtil.getPath(localAttrs.get("errorLogFileA"), null);
         if (errorLogA == null) {
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
index 729460b..20efbf7 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
@@ -107,6 +107,11 @@ public class ExtractProfilerBuilder extends EvalConsumerBuilder {
     }
 
     @Override
+    protected TableInfo getMimeTable() {
+        return AbstractProfiler.MIME_TABLE;
+    }
+
+    @Override
     protected void addErrorLogTablePairs(DBConsumersManager manager) {
         Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), null);
         if (errorLog == null) {
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java
index 0ba7bea..a54dd55 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java
@@ -17,14 +17,12 @@
 package org.apache.tika.eval.batch;
 
 import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.eval.AbstractProfiler;
 import org.apache.tika.eval.ExtractProfiler;
 import org.apache.tika.eval.FileProfiler;
 import org.apache.tika.eval.db.TableInfo;
 import org.apache.tika.util.PropsUtil;
 
 import java.io.IOException;
-import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
 import java.util.ArrayList;
@@ -38,9 +36,10 @@ public class FileProfilerBuilder extends EvalConsumerBuilder {
     public final static String TABLE_PREFIX_KEY = "tablePrefix";
 
     private final List<TableInfo> tableInfos;
+
     public FileProfilerBuilder() {
         List<TableInfo> tableInfos = new ArrayList();
-        tableInfos.add(AbstractProfiler.MIME_TABLE);
+        tableInfos.add(FileProfiler.FILE_MIME_TABLE);
         tableInfos.add(FileProfiler.FILE_PROFILES);
         this.tableInfos = Collections.unmodifiableList(tableInfos);
 
@@ -83,6 +82,11 @@ public class FileProfilerBuilder extends EvalConsumerBuilder {
     }
 
     @Override
+    protected TableInfo getMimeTable() {
+        return FileProfiler.FILE_MIME_TABLE;
+    }
+
+    @Override
     protected void addErrorLogTablePairs(DBConsumersManager manager) {
         Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), null);
         if (errorLog == null) {
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java b/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
index 9f6b136..eba731b 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
@@ -36,8 +36,8 @@ public class MimeBuffer extends AbstractDBBuffer {
     private final Connection connection;
 
 
-    public MimeBuffer(Connection connection, TikaConfig config) throws SQLException {
-        st = connection.prepareStatement("insert into " + AbstractProfiler.MIME_TABLE.getName() + "( " +
+    public MimeBuffer(Connection connection, TableInfo mimeTable, TikaConfig config) throws SQLException {
+        st = connection.prepareStatement("insert into " + mimeTable.getName() + "( " +
                 Cols.MIME_ID.name() + ", " +
                 Cols.MIME_STRING.name() + ", " +
                 Cols.FILE_EXTENSION.name() + ") values (?,?,?)");
diff --git a/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml b/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
index 6a7867a..a22523f 100644
--- a/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
@@ -56,7 +56,7 @@
     <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
 
              crawlOrder="sorted"
-             maxConsecWaitMillis="5000"
+             maxConsecWaitMillis="300000"
              maxFilesToAdd="-1"
              maxFilesToConsider="-1"
              includeFilePat=""
diff --git a/tika-eval/src/main/resources/tika-eval-profiler-config.xml b/tika-eval/src/main/resources/tika-eval-profiler-config.xml
index e5f090f..030bd3f 100644
--- a/tika-eval/src/main/resources/tika-eval-profiler-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-profiler-config.xml
@@ -75,7 +75,7 @@
     <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
 
              crawlOrder="sorted"
-             maxConsecWaitMillis="5000"
+             maxConsecWaitMillis="300000"
              maxFilesToAdd="-1"
              maxFilesToConsider="-1"
              includeFilePat=""