You are viewing a plain text version of this content. The canonical link to the original message was not preserved in this plain text copy.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/11/17 18:21:52 UTC
[tika] branch main updated: TIKA-3228 -- add file name/extension to
FileProfiler and make mime table name distinct.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new d27f148 TIKA-3228 -- add file name/extension to FileProfiler and make mime table name distinct.
d27f148 is described below
commit d27f148ef720559f17bdd16be456de904159ce7e
Author: tallison <ta...@apache.org>
AuthorDate: Tue Nov 17 13:21:17 2020 -0500
TIKA-3228 -- add file name/extension to FileProfiler and
make mime table name distinct.
---
.../org/apache/tika/batch/FileResourceCrawler.java | 18 +++++-----
.../apache/tika/detect/FileCommandDetector.java | 14 +++++++-
.../java/org/apache/tika/eval/FileProfiler.java | 39 +++++++++++++++++++++-
.../java/org/apache/tika/eval/TikaEvalCLI.java | 4 +--
.../tika/eval/batch/EvalConsumerBuilder.java | 5 ++-
.../tika/eval/batch/ExtractComparerBuilder.java | 5 +++
.../tika/eval/batch/ExtractProfilerBuilder.java | 5 +++
.../tika/eval/batch/FileProfilerBuilder.java | 10 ++++--
.../java/org/apache/tika/eval/db/MimeBuffer.java | 4 +--
.../resources/tika-eval-file-profiler-config.xml | 2 +-
.../main/resources/tika-eval-profiler-config.xml | 2 +-
11 files changed, 88 insertions(+), 20 deletions(-)
diff --git a/tika-batch/src/main/java/org/apache/tika/batch/FileResourceCrawler.java b/tika-batch/src/main/java/org/apache/tika/batch/FileResourceCrawler.java
index 7f6057b..3456fc3 100644
--- a/tika-batch/src/main/java/org/apache/tika/batch/FileResourceCrawler.java
+++ b/tika-batch/src/main/java/org/apache/tika/batch/FileResourceCrawler.java
@@ -113,16 +113,18 @@ public abstract class FileResourceCrawler implements Callable<IFileProcessorFutu
boolean isAdded = false;
if (select(fileResource.getMetadata())) {
- long totalConsecutiveWait = 0;
- while (queue.offer(fileResource, 1L, TimeUnit.SECONDS) == false) {
+ long start = System.currentTimeMillis();
+ while (queue.offer(fileResource, PAUSE_INCREMENT_MILLIS, TimeUnit.MILLISECONDS) == false) {
+ long elapsed = System.currentTimeMillis() - start;
+ LOG.info("FileResourceCrawler is pausing. Queue is full: {} after {} ms",
+ queue.size(), elapsed);
- LOG.info("FileResourceCrawler is pausing. Queue is full: {}", queue.size());
- Thread.sleep(PAUSE_INCREMENT_MILLIS);
- totalConsecutiveWait += PAUSE_INCREMENT_MILLIS;
- if (maxConsecWaitInMillis > -1 && totalConsecutiveWait > maxConsecWaitInMillis) {
+ if (maxConsecWaitInMillis > -1 && elapsed > maxConsecWaitInMillis) {
timedOut = true;
- LOG.error("Crawler had to wait longer than max consecutive wait time.");
- throw new InterruptedException("FileResourceCrawler had to wait longer than max consecutive wait time.");
+ String msg = "FileResourceCrawler had to wait longer (" +
+ elapsed + " ms) than allowed ("+maxConsecWaitInMillis+" ms)";
+ LOG.error(msg);
+ throw new InterruptedException(msg);
}
if (Thread.currentThread().isInterrupted()) {
LOG.info("FileResourceCrawler shutting down because of interrupted thread.");
diff --git a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
index fd851c7..5d7a203 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
@@ -78,6 +78,13 @@ public class FileCommandDetector implements Detector {
return ExternalParser.check(commandline);
}
+ /**
+ *
+ * @param input document input stream, or <code>null</code>
+ * @param metadata input metadata for the document
+ * @return mime as identified by the file command or application/octet-stream otherwise
+ * @throws IOException
+ */
@Override
public MediaType detect(InputStream input, Metadata metadata) throws IOException {
if (hasFileCommand == null) {
@@ -143,7 +150,12 @@ public class FileCommandDetector implements Detector {
outThread.join();
} catch (InterruptedException e) {
}
- return MediaType.parse(outGobbler.toString().trim());
+ MediaType mt = MediaType.parse(outGobbler.toString().trim());
+ if (mt == null) {
+ return MediaType.OCTET_STREAM;
+ } else {
+ return mt;
+ }
}
@Field
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
index dcd1751..65908b7 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
@@ -20,6 +20,7 @@ import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.commons.io.FilenameUtils;
import org.apache.tika.Tika;
import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.fs.FSProperties;
@@ -30,6 +31,8 @@ import org.apache.tika.eval.db.TableInfo;
import org.apache.tika.eval.io.IDBWriter;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStream;
@@ -53,6 +56,8 @@ public class FileProfiler extends AbstractProfiler {
//TODO: we should allow users to select digest type/encoding and file detector(s).
private static final boolean HAS_FILE = FileCommandDetector.checkHasFile();
+ private static final Logger LOG = LoggerFactory.getLogger(FileProfiler.class);
+
static Options OPTIONS;
static {
@@ -90,6 +95,8 @@ public class FileProfiler extends AbstractProfiler {
public static TableInfo FILE_PROFILES = HAS_FILE ?
new TableInfo("file_profiles",
new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY KEY"),
+ new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 2048),
+ new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 24),
new ColInfo(Cols.LENGTH, Types.BIGINT),
new ColInfo(Cols.SHA256, Types.VARCHAR, 64),
new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER),
@@ -97,11 +104,18 @@ public class FileProfiler extends AbstractProfiler {
:
new TableInfo("file_profiles",
new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY KEY"),
+ new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 2048),
+ new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 24),
new ColInfo(Cols.LENGTH, Types.BIGINT),
new ColInfo(Cols.SHA256, Types.VARCHAR, 64),
new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER));
+ public static TableInfo FILE_MIME_TABLE = new TableInfo("file_mimes",
+ new ColInfo(Cols.MIME_ID, Types.INTEGER, "PRIMARY KEY"),
+ new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
+ new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12)
+ );
public static final String DETECT_EXCEPTION = "detect-exception";
private static final Tika TIKA = new Tika();
@@ -123,9 +137,32 @@ public class FileProfiler extends AbstractProfiler {
Path path = tis.getPath();
Map<Cols, String> data = new HashMap<>();
int tikaMimeId = writer.getMimeId(detectTika(tis));
+ String fileName = "";
+ String extension = "";
+ long length = -1;
+ try {
+ fileName = FilenameUtils.getName(relPath);
+ } catch (IllegalArgumentException e) {
+ LOG.warn("bad file name: "+relPath, e);
+ }
+
+ try {
+ extension = FilenameUtils.getExtension(relPath);
+ } catch (IllegalArgumentException e) {
+ LOG.warn("bad extension: "+relPath, e);
+ }
+
+ try {
+ length = Files.size(path);
+ } catch (IOException e) {
+ LOG.warn("problem getting size: "+relPath, e);
+ }
+
data.put(Cols.FILE_PATH, relPath);
+ data.put(Cols.FILE_NAME, fileName);
+ data.put(Cols.FILE_EXTENSION, extension);
+ data.put(Cols.LENGTH, Long.toString(length));
data.put(Cols.TIKA_MIME_ID, Integer.toString(tikaMimeId));
- data.put(Cols.LENGTH, Long.toString(Files.size(path)));
data.put(Cols.SHA256, DigestUtils.sha256Hex(tis));
if (HAS_FILE) {
int fileMimeId = writer.getMimeId(detectFile(tis));
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
index 3910878..d678c2b 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
@@ -16,7 +16,6 @@
*/
package org.apache.tika.eval;
-import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -34,7 +33,8 @@ import org.apache.tika.eval.reports.ResultsReporter;
import org.h2.tools.Console;
public class TikaEvalCLI {
- static final String[] tools = {"Profile", "Compare", "Report", "StartDB"};
+ static final String[] tools = {"Profile", "FileProfile",
+ "Compare", "Report", "StartDB"};
private static String specifyTools() {
StringBuilder sb = new StringBuilder();
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
index 6f407f6..9953973 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
@@ -76,7 +76,8 @@ public abstract class EvalConsumerBuilder {
dbUtil.createTables(getRefTableInfos(), createRefTable);
//step 3. create mime buffer
- this.mimeBuffer = new MimeBuffer(dbUtil.getConnection(), TikaConfig.getDefaultConfig());
+ this.mimeBuffer = new MimeBuffer(dbUtil.getConnection(), getMimeTable(),
+ TikaConfig.getDefaultConfig());
//step 4. populate the reference tables
populateRefTables();
@@ -100,6 +101,8 @@ public abstract class EvalConsumerBuilder {
*/
protected abstract List<TableInfo> getNonRefTableInfos();
+ protected abstract TableInfo getMimeTable();
+
protected abstract void addErrorLogTablePairs(DBConsumersManager manager);
public void populateRefTables() throws IOException, SQLException {
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
index 909032c..0ae893f 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractComparerBuilder.java
@@ -143,6 +143,11 @@ public class ExtractComparerBuilder extends EvalConsumerBuilder {
}
@Override
+ protected TableInfo getMimeTable() {
+ return AbstractProfiler.MIME_TABLE;
+ }
+
+ @Override
protected void addErrorLogTablePairs(DBConsumersManager manager) {
Path errorLogA = PropsUtil.getPath(localAttrs.get("errorLogFileA"), null);
if (errorLogA == null) {
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
index 729460b..20efbf7 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/ExtractProfilerBuilder.java
@@ -107,6 +107,11 @@ public class ExtractProfilerBuilder extends EvalConsumerBuilder {
}
@Override
+ protected TableInfo getMimeTable() {
+ return AbstractProfiler.MIME_TABLE;
+ }
+
+ @Override
protected void addErrorLogTablePairs(DBConsumersManager manager) {
Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), null);
if (errorLog == null) {
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java
index 0ba7bea..a54dd55 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java
@@ -17,14 +17,12 @@
package org.apache.tika.eval.batch;
import org.apache.tika.batch.FileResourceConsumer;
-import org.apache.tika.eval.AbstractProfiler;
import org.apache.tika.eval.ExtractProfiler;
import org.apache.tika.eval.FileProfiler;
import org.apache.tika.eval.db.TableInfo;
import org.apache.tika.util.PropsUtil;
import java.io.IOException;
-import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.ArrayList;
@@ -38,9 +36,10 @@ public class FileProfilerBuilder extends EvalConsumerBuilder {
public final static String TABLE_PREFIX_KEY = "tablePrefix";
private final List<TableInfo> tableInfos;
+
public FileProfilerBuilder() {
List<TableInfo> tableInfos = new ArrayList();
- tableInfos.add(AbstractProfiler.MIME_TABLE);
+ tableInfos.add(FileProfiler.FILE_MIME_TABLE);
tableInfos.add(FileProfiler.FILE_PROFILES);
this.tableInfos = Collections.unmodifiableList(tableInfos);
@@ -83,6 +82,11 @@ public class FileProfilerBuilder extends EvalConsumerBuilder {
}
@Override
+ protected TableInfo getMimeTable() {
+ return FileProfiler.FILE_MIME_TABLE;
+ }
+
+ @Override
protected void addErrorLogTablePairs(DBConsumersManager manager) {
Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), null);
if (errorLog == null) {
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java b/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
index 9f6b136..eba731b 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
@@ -36,8 +36,8 @@ public class MimeBuffer extends AbstractDBBuffer {
private final Connection connection;
- public MimeBuffer(Connection connection, TikaConfig config) throws SQLException {
- st = connection.prepareStatement("insert into " + AbstractProfiler.MIME_TABLE.getName() + "( " +
+ public MimeBuffer(Connection connection, TableInfo mimeTable, TikaConfig config) throws SQLException {
+ st = connection.prepareStatement("insert into " + mimeTable.getName() + "( " +
Cols.MIME_ID.name() + ", " +
Cols.MIME_STRING.name() + ", " +
Cols.FILE_EXTENSION.name() + ") values (?,?,?)");
diff --git a/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml b/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
index 6a7867a..a22523f 100644
--- a/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
@@ -56,7 +56,7 @@
<crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
crawlOrder="sorted"
- maxConsecWaitMillis="5000"
+ maxConsecWaitMillis="300000"
maxFilesToAdd="-1"
maxFilesToConsider="-1"
includeFilePat=""
diff --git a/tika-eval/src/main/resources/tika-eval-profiler-config.xml b/tika-eval/src/main/resources/tika-eval-profiler-config.xml
index e5f090f..030bd3f 100644
--- a/tika-eval/src/main/resources/tika-eval-profiler-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-profiler-config.xml
@@ -75,7 +75,7 @@
<crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
crawlOrder="sorted"
- maxConsecWaitMillis="5000"
+ maxConsecWaitMillis="300000"
maxFilesToAdd="-1"
maxFilesToConsider="-1"
includeFilePat=""