You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/02/28 14:26:36 UTC
(tika) 01/01: TIKA-4205 -- fix dependencies in tika-eval-app and add a few more columns to the ExtractProfiler
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4205
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 3d4c94308d5b64ecc7306850b0ec935e615e7c6f
Author: tallison <ta...@apache.org>
AuthorDate: Wed Feb 28 09:26:21 2024 -0500
TIKA-4205 -- fix dependencies in tika-eval-app and add a few more columns to the ExtractProfiler
---
tika-eval/tika-eval-app/pom.xml | 2 --
.../java/org/apache/tika/eval/app/AbstractProfiler.java | 17 ++++++++++++++++-
.../java/org/apache/tika/eval/app/ExtractProfiler.java | 4 ++++
.../src/main/java/org/apache/tika/eval/app/db/Cols.java | 3 +++
4 files changed, 23 insertions(+), 3 deletions(-)
diff --git a/tika-eval/tika-eval-app/pom.xml b/tika-eval/tika-eval-app/pom.xml
index 18671052c..b93783f75 100644
--- a/tika-eval/tika-eval-app/pom.xml
+++ b/tika-eval/tika-eval-app/pom.xml
@@ -93,10 +93,8 @@
</createDependencyReducedPom>
<artifactSet>
<excludes>
- <exclude>org.apache.tika:tika-core:jar:</exclude>
<exclude>org.apache.tika:tika-serialization:jar:</exclude>
<exclude>org.apache.tika:tika-langdetect-opennlp:jar:</exclude>
- <exclude>commons-io:commons-io:jar:</exclude>
<exclude>commons-codec:commons-codec:jar:</exclude>
<exclude>org.apache.commons:commons-lang3:jar:</exclude>
<exclude>org.apache.commons:commons-math3:jar:</exclude>
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java
index 2397bbcab..0cd609d3b 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/AbstractProfiler.java
@@ -71,6 +71,7 @@ import org.apache.tika.eval.core.util.EvalExceptionUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.language.detect.LanguageResult;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PDF;
import org.apache.tika.metadata.PagedText;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -387,6 +388,10 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
if (nPages != null) {
data.put(Cols.NUM_PAGES, Integer.toString(nPages));
}
+ Integer nOCRPages = m.getInt(PDF.OCR_PAGE_COUNT);
+ if (nOCRPages != null) {
+ data.put(Cols.NUM_OCR_PAGES, Integer.toString(nOCRPages));
+ }
//if the outer wrapper document
if (i == 0) {
@@ -395,10 +400,17 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
data.put(Cols.EMBEDDED_DEPTH, "0");
} else {
data.put(Cols.IS_EMBEDDED, TRUE);
- data.put(Cols.FILE_NAME, getFileName(m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)));
+ String embeddedFilePath = m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
+ if (!StringUtils.isBlank(embeddedFilePath)) {
+ data.put(Cols.FILE_NAME, getFileName(embeddedFilePath));
+ data.put(Cols.EMBEDDED_FILE_PATH, embeddedFilePath);
+ }
if (!StringUtils.isBlank(m.get(TikaCoreProperties.EMBEDDED_DEPTH))) {
data.put(Cols.EMBEDDED_DEPTH, m.get(TikaCoreProperties.EMBEDDED_DEPTH));
}
+ if (!StringUtils.isBlank(m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE))) {
+ data.put(Cols.ATTACHMENT_TYPE, m.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ }
}
String ext = FilenameUtils.getExtension(data.get(Cols.FILE_NAME));
ext = (ext == null) ? "" : ext.toLowerCase(Locale.US);
@@ -486,6 +498,8 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
Integer.toString(commonTokenResult.getUniqueAlphabeticTokens()));
data.put(Cols.NUM_ALPHABETIC_TOKENS,
Integer.toString(commonTokenResult.getAlphabeticTokens()));
+ double oov = commonTokenResult.getAlphabeticTokens() > 0 ? commonTokenResult.getOOV() : -1.0;
+ data.put(Cols.OOV, Double.toString(oov));
}
TokenCounts tokenCounts = (TokenCounts) textStats.get(BasicTokenCountStatsCalculator.class);
if (tokenCounts != null) {
@@ -498,6 +512,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
Double.toString((Double) textStats.get(TokenEntropy.class)));
}
+
SummaryStatistics summStats = (SummaryStatistics) textStats.get(TokenLengths.class);
if (summStats != null) {
data.put(Cols.TOKEN_LENGTH_SUM, Integer.toString((int) summStats.getSum()));
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
index ad0ce0bac..4e7d45088 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java
@@ -64,12 +64,15 @@ public class ExtractProfiler extends AbstractProfiler {
new ColInfo(Cols.MD5, Types.CHAR, 32), new ColInfo(Cols.LENGTH, Types.BIGINT),
new ColInfo(Cols.IS_EMBEDDED, Types.BOOLEAN),
new ColInfo(Cols.EMBEDDED_DEPTH, Types.INTEGER),
+ new ColInfo(Cols.EMBEDDED_FILE_PATH, Types.VARCHAR, 1024),
+ new ColInfo(Cols.ATTACHMENT_TYPE, Types.VARCHAR, 32),
new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12),
new ColInfo(Cols.MIME_ID, Types.INTEGER),
new ColInfo(Cols.ELAPSED_TIME_MILLIS, Types.INTEGER),
new ColInfo(Cols.NUM_ATTACHMENTS, Types.INTEGER),
new ColInfo(Cols.NUM_METADATA_VALUES, Types.INTEGER),
new ColInfo(Cols.NUM_PAGES, Types.INTEGER),
+ new ColInfo(Cols.NUM_OCR_PAGES, Types.INTEGER),
new ColInfo(Cols.HAS_CONTENT, Types.BOOLEAN));
public static TableInfo EMBEDDED_FILE_PATH_TABLE =
new TableInfo("emb_file_names", new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
@@ -84,6 +87,7 @@ public class ExtractProfiler extends AbstractProfiler {
new ColInfo(Cols.NUM_COMMON_TOKENS, Types.INTEGER),
new ColInfo(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS, Types.INTEGER),
new ColInfo(Cols.NUM_ALPHABETIC_TOKENS, Types.INTEGER),
+ new ColInfo(Cols.OOV, Types.DOUBLE),
new ColInfo(Cols.TOP_N_TOKENS, Types.VARCHAR, 1024),
new ColInfo(Cols.LANG_ID_1, Types.VARCHAR, 12),
new ColInfo(Cols.LANG_ID_PROB_1, Types.FLOAT),
diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java
index b6f617ce6..35d70b430 100644
--- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java
+++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java
@@ -26,6 +26,7 @@ public enum Cols {
//profile table
ID, LENGTH, FILE_NAME, FILE_EXTENSION, ELAPSED_TIME_MILLIS, NUM_METADATA_VALUES, IS_EMBEDDED,
EMBEDDED_FILE_PATH, MIME_ID, TIKA_MIME_ID, FILE_MIME_ID, SHA256, MD5, NUM_ATTACHMENTS,
+ ATTACHMENT_TYPE,
EMBEDDED_DEPTH,
HAS_CONTENT,
@@ -34,8 +35,10 @@ public enum Cols {
NUM_ALPHABETIC_TOKENS, //alphabetic or ideographic tokens
COMMON_TOKENS_LANG, //which language was used for the common tokens metric?
NUM_UNIQUE_COMMON_TOKENS, NUM_COMMON_TOKENS, TOP_N_TOKENS, LANG_ID_1, LANG_ID_PROB_1, LANG_ID_2,
+ OOV,
LANG_ID_PROB_2, TOKEN_ENTROPY_RATE, TOKEN_LENGTH_SUM, TOKEN_LENGTH_MEAN, TOKEN_LENGTH_STD_DEV,
UNICODE_CHAR_BLOCKS, NUM_PAGES, //number of pages a document alleges it has
+ NUM_OCR_PAGES,
CONTENT_TRUNCATED_AT_MAX_LEN, // was the string truncated at AbstractProfiler.MAX_STRING_LENGTH
//content comparisons