You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/11/03 17:37:34 UTC
[tika] branch main updated: TIKA-3915 -- extract errors field from siegfried; add test for warnings
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 1a33c7ec2 TIKA-3915 -- extract errors field from siegfried; add test for warnings
1a33c7ec2 is described below
commit 1a33c7ec287f7cbcfde399a74dc1428a16210a2d
Author: tballison <ta...@apache.org>
AuthorDate: Thu Nov 3 13:37:22 2022 -0400
TIKA-3915 -- extract errors field from siegfried; add test for warnings
---
.../tika/detect/siegfried/SiegfriedDetector.java | 19 ++++++++++--
.../detect/siegfried/TestSiegfriedJsonParsing.java | 35 ++++++++++++++++++++++
.../src/test/resources/json/test-basic.json | 2 +-
.../src/test/resources/json/test-errors.json | 1 +
.../src/test/resources/json/test-warnings.json | 1 +
5 files changed, 55 insertions(+), 3 deletions(-)
diff --git a/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java b/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java
index 5a2c11079..fe42a9f3d 100644
--- a/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java
+++ b/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java
@@ -69,6 +69,9 @@ public class SiegfriedDetector implements Detector {
public static Property SIEGFRIED_IDENTIFIERS_DETAILS =
Property.externalTextBag(SIEGFRIED_PREFIX + "identifiers_details");
+ public static Property SIEGFRIED_ERRORS =
+ Property.externalTextBag(SIEGFRIED_PREFIX + "errors");
+
//TODO -- grab errors and warnings
public static String ID = "id";
@@ -78,6 +81,8 @@ public class SiegfriedDetector implements Detector {
public static String WARNING = "warning";
public static String BASIS = "basis";
+ public static String ERRORS = "errors";
+
private static final Logger LOGGER = LoggerFactory.getLogger(SiegfriedDetector.class);
private static final long DEFAULT_TIMEOUT_MS = 6000;
private static final String DEFAULT_SIEGFRIED_PATH = "sf";
@@ -203,8 +208,18 @@ public class SiegfriedDetector implements Detector {
MediaType mt = MediaType.OCTET_STREAM;
if (root.has("files")) {
for (JsonNode file : root.get("files")) {
- //TODO
-/// String errors = file.get("errors").asText("");
+
+ if (file.has(ERRORS)) {
+ JsonNode errors = file.get(ERRORS);
+ if (errors.isTextual()) {
+ metadata.add(SIEGFRIED_ERRORS, file.get(ERRORS).asText());
+ } else if (errors.isArray()) {
+ //is this even possible?!
+ for (JsonNode e : errors) {
+ metadata.add(SIEGFRIED_ERRORS, e.asText());
+ }
+ }
+ }
for (JsonNode match : file.get("matches")) {
String ns = match.has("ns") ? match.get("ns").asText(StringUtils.EMPTY) :
StringUtils.EMPTY;
diff --git a/tika-detectors/tika-detector-siegfried/src/test/java/org/apache/tika/detect/siegfried/TestSiegfriedJsonParsing.java b/tika-detectors/tika-detector-siegfried/src/test/java/org/apache/tika/detect/siegfried/TestSiegfriedJsonParsing.java
index c438d2c87..237eab51f 100644
--- a/tika-detectors/tika-detector-siegfried/src/test/java/org/apache/tika/detect/siegfried/TestSiegfriedJsonParsing.java
+++ b/tika-detectors/tika-detector-siegfried/src/test/java/org/apache/tika/detect/siegfried/TestSiegfriedJsonParsing.java
@@ -17,6 +17,8 @@
package org.apache.tika.detect.siegfried;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
@@ -47,6 +49,39 @@ public class TestSiegfriedJsonParsing extends TikaTest {
}
+ @Test
+ public void testErrors() throws Exception {
+ FileProcessResult fileProcessResult = load("test-errors.json");
+ Metadata metadata = new Metadata();
+ SiegfriedDetector.processResult(fileProcessResult, metadata, false);
+ //debug(metadata);
+ assertEquals("1.9.5", metadata.get(SiegfriedDetector.SIEGFRIED_VERSION));
+ assertEquals("default.sig", metadata.get(SiegfriedDetector.SIEGFRIED_SIGNATURE));
+ assertEquals("x-fmt/111", metadata.get("sf:pronom:id"));
+ assertEquals("extension match txt", metadata.get("sf:pronom:basis"));
+ assertEquals("Plain Text File", metadata.get("sf:pronom:format"));
+ assertEquals("text/plain", metadata.get("sf:pronom:mime"));
+ assertNull(metadata.get("sf:pronom:version"));
+ assertEquals("empty source", metadata.get(SiegfriedDetector.SIEGFRIED_ERRORS));
+ }
+
+ @Test
+ public void testWarnings() throws Exception {
+ FileProcessResult fileProcessResult = load("test-warnings.json");
+ Metadata metadata = new Metadata();
+ SiegfriedDetector.processResult(fileProcessResult, metadata, false);
+ assertEquals("1.9.5", metadata.get(SiegfriedDetector.SIEGFRIED_VERSION));
+ assertEquals("default.sig", metadata.get(SiegfriedDetector.SIEGFRIED_SIGNATURE));
+ assertEquals("UNKNOWN", metadata.get("sf:pronom:id"));
+ assertNull(metadata.get("sf:pronom:basis"));
+ assertNull(metadata.get("sf:pronom:format"));
+ assertNull(metadata.get("sf:pronom:mime"));
+ assertNull(metadata.get("sf:pronom:version"));
+ assertTrue(metadata.get("sf:pronom:warning")
+ .startsWith("no match; possibilities based on extension are fmt/14, fmt/15, fmt/16, " +
+ "fmt/17, fmt/18, fmt/19"));
+ }
+
diff --git a/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-basic.json b/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-basic.json
index 2ad099711..0a7c48aab 100644
--- a/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-basic.json
+++ b/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-basic.json
@@ -3,7 +3,7 @@
"files": [
{
"errors": "",
- "filename": "/home/tallison/data/jfk/oswald/104-10263-10202.pdf",
+ "filename": "something.pdf",
"filesize": 810825,
"matches": [
{
diff --git a/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-errors.json b/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-errors.json
new file mode 100644
index 000000000..49baf8dd6
--- /dev/null
+++ b/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-errors.json
@@ -0,0 +1 @@
+{"siegfried":"1.9.5","scandate":"2022-11-03T13:12:46-04:00","signature":"default.sig","created":"2022-09-12T23:45:48+02:00","identifiers":[{"name":"pronom","details":"DROID_SignatureFile_V108.xml; container-signature-20220905.xml"}],"files":[{"filename":"tst.txt","filesize": 0,"modified":"2022-11-03T13:12:41-04:00","errors": "empty source","matches": [{"ns":"pronom","id":"x-fmt/111","format":"Plain Text File","version":"","mime":"text/plain","basis":"extension match txt","warning":"match [...]
\ No newline at end of file
diff --git a/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-warnings.json b/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-warnings.json
new file mode 100644
index 000000000..b83b03749
--- /dev/null
+++ b/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-warnings.json
@@ -0,0 +1 @@
+{"siegfried":"1.9.5","scandate":"2022-11-03T13:28:19-04:00","signature":"default.sig","created":"2022-09-12T23:45:48+02:00","identifiers":[{"name":"pronom","details":"DROID_SignatureFile_V108.xml; container-signature-20220905.xml"}],"files":[{"filename":"pub1859-1-truncated.pdf","filesize": 159315,"modified":"2022-07-26T11:26:07-04:00","errors": "","matches": [{"ns":"pronom","id":"UNKNOWN","format":"","version":"","mime":"","basis":"","warning":"no match; possibilities based on extension [...]
\ No newline at end of file