You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/11/03 17:37:34 UTC

[tika] branch main updated: TIKA-3915 -- extract errors field from siegfried; add test for warnings

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 1a33c7ec2 TIKA-3915 -- extract errors field from siegfried; add test for warnings
1a33c7ec2 is described below

commit 1a33c7ec287f7cbcfde399a74dc1428a16210a2d
Author: tballison <ta...@apache.org>
AuthorDate: Thu Nov 3 13:37:22 2022 -0400

    TIKA-3915 -- extract errors field from siegfried; add test for warnings
---
 .../tika/detect/siegfried/SiegfriedDetector.java   | 19 ++++++++++--
 .../detect/siegfried/TestSiegfriedJsonParsing.java | 35 ++++++++++++++++++++++
 .../src/test/resources/json/test-basic.json        |  2 +-
 .../src/test/resources/json/test-errors.json       |  1 +
 .../src/test/resources/json/test-warnings.json     |  1 +
 5 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java b/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java
index 5a2c11079..fe42a9f3d 100644
--- a/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java
+++ b/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java
@@ -69,6 +69,9 @@ public class SiegfriedDetector implements Detector {
     public static Property SIEGFRIED_IDENTIFIERS_DETAILS =
             Property.externalTextBag(SIEGFRIED_PREFIX + "identifiers_details");
 
+    public static Property SIEGFRIED_ERRORS =
+            Property.externalTextBag(SIEGFRIED_PREFIX + "errors");
+
     //TODO -- grab errors and warnings
 
     public static String ID = "id";
@@ -78,6 +81,8 @@ public class SiegfriedDetector implements Detector {
     public static String WARNING = "warning";
     public static String BASIS = "basis";
 
+    public static String ERRORS = "errors";
+
     private static final Logger LOGGER = LoggerFactory.getLogger(SiegfriedDetector.class);
     private static final long DEFAULT_TIMEOUT_MS = 6000;
     private static final String DEFAULT_SIEGFRIED_PATH = "sf";
@@ -203,8 +208,18 @@ public class SiegfriedDetector implements Detector {
         MediaType mt = MediaType.OCTET_STREAM;
         if (root.has("files")) {
             for (JsonNode file : root.get("files")) {
-                //TODO
-///                String errors = file.get("errors").asText("");
+
+                if (file.has(ERRORS)) {
+                    JsonNode errors = file.get(ERRORS);
+                    if (errors.isTextual()) {
+                        metadata.add(SIEGFRIED_ERRORS, file.get(ERRORS).asText());
+                    } else if (errors.isArray()) {
+                        //is this even possible?!
+                        for (JsonNode e : errors) {
+                            metadata.add(SIEGFRIED_ERRORS, e.asText());
+                        }
+                    }
+                }
                 for (JsonNode match : file.get("matches")) {
                     String ns = match.has("ns") ? match.get("ns").asText(StringUtils.EMPTY) :
                             StringUtils.EMPTY;
diff --git a/tika-detectors/tika-detector-siegfried/src/test/java/org/apache/tika/detect/siegfried/TestSiegfriedJsonParsing.java b/tika-detectors/tika-detector-siegfried/src/test/java/org/apache/tika/detect/siegfried/TestSiegfriedJsonParsing.java
index c438d2c87..237eab51f 100644
--- a/tika-detectors/tika-detector-siegfried/src/test/java/org/apache/tika/detect/siegfried/TestSiegfriedJsonParsing.java
+++ b/tika-detectors/tika-detector-siegfried/src/test/java/org/apache/tika/detect/siegfried/TestSiegfriedJsonParsing.java
@@ -17,6 +17,8 @@
 package org.apache.tika.detect.siegfried;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
@@ -47,6 +49,39 @@ public class TestSiegfriedJsonParsing extends TikaTest {
 
     }
 
+    @Test
+    public void testErrors() throws Exception {
+        FileProcessResult fileProcessResult = load("test-errors.json");
+        Metadata metadata = new Metadata();
+        SiegfriedDetector.processResult(fileProcessResult, metadata, false);
+        //debug(metadata);
+        assertEquals("1.9.5", metadata.get(SiegfriedDetector.SIEGFRIED_VERSION));
+        assertEquals("default.sig", metadata.get(SiegfriedDetector.SIEGFRIED_SIGNATURE));
+        assertEquals("x-fmt/111", metadata.get("sf:pronom:id"));
+        assertEquals("extension match txt", metadata.get("sf:pronom:basis"));
+        assertEquals("Plain Text File", metadata.get("sf:pronom:format"));
+        assertEquals("text/plain", metadata.get("sf:pronom:mime"));
+        assertNull(metadata.get("sf:pronom:version"));
+        assertEquals("empty source", metadata.get(SiegfriedDetector.SIEGFRIED_ERRORS));
+    }
+
+    @Test
+    public void testWarnings() throws Exception {
+        FileProcessResult fileProcessResult = load("test-warnings.json");
+        Metadata metadata = new Metadata();
+        SiegfriedDetector.processResult(fileProcessResult, metadata, false);
+        assertEquals("1.9.5", metadata.get(SiegfriedDetector.SIEGFRIED_VERSION));
+        assertEquals("default.sig", metadata.get(SiegfriedDetector.SIEGFRIED_SIGNATURE));
+        assertEquals("UNKNOWN", metadata.get("sf:pronom:id"));
+        assertNull(metadata.get("sf:pronom:basis"));
+        assertNull(metadata.get("sf:pronom:format"));
+        assertNull(metadata.get("sf:pronom:mime"));
+        assertNull(metadata.get("sf:pronom:version"));
+        assertTrue(metadata.get("sf:pronom:warning")
+                .startsWith("no match; possibilities based on extension are fmt/14, fmt/15, fmt/16, " +
+                        "fmt/17, fmt/18, fmt/19"));
+    }
+
 
 
 
diff --git a/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-basic.json b/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-basic.json
index 2ad099711..0a7c48aab 100644
--- a/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-basic.json
+++ b/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-basic.json
@@ -3,7 +3,7 @@
   "files": [
     {
       "errors": "",
-      "filename": "/home/tallison/data/jfk/oswald/104-10263-10202.pdf",
+      "filename": "something.pdf",
       "filesize": 810825,
       "matches": [
         {
diff --git a/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-errors.json b/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-errors.json
new file mode 100644
index 000000000..49baf8dd6
--- /dev/null
+++ b/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-errors.json
@@ -0,0 +1 @@
+{"siegfried":"1.9.5","scandate":"2022-11-03T13:12:46-04:00","signature":"default.sig","created":"2022-09-12T23:45:48+02:00","identifiers":[{"name":"pronom","details":"DROID_SignatureFile_V108.xml; container-signature-20220905.xml"}],"files":[{"filename":"tst.txt","filesize": 0,"modified":"2022-11-03T13:12:41-04:00","errors": "empty source","matches": [{"ns":"pronom","id":"x-fmt/111","format":"Plain Text File","version":"","mime":"text/plain","basis":"extension match txt","warning":"match [...]
\ No newline at end of file
diff --git a/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-warnings.json b/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-warnings.json
new file mode 100644
index 000000000..b83b03749
--- /dev/null
+++ b/tika-detectors/tika-detector-siegfried/src/test/resources/json/test-warnings.json
@@ -0,0 +1 @@
+{"siegfried":"1.9.5","scandate":"2022-11-03T13:28:19-04:00","signature":"default.sig","created":"2022-09-12T23:45:48+02:00","identifiers":[{"name":"pronom","details":"DROID_SignatureFile_V108.xml; container-signature-20220905.xml"}],"files":[{"filename":"pub1859-1-truncated.pdf","filesize": 159315,"modified":"2022-07-26T11:26:07-04:00","errors": "","matches": [{"ns":"pronom","id":"UNKNOWN","format":"","version":"","mime":"","basis":"","warning":"no match; possibilities based on extension [...]
\ No newline at end of file