You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/04/06 11:34:37 UTC

[tika] 02/02: TIKA-3716 -- add a Parsed-By-Full-Set metadata key to record all parsers that touched a file.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 4fad47072a47d477cbf68ad6400069c88c477c0c
Author: tallison <ta...@apache.org>
AuthorDate: Wed Apr 6 07:34:26 2022 -0400

    TIKA-3716 -- add a Parsed-By-Full-Set metadata key to record all parsers that touched a file.
---
 CHANGES.txt                                        |  3 ++
 .../apache/tika/metadata/TikaCoreProperties.java   |  7 ++++
 .../org/apache/tika/parser/CompositeParser.java    | 39 +++++++++++++++++++++-
 .../java/org/apache/tika/utils/ParserUtils.java    | 15 +++++++--
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  2 +-
 .../tika/parser/microsoft/rtf/RTFParserTest.java   | 14 ++++++++
 .../tika/parser/ocr/TesseractOCRParserTest.java    |  5 +++
 .../tika/server/standard/TikaResourceTest.java     |  5 +++
 8 files changed, 85 insertions(+), 5 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 1d878cad5..bc49d9cd3 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -18,6 +18,9 @@ Release 2.4.0 - ???
    * Add an optional /eval endpoint for tika-eval profile or compare
      capabilities in tika-server (TIKA-3689).
 
+   * Add a Parsed-By-Full-Set metadata item to record all parsers that processed
+     a file (TIKA-3716).
+
    * Upgrade deeplearning4j to 1.0.0-M2 (TIKA-3458 and PR#527).
 
    * Various dependency upgrades, including POI, dl4j, gson, jackson,
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 0681c724c..bf6221e35 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -103,6 +103,13 @@ public interface TikaCoreProperties {
     Property TIKA_META_EXCEPTION_EMBEDDED_STREAM =
             Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_stream_exception");
     Property TIKA_PARSED_BY = Property.internalTextBag(TIKA_META_PREFIX + "Parsed-By");
+
+    /**
+     * Use this to store a record of all parsers that touched a given file
+     * in the container file's metadata.
+     */
+    Property TIKA_PARSED_BY_FULL_SET = Property.internalTextBag(TIKA_META_PREFIX + "Parsed-By-Full-Set");
+
     String RESOURCE_NAME_KEY = "resourceName";
     String PROTECTED = "protected";
     String EMBEDDED_RELATIONSHIP_ID = "embeddedRelationshipId";
diff --git a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
index b64c35cb0..6e4f53ec5 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
@@ -23,6 +23,7 @@ import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -280,11 +281,19 @@ public class CompositeParser extends AbstractParser {
                       ParseContext context) throws IOException, SAXException, TikaException {
         Parser parser = getParser(metadata, context);
         TemporaryResources tmp = new TemporaryResources();
+        ParserRecord parserRecord = context.get(ParserRecord.class);
+        if (parserRecord == null) {
+            parserRecord = new ParserRecord();
+            context.set(ParserRecord.class, parserRecord);
+        }
         try {
             TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
             TaggedContentHandler taggedHandler =
                     handler != null ? new TaggedContentHandler(handler) : null;
-            ParserUtils.recordParserDetails(parser, metadata);
+            String parsreClassName = ParserUtils.getParserClassname(parser);
+            parserRecord.add(parsreClassName);
+            ParserUtils.recordParserDetails(parsreClassName, metadata);
+            parserRecord.beforeParse();
             try {
                 parser.parse(taggedStream, taggedHandler, metadata, context);
             } catch (SecurityException e) {
@@ -304,7 +313,35 @@ public class CompositeParser extends AbstractParser {
             }
         } finally {
             tmp.dispose();
+            parserRecord.afterParse();
+            if (parserRecord.getDepth() == 0) {
+                metadata.set(TikaCoreProperties.TIKA_PARSED_BY_FULL_SET, parserRecord.getParsers());
+            }
         }
     }
 
+    private static class ParserRecord {
+        int depth = 0;
+        Set<String> parsers = new LinkedHashSet<>();
+
+        void beforeParse() {
+            depth++;
+        }
+
+        void afterParse() {
+            depth--;
+        }
+
+        int getDepth() {
+            return depth;
+        }
+
+        String[] getParsers() {
+            return parsers.toArray(new String[0]);
+        }
+
+        void add(String parserClass) {
+            parsers.add(parserClass);
+        }
+    }
 }
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
index c1400fd20..0ad53abd4 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -77,12 +77,21 @@ public class ParserUtils {
      */
     public static void recordParserDetails(Parser parser, Metadata metadata) {
         String className = getParserClassname(parser);
+        recordParserDetails(className, metadata);
+    }
+
+    /**
+     * Records the given parser class name in the {@link Metadata} under
+     * {@link TikaCoreProperties#TIKA_PARSED_BY}, adding it only if that
+     * name has not already been recorded there.
+     */
+    public static void recordParserDetails(String parserClassName, Metadata metadata) {
         String[] parsedBys = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY);
         if (parsedBys == null || parsedBys.length == 0) {
-            metadata.add(TikaCoreProperties.TIKA_PARSED_BY, className);
-        } else if (Arrays.stream(parsedBys).noneMatch(className::equals)) {
+            metadata.add(TikaCoreProperties.TIKA_PARSED_BY, parserClassName);
+        } else if (Arrays.stream(parsedBys).noneMatch(parserClassName::equals)) {
             //only add parser once
-            metadata.add(TikaCoreProperties.TIKA_PARSED_BY, className);
+            metadata.add(TikaCoreProperties.TIKA_PARSED_BY, parserClassName);
         }
     }
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index a20111fb4..48238267e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -234,7 +234,7 @@ public class PDFParserTest extends TikaTest {
         assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("true", metadata.get("pdf:encrypted"));
         //pdf:encrypted, X-Parsed-By and Content-Type
-        assertEquals(3, metadata.names().length, "very little metadata should be parsed");
+        assertEquals(4, metadata.names().length, "very little metadata should be parsed");
         assertEquals(0, handler.toString().length());
     }
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
index 9beb9d1ce..4cfb09bc6 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
@@ -87,6 +87,20 @@ public class RTFParserTest extends TikaTest {
         }
         assertEquals("C:\\Users\\tallison\\AppData\\Local\\Temp\\testJPEG_普林斯顿.jpg",
                 metadataList.get(46).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+        List<String> parsedByFullSet =
+                Arrays.asList(metadataList.get(0).getValues(TikaCoreProperties.TIKA_PARSED_BY_FULL_SET));
+
+        assertContains("org.apache.tika.parser.DefaultParser", parsedByFullSet);
+        assertContains("org.apache.tika.parser.microsoft.rtf.RTFParser", parsedByFullSet);
+        assertContains("org.apache.tika.parser.microsoft.OfficeParser", parsedByFullSet);
+        assertContains("org.apache.tika.parser.microsoft.EMFParser", parsedByFullSet);
+        assertContains("org.apache.tika.parser.microsoft.WMFParser", parsedByFullSet);
+        assertContains("org.apache.tika.parser.csv.TextAndCSVParser", parsedByFullSet);
+        assertContains("org.apache.tika.parser.microsoft.ooxml.OOXMLParser", parsedByFullSet);
+        assertContains("org.apache.tika.parser.pkg.PackageParser", parsedByFullSet);
+        assertContains("org.apache.tika.parser.html.HtmlParser", parsedByFullSet);
+        assertContains("org.apache.tika.parser.image.JpegParser", parsedByFullSet);
+        assertContains("org.apache.tika.parser.pdf.PDFParser", parsedByFullSet);
     }
 
     //TIKA-1010 test regular (not "embedded") images/picts
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 677f612ff..d57d0e9aa 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -20,6 +20,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assumptions.assumeTrue;
 
+import java.util.Arrays;
 import java.util.List;
 
 import org.junit.jupiter.api.Disabled;
@@ -174,6 +175,10 @@ public class TesseractOCRParserTest extends TikaTest {
         //test at least one value
         assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName"));
 
+        //make sure that tesseract is showing up in the full set of "parsed bys"
+        assertContains(TesseractOCRParser.class.getName(),
+                Arrays.asList(metadataList.get(0).getValues(TikaCoreProperties.TIKA_PARSED_BY_FULL_SET)));
+
         return contents.toString();
     }
 
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
index 4c29b3185..a42c80f6f 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
@@ -28,6 +28,7 @@ import java.io.InputStreamReader;
 import java.io.Reader;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 import java.util.Locale;
 import javax.ws.rs.ProcessingException;
@@ -45,6 +46,7 @@ import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
 import org.junit.jupiter.api.Test;
 
+import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -571,6 +573,9 @@ public class TikaResourceTest extends CXFTestBase {
         assertContains("General Congress", metadata.get(TikaCoreProperties.TIKA_CONTENT));
         assertNotFound("<p", metadata.get(TikaCoreProperties.TIKA_CONTENT));
         assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION));
+        //test that embedded parsers are appearing in full set of "parsed bys"
+        TikaTest.assertContains("org.apache.tika.parser.microsoft.EMFParser",
+                Arrays.asList(metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY_FULL_SET)));
     }
 
     @Test