You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/04/06 11:34:35 UTC

[tika] branch main updated (fd9cb684c -> 4fad47072)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


    from fd9cb684c update CHANGES.txt to include new /eval endpoint
     new bd157e5b2 TIKA-3675
     new 4fad47072 TIKA-3716 -- add a Parsed-By-Full-Set metadata key to record all parsers that touched a file.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                        |  3 ++
 .../apache/tika/metadata/TikaCoreProperties.java   |  7 ++++
 .../org/apache/tika/parser/CompositeParser.java    | 39 +++++++++++++++++++++-
 .../java/org/apache/tika/utils/ParserUtils.java    | 15 +++++++--
 tika-parent/pom.xml                                |  2 +-
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  2 +-
 .../tika/parser/microsoft/rtf/RTFParserTest.java   | 14 ++++++++
 .../tika/parser/ocr/TesseractOCRParserTest.java    |  5 +++
 .../tika/server/standard/TikaResourceTest.java     |  5 +++
 9 files changed, 86 insertions(+), 6 deletions(-)


[tika] 02/02: TIKA-3716 -- add a Parsed-By-Full-Set metadata key to record all parsers that touched a file.

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 4fad47072a47d477cbf68ad6400069c88c477c0c
Author: tallison <ta...@apache.org>
AuthorDate: Wed Apr 6 07:34:26 2022 -0400

    TIKA-3716 -- add a Parsed-By-Full-Set metadata key to record all parsers that touched a file.
---
 CHANGES.txt                                        |  3 ++
 .../apache/tika/metadata/TikaCoreProperties.java   |  7 ++++
 .../org/apache/tika/parser/CompositeParser.java    | 39 +++++++++++++++++++++-
 .../java/org/apache/tika/utils/ParserUtils.java    | 15 +++++++--
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  2 +-
 .../tika/parser/microsoft/rtf/RTFParserTest.java   | 14 ++++++++
 .../tika/parser/ocr/TesseractOCRParserTest.java    |  5 +++
 .../tika/server/standard/TikaResourceTest.java     |  5 +++
 8 files changed, 85 insertions(+), 5 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 1d878cad5..bc49d9cd3 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -18,6 +18,9 @@ Release 2.4.0 - ???
    * Add an optional /eval endpoint for tika-eval profile or compare
      capabilities in tika-server (TIKA-3689).
 
+   * Add a Parsed-By-Full-Set metadata item to record all parsers that processed
+     a file (TIKA-3716).
+
    * Upgrade deeplearning4j to 1.0.0-M2 (TIKA-3458 and PR#527).
 
    * Various dependency upgrades, including POI, dl4j, gson, jackson,
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 0681c724c..bf6221e35 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -103,6 +103,13 @@ public interface TikaCoreProperties {
     Property TIKA_META_EXCEPTION_EMBEDDED_STREAM =
             Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_stream_exception");
     Property TIKA_PARSED_BY = Property.internalTextBag(TIKA_META_PREFIX + "Parsed-By");
+
+    /**
+     * Use this to store a record of all parsers that touched a given file
+     * in the container file's metadata.
+     */
+    Property TIKA_PARSED_BY_FULL_SET = Property.internalTextBag(TIKA_META_PREFIX + "Parsed-By-Full-Set");
+
     String RESOURCE_NAME_KEY = "resourceName";
     String PROTECTED = "protected";
     String EMBEDDED_RELATIONSHIP_ID = "embeddedRelationshipId";
diff --git a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
index b64c35cb0..6e4f53ec5 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
@@ -23,6 +23,7 @@ import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -280,11 +281,19 @@ public class CompositeParser extends AbstractParser {
                       ParseContext context) throws IOException, SAXException, TikaException {
         Parser parser = getParser(metadata, context);
         TemporaryResources tmp = new TemporaryResources();
+        ParserRecord parserRecord = context.get(ParserRecord.class);
+        if (parserRecord == null) {
+            parserRecord = new ParserRecord();
+            context.set(ParserRecord.class, parserRecord);
+        }
         try {
             TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
             TaggedContentHandler taggedHandler =
                     handler != null ? new TaggedContentHandler(handler) : null;
-            ParserUtils.recordParserDetails(parser, metadata);
+            String parsreClassName = ParserUtils.getParserClassname(parser);
+            parserRecord.add(parsreClassName);
+            ParserUtils.recordParserDetails(parsreClassName, metadata);
+            parserRecord.beforeParse();
             try {
                 parser.parse(taggedStream, taggedHandler, metadata, context);
             } catch (SecurityException e) {
@@ -304,7 +313,35 @@ public class CompositeParser extends AbstractParser {
             }
         } finally {
             tmp.dispose();
+            parserRecord.afterParse();
+            if (parserRecord.getDepth() == 0) {
+                metadata.set(TikaCoreProperties.TIKA_PARSED_BY_FULL_SET, parserRecord.getParsers());
+            }
         }
     }
 
+    private static class ParserRecord {
+        int depth = 0;
+        Set<String> parsers = new LinkedHashSet<>();
+
+        void beforeParse() {
+            depth++;
+        }
+
+        void afterParse() {
+            depth--;
+        }
+
+        int getDepth() {
+            return depth;
+        }
+
+        String[] getParsers() {
+            return parsers.toArray(new String[0]);
+        }
+
+        void add(String parserClass) {
+            parsers.add(parserClass);
+        }
+    }
 }
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
index c1400fd20..0ad53abd4 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -77,12 +77,21 @@ public class ParserUtils {
      */
     public static void recordParserDetails(Parser parser, Metadata metadata) {
         String className = getParserClassname(parser);
+        recordParserDetails(className, metadata);
+    }
+
+    /**
+     * Records details of the {@link Parser} used to the {@link Metadata},
+     * typically wanted where multiple parsers could be picked between
+     * or used.
+     */
+    public static void recordParserDetails(String parserClassName, Metadata metadata) {
         String[] parsedBys = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY);
         if (parsedBys == null || parsedBys.length == 0) {
-            metadata.add(TikaCoreProperties.TIKA_PARSED_BY, className);
-        } else if (Arrays.stream(parsedBys).noneMatch(className::equals)) {
+            metadata.add(TikaCoreProperties.TIKA_PARSED_BY, parserClassName);
+        } else if (Arrays.stream(parsedBys).noneMatch(parserClassName::equals)) {
             //only add parser once
-            metadata.add(TikaCoreProperties.TIKA_PARSED_BY, className);
+            metadata.add(TikaCoreProperties.TIKA_PARSED_BY, parserClassName);
         }
     }
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index a20111fb4..48238267e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -234,7 +234,7 @@ public class PDFParserTest extends TikaTest {
         assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
         assertEquals("true", metadata.get("pdf:encrypted"));
         //pdf:encrypted, X-Parsed-By and Content-Type
-        assertEquals(3, metadata.names().length, "very little metadata should be parsed");
+        assertEquals(4, metadata.names().length, "very little metadata should be parsed");
         assertEquals(0, handler.toString().length());
     }
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
index 9beb9d1ce..4cfb09bc6 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
@@ -87,6 +87,20 @@ public class RTFParserTest extends TikaTest {
         }
         assertEquals("C:\\Users\\tallison\\AppData\\Local\\Temp\\testJPEG_普林斯顿.jpg",
                 metadataList.get(46).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+        List<String> parsedByFullSet =
+                Arrays.asList(metadataList.get(0).getValues(TikaCoreProperties.TIKA_PARSED_BY_FULL_SET));
+
+        assertContains("org.apache.tika.parser.DefaultParser", parsedByFullSet);
+        assertContains("org.apache.tika.parser.microsoft.rtf.RTFParser", parsedByFullSet);
+        assertContains("org.apache.tika.parser.microsoft.OfficeParser", parsedByFullSet);
+        assertContains("org.apache.tika.parser.microsoft.EMFParser", parsedByFullSet);
+        assertContains("org.apache.tika.parser.microsoft.WMFParser", parsedByFullSet);
+        assertContains("org.apache.tika.parser.csv.TextAndCSVParser", parsedByFullSet);
+        assertContains("org.apache.tika.parser.microsoft.ooxml.OOXMLParser", parsedByFullSet);
+        assertContains("org.apache.tika.parser.pkg.PackageParser", parsedByFullSet);
+        assertContains("org.apache.tika.parser.html.HtmlParser", parsedByFullSet);
+        assertContains("org.apache.tika.parser.image.JpegParser", parsedByFullSet);
+        assertContains("org.apache.tika.parser.pdf.PDFParser", parsedByFullSet);
     }
 
     //TIKA-1010 test regular (not "embedded") images/picts
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 677f612ff..d57d0e9aa 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -20,6 +20,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assumptions.assumeTrue;
 
+import java.util.Arrays;
 import java.util.List;
 
 import org.junit.jupiter.api.Disabled;
@@ -174,6 +175,10 @@ public class TesseractOCRParserTest extends TikaTest {
         //test at least one value
         assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName"));
 
+        //make sure that tesseract is showing up in the full set of "parsed bys"
+        assertContains(TesseractOCRParser.class.getName(),
+                Arrays.asList(metadataList.get(0).getValues(TikaCoreProperties.TIKA_PARSED_BY_FULL_SET)));
+
         return contents.toString();
     }
 
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
index 4c29b3185..a42c80f6f 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
@@ -28,6 +28,7 @@ import java.io.InputStreamReader;
 import java.io.Reader;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 import java.util.Locale;
 import javax.ws.rs.ProcessingException;
@@ -45,6 +46,7 @@ import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
 import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
 import org.junit.jupiter.api.Test;
 
+import org.apache.tika.TikaTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.TikaCoreProperties;
@@ -571,6 +573,9 @@ public class TikaResourceTest extends CXFTestBase {
         assertContains("General Congress", metadata.get(TikaCoreProperties.TIKA_CONTENT));
         assertNotFound("<p", metadata.get(TikaCoreProperties.TIKA_CONTENT));
         assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION));
+        //test that embedded parsers are appearing in full set of "parsed bys"
+        TikaTest.assertContains("org.apache.tika.parser.microsoft.EMFParser",
+                Arrays.asList(metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY_FULL_SET)));
     }
 
     @Test


[tika] 01/02: TIKA-3675

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit bd157e5b2ae7ed9079721f6c89ad3eb9be9867d5
Author: tallison <ta...@apache.org>
AuthorDate: Wed Apr 6 07:30:39 2022 -0400

    TIKA-3675
---
 tika-parent/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 99b9d5d49..ae092c535 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -370,7 +370,7 @@
     <!-- we'll need to stay on 1.7 until we're java modularized ? -->
     <slf4j.version>1.7.36</slf4j.version>
     <solrj.version>8.11.1</solrj.version>
-    <spring.version>5.3.17</spring.version>
+    <spring.version>5.3.18</spring.version>
     <sqlite.version>3.36.0.3</sqlite.version>
     <tagsoup.version>1.2.1</tagsoup.version>
     <test.containers.version>1.16.3</test.containers.version>