You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/04/06 11:34:37 UTC
[tika] 02/02: TIKA-3716 -- add a Parsed-By-Full-Set metadata key to record all parsers that touched a file.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 4fad47072a47d477cbf68ad6400069c88c477c0c
Author: tallison <ta...@apache.org>
AuthorDate: Wed Apr 6 07:34:26 2022 -0400
TIKA-3716 -- add a Parsed-By-Full-Set metadata key to record all parsers that touched a file.
---
CHANGES.txt | 3 ++
.../apache/tika/metadata/TikaCoreProperties.java | 7 ++++
.../org/apache/tika/parser/CompositeParser.java | 39 +++++++++++++++++++++-
.../java/org/apache/tika/utils/ParserUtils.java | 15 +++++++--
.../org/apache/tika/parser/pdf/PDFParserTest.java | 2 +-
.../tika/parser/microsoft/rtf/RTFParserTest.java | 14 ++++++++
.../tika/parser/ocr/TesseractOCRParserTest.java | 5 +++
.../tika/server/standard/TikaResourceTest.java | 5 +++
8 files changed, 85 insertions(+), 5 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 1d878cad5..bc49d9cd3 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -18,6 +18,9 @@ Release 2.4.0 - ???
* Add an optional /eval endpoint for tika-eval profile or compare
capabilities in tika-server (TIKA-3689).
+ * Add a Parsed-By-Full-Set metadata item to record all parsers that processed
+ a file (TIKA-3716).
+
* Upgrade deeplearning4j to 1.0.0-M2 (TIKA-3458 and PR#527).
* Various dependency upgrades, including POI, dl4j, gson, jackson,
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 0681c724c..bf6221e35 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -103,6 +103,13 @@ public interface TikaCoreProperties {
Property TIKA_META_EXCEPTION_EMBEDDED_STREAM =
Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_stream_exception");
Property TIKA_PARSED_BY = Property.internalTextBag(TIKA_META_PREFIX + "Parsed-By");
+
+ /**
+ * Use this to store a record of all parsers that touched a given file
+ * in the container file's metadata.
+ */
+ Property TIKA_PARSED_BY_FULL_SET = Property.internalTextBag(TIKA_META_PREFIX + "Parsed-By-Full-Set");
+
String RESOURCE_NAME_KEY = "resourceName";
String PROTECTED = "protected";
String EMBEDDED_RELATIONSHIP_ID = "embeddedRelationshipId";
diff --git a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
index b64c35cb0..6e4f53ec5 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
@@ -23,6 +23,7 @@ import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
+import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -280,11 +281,19 @@ public class CompositeParser extends AbstractParser {
ParseContext context) throws IOException, SAXException, TikaException {
Parser parser = getParser(metadata, context);
TemporaryResources tmp = new TemporaryResources();
+ ParserRecord parserRecord = context.get(ParserRecord.class);
+ if (parserRecord == null) {
+ parserRecord = new ParserRecord();
+ context.set(ParserRecord.class, parserRecord);
+ }
try {
TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
TaggedContentHandler taggedHandler =
handler != null ? new TaggedContentHandler(handler) : null;
- ParserUtils.recordParserDetails(parser, metadata);
+ String parsreClassName = ParserUtils.getParserClassname(parser);
+ parserRecord.add(parsreClassName);
+ ParserUtils.recordParserDetails(parsreClassName, metadata);
+ parserRecord.beforeParse();
try {
parser.parse(taggedStream, taggedHandler, metadata, context);
} catch (SecurityException e) {
@@ -304,7 +313,35 @@ public class CompositeParser extends AbstractParser {
}
} finally {
tmp.dispose();
+ parserRecord.afterParse();
+ if (parserRecord.getDepth() == 0) {
+ metadata.set(TikaCoreProperties.TIKA_PARSED_BY_FULL_SET, parserRecord.getParsers());
+ }
}
}
+ private static class ParserRecord { // bookkeeping object stored in the ParseContext by parse(): nesting depth plus the parser class names seen
+ int depth = 0; // raised by beforeParse(), lowered by afterParse(); parse() writes the full set when this is back at 0
+ Set<String> parsers = new LinkedHashSet<>(); // parser class names; the LinkedHashSet ignores duplicates and keeps insertion order
+
+ void beforeParse() { // invoked by parse() right before it delegates to the selected parser
+ depth++;
+ }
+
+ void afterParse() { // invoked from the finally block of parse()
+ depth--;
+ }
+
+ int getDepth() { // current nesting level as tracked by beforeParse()/afterParse()
+ return depth;
+ }
+
+ String[] getParsers() { // copies the recorded class names into a new array, in insertion order
+ return parsers.toArray(new String[0]);
+ }
+
+ void add(String parserClass) { // records a parser class name; a name already present is not added again
+ parsers.add(parserClass);
+ }
+ }
}
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
index c1400fd20..0ad53abd4 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -77,12 +77,21 @@ public class ParserUtils {
*/
public static void recordParserDetails(Parser parser, Metadata metadata) {
String className = getParserClassname(parser);
+ recordParserDetails(className, metadata);
+ }
+
+ /**
+ * Records details of the {@link Parser} used to the {@link Metadata},
+ * typically wanted where multiple parsers could be picked between
+ * or used.
+ */
+ public static void recordParserDetails(String parserClassName, Metadata metadata) {
String[] parsedBys = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY);
if (parsedBys == null || parsedBys.length == 0) {
- metadata.add(TikaCoreProperties.TIKA_PARSED_BY, className);
- } else if (Arrays.stream(parsedBys).noneMatch(className::equals)) {
+ metadata.add(TikaCoreProperties.TIKA_PARSED_BY, parserClassName);
+ } else if (Arrays.stream(parsedBys).noneMatch(parserClassName::equals)) {
//only add parser once
- metadata.add(TikaCoreProperties.TIKA_PARSED_BY, className);
+ metadata.add(TikaCoreProperties.TIKA_PARSED_BY, parserClassName);
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index a20111fb4..48238267e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -234,7 +234,7 @@ public class PDFParserTest extends TikaTest {
assertEquals("application/pdf", metadata.get(Metadata.CONTENT_TYPE));
assertEquals("true", metadata.get("pdf:encrypted"));
//pdf:encrypted, X-Parsed-By, Parsed-By-Full-Set and Content-Type
- assertEquals(3, metadata.names().length, "very little metadata should be parsed");
+ assertEquals(4, metadata.names().length, "very little metadata should be parsed");
assertEquals(0, handler.toString().length());
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
index 9beb9d1ce..4cfb09bc6 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java
@@ -87,6 +87,20 @@ public class RTFParserTest extends TikaTest {
}
assertEquals("C:\\Users\\tallison\\AppData\\Local\\Temp\\testJPEG_普林斯顿.jpg",
metadataList.get(46).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
+ List<String> parsedByFullSet =
+ Arrays.asList(metadataList.get(0).getValues(TikaCoreProperties.TIKA_PARSED_BY_FULL_SET));
+
+ assertContains("org.apache.tika.parser.DefaultParser", parsedByFullSet);
+ assertContains("org.apache.tika.parser.microsoft.rtf.RTFParser", parsedByFullSet);
+ assertContains("org.apache.tika.parser.microsoft.OfficeParser", parsedByFullSet);
+ assertContains("org.apache.tika.parser.microsoft.EMFParser", parsedByFullSet);
+ assertContains("org.apache.tika.parser.microsoft.WMFParser", parsedByFullSet);
+ assertContains("org.apache.tika.parser.csv.TextAndCSVParser", parsedByFullSet);
+ assertContains("org.apache.tika.parser.microsoft.ooxml.OOXMLParser", parsedByFullSet);
+ assertContains("org.apache.tika.parser.pkg.PackageParser", parsedByFullSet);
+ assertContains("org.apache.tika.parser.html.HtmlParser", parsedByFullSet);
+ assertContains("org.apache.tika.parser.image.JpegParser", parsedByFullSet);
+ assertContains("org.apache.tika.parser.pdf.PDFParser", parsedByFullSet);
}
//TIKA-1010 test regular (not "embedded") images/picts
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
index 677f612ff..d57d0e9aa 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/ocr/TesseractOCRParserTest.java
@@ -20,6 +20,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assumptions.assumeTrue;
+import java.util.Arrays;
import java.util.List;
import org.junit.jupiter.api.Disabled;
@@ -174,6 +175,10 @@ public class TesseractOCRParserTest extends TikaTest {
//test at least one value
assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName"));
+ //make sure that tesseract is showing up in the full set of "parsed bys"
+ assertContains(TesseractOCRParser.class.getName(),
+ Arrays.asList(metadataList.get(0).getValues(TikaCoreProperties.TIKA_PARSED_BY_FULL_SET)));
+
return contents.toString();
}
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
index 4c29b3185..a42c80f6f 100644
--- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaResourceTest.java
@@ -28,6 +28,7 @@ import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
+import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import javax.ws.rs.ProcessingException;
@@ -45,6 +46,7 @@ import org.apache.cxf.jaxrs.ext.multipart.MultipartBody;
import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
import org.junit.jupiter.api.Test;
+import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -571,6 +573,9 @@ public class TikaResourceTest extends CXFTestBase {
assertContains("General Congress", metadata.get(TikaCoreProperties.TIKA_CONTENT));
assertNotFound("<p", metadata.get(TikaCoreProperties.TIKA_CONTENT));
assertEquals("Microsoft Office Word", metadata.get(OfficeOpenXMLExtended.APPLICATION));
+ //test that embedded parsers are appearing in full set of "parsed bys"
+ TikaTest.assertContains("org.apache.tika.parser.microsoft.EMFParser",
+ Arrays.asList(metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY_FULL_SET)));
}
@Test