You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/11/28 19:00:13 UTC
[tika] branch main updated: TIKA-3939 -- ensure digesting even with fallback parser (#825)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new d295f8eea TIKA-3939 -- ensure digesting even with fallback parser (#825)
d295f8eea is described below
commit d295f8eea557f0e26588224f2cc74921940664ea
Author: Tim Allison <ta...@apache.org>
AuthorDate: Mon Nov 28 14:00:06 2022 -0500
TIKA-3939 -- ensure digesting even with fallback parser (#825)
---
CHANGES.txt | 3 ++
.../org/apache/tika/parser/AutoDetectParser.java | 21 +++++++++++-
.../tika/parser/AutoDetectParserConfigTest.java | 17 ++++++++++
.../configs/tika-config-digests-pdf-only.xml | 37 ++++++++++++++++++++++
4 files changed, 77 insertions(+), 1 deletion(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index ae888f291..630b3c3f0 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 2.6.1 - ???
+ * Fix bug that prevented digests when the fallback/EmptyParser
+ was called (TIKA-3939).
+
* Remove log4j 1.2.x (and slf4j-log4j12 which now redirects to slf4j-reload4j) from
all modules (TIKA-3935).
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index b8a0cb8aa..12c0e82ae 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -90,8 +90,28 @@ public class AutoDetectParser extends CompositeParser {
public AutoDetectParser(TikaConfig config) {
super(config.getMediaTypeRegistry(), getParser(config));
+ setFallback(buildFallbackParser(config));
setDetector(config.getDetector());
setAutoDetectParserConfig(config.getAutoDetectParserConfig());
+
+ }
+
+ private static Parser buildFallbackParser(TikaConfig config) {
+ Parser fallback = null;
+ Parser p = config.getParser();
+ if (p instanceof DefaultParser) {
+ fallback = ((DefaultParser)p).getFallback();
+ } else {
+ fallback = new EmptyParser();
+ }
+
+ if (config.getAutoDetectParserConfig().getDigesterFactory() == null) {
+ return fallback;
+ } else {
+ return new DigestingParser(fallback,
+ config.getAutoDetectParserConfig().getDigesterFactory().build());
+ }
+
}
private static Parser getParser(TikaConfig config) {
@@ -144,7 +164,6 @@ public class AutoDetectParser extends CompositeParser {
TemporaryResources tmp = new TemporaryResources();
try {
TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
-
//figure out if we should spool to disk
maybeSpool(tis, autoDetectParserConfig, metadata);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
index 96b213a68..2a5dbf2b9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
@@ -121,4 +121,21 @@ public class AutoDetectParserConfigTest extends TikaTest {
assertEquals("90a8b249a6d6b6cb127c59e01cef3aaa",
metadataList.get(6).get("X-TIKA:digest:MD5"));
}
+
+ @Test
+ public void testDigestsEmptyParser() throws Exception {
+ //TIKA-3939 -- ensure that digesting happens even with EmptyParser
+ TikaConfig tikaConfig = null;
+ try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
+ "/configs/tika-config-digests-pdf-only.xml")) {
+ tikaConfig = new TikaConfig(is);
+ }
+ Parser p = new AutoDetectParser(tikaConfig);
+ List<Metadata> metadataList = getRecursiveMetadata("testPDF.pdf", p);
+ assertEquals(1, metadataList.size());
+ assertEquals("4ef0d3bdb12ba603f4caf7d2e2c6112e",
+ metadataList.get(0).get("X-TIKA:digest:MD5"));
+ assertEquals("org.apache.tika.parser.EmptyParser",
+ metadataList.get(0).get("X-TIKA:Parsed-By"));
+ }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.xml
new file mode 100644
index 000000000..8b8b12ac9
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+ </parser>
+ </parsers>
+ <autoDetectParserConfig>
+ <params>
+ <spoolToDisk>1000000</spoolToDisk>
+ <outputThreshold>1000000</outputThreshold>
+ </params>
+ <digesterFactory
+ class="org.apache.tika.parser.digestutils.CommonsDigesterFactory">
+ <params>
+ <markLimit>100000</markLimit>
+ <algorithmString>sha256:32,md5</algorithmString>
+ </params>
+ </digesterFactory>
+ </autoDetectParserConfig>
+</properties>