You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/11/28 19:00:13 UTC

[tika] branch main updated: TIKA-3939 -- ensure digesting even with fallback parser (#825)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new d295f8eea TIKA-3939 -- ensure digesting even with fallback parser (#825)
d295f8eea is described below

commit d295f8eea557f0e26588224f2cc74921940664ea
Author: Tim Allison <ta...@apache.org>
AuthorDate: Mon Nov 28 14:00:06 2022 -0500

    TIKA-3939 -- ensure digesting even with fallback parser (#825)
---
 CHANGES.txt                                        |  3 ++
 .../org/apache/tika/parser/AutoDetectParser.java   | 21 +++++++++++-
 .../tika/parser/AutoDetectParserConfigTest.java    | 17 ++++++++++
 .../configs/tika-config-digests-pdf-only.xml       | 37 ++++++++++++++++++++++
 4 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index ae888f291..630b3c3f0 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 2.6.1 - ???
 
+   * Fix bug that prevented digests when the fallback/EmptyParser
+     was called (TIKA-3939).
+
    * Remove log4j 1.2.x (and slf4j-log4j12 which now redirects to slf4j-reload4j) from
      all modules (TIKA-3935).
 
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index b8a0cb8aa..12c0e82ae 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -90,8 +90,28 @@ public class AutoDetectParser extends CompositeParser {
 
     public AutoDetectParser(TikaConfig config) {
         super(config.getMediaTypeRegistry(), getParser(config));
+        setFallback(buildFallbackParser(config));
         setDetector(config.getDetector());
         setAutoDetectParserConfig(config.getAutoDetectParserConfig());
+
+    }
+
+    private static Parser buildFallbackParser(TikaConfig config) {
+        Parser fallback = null;
+        Parser p = config.getParser();
+        if (p instanceof DefaultParser) {
+            fallback = ((DefaultParser)p).getFallback();
+        } else {
+            fallback = new EmptyParser();
+        }
+
+        if (config.getAutoDetectParserConfig().getDigesterFactory() == null) {
+            return fallback;
+        } else {
+            return new DigestingParser(fallback,
+                    config.getAutoDetectParserConfig().getDigesterFactory().build());
+        }
+
     }
 
     private static Parser getParser(TikaConfig config) {
@@ -144,7 +164,6 @@ public class AutoDetectParser extends CompositeParser {
         TemporaryResources tmp = new TemporaryResources();
         try {
             TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
-
             //figure out if we should spool to disk
             maybeSpool(tis, autoDetectParserConfig, metadata);
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
index 96b213a68..2a5dbf2b9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
@@ -121,4 +121,21 @@ public class AutoDetectParserConfigTest extends TikaTest {
         assertEquals("90a8b249a6d6b6cb127c59e01cef3aaa",
                 metadataList.get(6).get("X-TIKA:digest:MD5"));
     }
+
+    @Test
+    public void testDigestsEmptyParser() throws Exception {
+        //TIKA-3939 -- ensure that digesting happens even with EmptyParser
+        TikaConfig tikaConfig = null;
+        try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
+                "/configs/tika-config-digests-pdf-only.xml")) {
+            tikaConfig = new TikaConfig(is);
+        }
+        Parser p = new AutoDetectParser(tikaConfig);
+        List<Metadata> metadataList = getRecursiveMetadata("testPDF.pdf", p);
+        assertEquals(1, metadataList.size());
+        assertEquals("4ef0d3bdb12ba603f4caf7d2e2c6112e",
+                metadataList.get(0).get("X-TIKA:digest:MD5"));
+        assertEquals("org.apache.tika.parser.EmptyParser",
+                metadataList.get(0).get("X-TIKA:Parsed-By"));
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.xml
new file mode 100644
index 000000000..8b8b12ac9
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser">
+      <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+    </parser>
+  </parsers>
+  <autoDetectParserConfig>
+    <params>
+      <spoolToDisk>1000000</spoolToDisk>
+      <outputThreshold>1000000</outputThreshold>
+    </params>
+    <digesterFactory
+        class="org.apache.tika.parser.digestutils.CommonsDigesterFactory">
+      <params>
+        <markLimit>100000</markLimit>
+        <algorithmString>sha256:32,md5</algorithmString>
+      </params>
+    </digesterFactory>
+  </autoDetectParserConfig>
+</properties>