You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/06/14 16:09:09 UTC

[tika] branch main updated: TIKA-3792 -- only apply the handler decorator once for legacy xhtml processing of embedded documents

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 0ea65717a TIKA-3792 -- only apply the handler decorator once for legacy xhtml processing of embedded documents
0ea65717a is described below

commit 0ea65717a449c5d248b4f6dd980f783069358fe0
Author: tallison <ta...@apache.org>
AuthorDate: Tue Jun 14 12:08:56 2022 -0400

    TIKA-3792 -- only apply the handler decorator once for legacy xhtml processing of embedded documents
---
 .../org/apache/tika/parser/AutoDetectParser.java   | 21 ++++++++++++--
 .../apache/tika/parser/AutoDetectParserConfig.java | 32 ++++++++++++++++------
 .../apache/tika/parser/RecursiveParserWrapper.java |  2 +-
 .../tika/sax/ContentHandlerDecoratorFactory.java   | 11 ++++++++
 .../tika/parser/AutoDetectParserConfigTest.java    | 32 ++++++++++++++++++++++
 ...=> DoublingContentHandlerDecoratorFactory.java} | 13 ++++++++-
 .../UpcasingContentHandlerDecoratorFactory.java    |  7 +++++
 ...ka-config-doubling-custom-handler-decorator.xml | 29 ++++++++++++++++++++
 8 files changed, 134 insertions(+), 13 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index 054e71ea3..9ae5b8522 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -154,8 +154,7 @@ public class AutoDetectParser extends CompositeParser {
                 }
                 tis.reset();
             }
-            handler = autoDetectParserConfig.getContentHandlerDecoratorFactory()
-                    .decorate(handler, metadata);
+            handler = decorateHandler(handler, metadata, context, autoDetectParserConfig);
             // TIKA-216: Zip bomb prevention
             SecureContentHandler sch =
                     handler != null ?
@@ -176,6 +175,24 @@ public class AutoDetectParser extends CompositeParser {
         }
     }
 
+    private ContentHandler decorateHandler(ContentHandler handler,
+                                           Metadata metadata, ParseContext context,
+                                           AutoDetectParserConfig autoDetectParserConfig) {
+        if (context.get(RecursiveParserWrapper.RecursivelySecureContentHandler.class) != null) {
+            //using the RecursiveParserWrapper; we should decorate this handler
+            return autoDetectParserConfig
+                    .getContentHandlerDecoratorFactory()
+                    .decorate(handler, metadata, context);
+        }
+        ParseRecord parseRecord = context.get(ParseRecord.class);
+        if (parseRecord == null || parseRecord.getDepth() == 0) {
+            return autoDetectParserConfig.getContentHandlerDecoratorFactory()
+                    .decorate(handler, metadata, context);
+        }
+        //else do not decorate
+        return handler;
+    }
+
     private void maybeSpool(TikaInputStream tis, AutoDetectParserConfig autoDetectParserConfig,
                             Metadata metadata) throws IOException {
         if (! tis.hasFile() && //if there's already a file, stop now
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
index 267915b42..8980dbce5 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
@@ -20,11 +20,13 @@ import java.io.IOException;
 import java.io.Serializable;
 
 import org.w3c.dom.Element;
+import org.xml.sax.ContentHandler;
 
 import org.apache.tika.config.ConfigBase;
 import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory;
+import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.writefilter.MetadataWriteFilterFactory;
 import org.apache.tika.sax.ContentHandlerDecoratorFactory;
 
@@ -36,16 +38,26 @@ import org.apache.tika.sax.ContentHandlerDecoratorFactory;
  */
 public class AutoDetectParserConfig extends ConfigBase implements Serializable {
 
-    private static ContentHandlerDecoratorFactory NOOP_CONTENT_HANDLER_DECORATOR_FACTORY
-            = (contentHandler, metadata) -> contentHandler;
+    private static ContentHandlerDecoratorFactory NOOP_CONTENT_HANDLER_DECORATOR_FACTORY =
+            new ContentHandlerDecoratorFactory() {
+                @Override
+                public ContentHandler decorate(ContentHandler contentHandler, Metadata metadata) {
+                    return contentHandler;
+                }
+
+                @Override
+                public ContentHandler decorate(ContentHandler contentHandler, Metadata metadata,
+                                               ParseContext parseContext) {
+                    return contentHandler;
+                }
+            };
 
     public static AutoDetectParserConfig DEFAULT = new AutoDetectParserConfig();
 
     public static AutoDetectParserConfig load(Element element)
             throws TikaConfigException, IOException {
-        return AutoDetectParserConfig.buildSingle(
-                "autoDetectParserConfig", AutoDetectParserConfig.class, element,
-                AutoDetectParserConfig.DEFAULT);
+        return AutoDetectParserConfig.buildSingle("autoDetectParserConfig",
+                AutoDetectParserConfig.class, element, AutoDetectParserConfig.DEFAULT);
     }
 
     /**
@@ -84,7 +96,7 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
             NOOP_CONTENT_HANDLER_DECORATOR_FACTORY;
 
     /**
-     *  Creates a SecureContentHandlerConfig using the passed in parameters.
+     * Creates a SecureContentHandlerConfig using the passed in parameters.
      *
      * @param spoolToDisk
      * @param outputThreshold          SecureContentHandler - character output threshold.
@@ -150,10 +162,11 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
         return this.metadataWriteFilterFactory;
     }
 
-    public void setMetadataWriteFilterFactory(MetadataWriteFilterFactory metadataWriteFilterFactory) {
+    public void setMetadataWriteFilterFactory(
+            MetadataWriteFilterFactory metadataWriteFilterFactory) {
         this.metadataWriteFilterFactory = metadataWriteFilterFactory;
     }
-    
+
     public void setEmbeddedDocumentExtractorFactory(
             EmbeddedDocumentExtractorFactory embeddedDocumentExtractorFactory) {
         this.embeddedDocumentExtractorFactory = embeddedDocumentExtractorFactory;
@@ -163,7 +176,8 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
         return embeddedDocumentExtractorFactory;
     }
 
-    public void setContentHandlerDecoratorFactory(ContentHandlerDecoratorFactory contentHandlerDecoratorFactory) {
+    public void setContentHandlerDecoratorFactory(
+            ContentHandlerDecoratorFactory contentHandlerDecoratorFactory) {
         this.contentHandlerDecoratorFactory = contentHandlerDecoratorFactory;
     }
 
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 59db9b3f9..d660cc175 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -287,7 +287,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
         }
     }
 
-    private static class RecursivelySecureContentHandler extends SecureContentHandler {
+    static class RecursivelySecureContentHandler extends SecureContentHandler {
         private ContentHandler handler;
 
         //total allowable chars across all handlers
diff --git a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java
index 05f7045f3..52a513560 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java
@@ -21,8 +21,19 @@ import java.io.Serializable;
 import org.xml.sax.ContentHandler;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
 
 public interface ContentHandlerDecoratorFactory extends Serializable {
 
+    /**
+     * @deprecated use {@link ContentHandlerDecoratorFactory#decorate(ContentHandler, Metadata, ParseContext)}
+     *   instead. This will be removed in 2.5.0.
+     * @param contentHandler
+     * @param metadata
+     * @return
+     */
+    @Deprecated
     ContentHandler decorate(ContentHandler contentHandler, Metadata metadata);
+    ContentHandler decorate(ContentHandler contentHandler, Metadata metadata,
+                            ParseContext parseContext);
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
index 2e84fa5d4..38c0ea95f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
@@ -64,4 +64,36 @@ public class AutoDetectParserConfigTest extends TikaTest {
         Metadata pdfMetadata2 = metadataList.get(5);
         assertContains("HELLO WORLD", pdfMetadata2.get(TikaCoreProperties.TIKA_CONTENT));
     }
+
+    @Test
+    public void testRecursiveContentHandlerDecoratorFactory() throws Exception {
+        TikaConfig tikaConfig = null;
+        try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
+                "/configs/tika-config-doubling-custom-handler-decorator.xml")) {
+            tikaConfig = new TikaConfig(is);
+        }
+        Parser p = new AutoDetectParser(tikaConfig);
+        List<Metadata> metadataList = getRecursiveMetadata("testPPT_EmbeddedPDF.pptx", p);
+        assertContainsCount("IMAGE2.EMF",
+                metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT), 2);
+        assertContainsCount("15.9.2007 11:02",
+                metadataList.get(4).get(TikaCoreProperties.TIKA_CONTENT), 2);
+        assertContainsCount("HELLO WORLD",
+                metadataList.get(5).get(TikaCoreProperties.TIKA_CONTENT), 4);
+    }
+
+    @Test
+    public void testXMLContentHandlerDecoratorFactory() throws Exception {
+        //test to make sure that the decorator is only applied once for
+        //legacy (i.e. not RecursiveParserWrapperHandler) parsing
+        TikaConfig tikaConfig = null;
+        try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
+                "/configs/tika-config-doubling-custom-handler-decorator.xml")) {
+            tikaConfig = new TikaConfig(is);
+        }
+        Parser p = new AutoDetectParser(tikaConfig);
+        String txt = getXML("testPPT_EmbeddedPDF.pptx", p).xml;
+        assertContainsCount("THE APACHE TIKA PROJECT WAS FORMALLY", txt, 2);
+        assertContainsCount("15.9.2007 11:02", txt, 2);
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/UpcasingContentHandlerDecoratorFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/DoublingContentHandlerDecoratorFactory.java
similarity index 72%
copy from tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/UpcasingContentHandlerDecoratorFactory.java
copy to tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/DoublingContentHandlerDecoratorFactory.java
index feec91007..cc71ca398 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/UpcasingContentHandlerDecoratorFactory.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/DoublingContentHandlerDecoratorFactory.java
@@ -22,15 +22,26 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+
+public class DoublingContentHandlerDecoratorFactory implements ContentHandlerDecoratorFactory {
+    private static final char[] NEWLINE = new char[]{'\n'};
 
-public class UpcasingContentHandlerDecoratorFactory implements ContentHandlerDecoratorFactory {
     @Override
     public ContentHandler decorate(ContentHandler contentHandler, Metadata metadata) {
+        return decorate(contentHandler, metadata, new ParseContext());
+    }
+
+    @Override
+    public ContentHandler decorate(ContentHandler contentHandler, Metadata metadata,
+                                   ParseContext parseContext) {
         return new ContentHandlerDecorator(contentHandler) {
             @Override
             public void characters(char[] ch, int start, int length) throws SAXException {
                 String content = new String(ch, start, length).toUpperCase(Locale.US);
                 contentHandler.characters(content.toCharArray(), start, length);
+                contentHandler.characters(NEWLINE, 0, 1);
+                contentHandler.characters(content.toCharArray(), start, length);
             }
         };
     }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/UpcasingContentHandlerDecoratorFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/UpcasingContentHandlerDecoratorFactory.java
index feec91007..f78aa29e7 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/UpcasingContentHandlerDecoratorFactory.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/UpcasingContentHandlerDecoratorFactory.java
@@ -22,10 +22,17 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
 
 public class UpcasingContentHandlerDecoratorFactory implements ContentHandlerDecoratorFactory {
     @Override
     public ContentHandler decorate(ContentHandler contentHandler, Metadata metadata) {
+        return decorate(contentHandler, metadata, new ParseContext());
+    }
+
+    @Override
+    public ContentHandler decorate(ContentHandler contentHandler, Metadata metadata,
+                                   ParseContext parseContext) {
         return new ContentHandlerDecorator(contentHandler) {
             @Override
             public void characters(char[] ch, int start, int length) throws SAXException {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-doubling-custom-handler-decorator.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-doubling-custom-handler-decorator.xml
new file mode 100644
index 000000000..a15cb86b6
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-doubling-custom-handler-decorator.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser"/>
+  </parsers>
+  <autoDetectParserConfig>
+    <params>
+      <spoolToDisk>123450</spoolToDisk>
+      <outputThreshold>678900</outputThreshold>
+    </params>
+    <contentHandlerDecoratorFactory class="org.apache.tika.sax.DoublingContentHandlerDecoratorFactory"/>
+  </autoDetectParserConfig>
+</properties>