You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/04/20 18:55:22 UTC

[tika] branch main updated: TIKA-3723 -- Allow configurability of the ContentHandlerDecorator used by the AutoDetectParser.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 7232fd4a0 TIKA-3723 -- Allow configurability of the ContentHandlerDecorator used by the AutoDetectParser.
7232fd4a0 is described below

commit 7232fd4a0dc99c5407c319eba86b436ad1d606e1
Author: tallison <ta...@apache.org>
AuthorDate: Wed Apr 20 14:55:06 2022 -0400

    TIKA-3723 -- Allow configurability of the ContentHandlerDecorator used by the AutoDetectParser.
---
 CHANGES.txt                                        | 13 +++--
 .../org/apache/tika/parser/AutoDetectParser.java   |  2 +
 .../apache/tika/parser/AutoDetectParserConfig.java | 27 ++++++++-
 .../tika/sax/ContentHandlerDecoratorFactory.java   | 28 +++++++++
 .../tika/parser/AutoDetectParserConfigTest.java    | 67 ++++++++++++++++++++++
 .../org/apache/tika/parser/pkg/ZipParserTest.java  | 25 --------
 .../UpcasingContentHandlerDecoratorFactory.java    | 37 ++++++++++++
 ...ka-config-upcasing-custom-handler-decorator.xml | 29 ++++++++++
 8 files changed, 197 insertions(+), 31 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 98be3ad79..08a37b892 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -6,21 +6,24 @@ Release 2.4.0 - ???
      https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-ml/tika-dl/pom.xml
      for the dependencies that must be provided at run-time (TIKA-3676).
 
-   * Add MetadataWriteFilter capability to improve memory profile in
-     Metadata objects (TIKA-3695).
-
    * Allow specification of fetcherName and fetchKey via query parameters
      in request URI in tika-server (TIKA-3714).
 
    * Add basic parsers for WARC and WACZ in tika-parsers-standard (TIKA-3697).
 
-   * Add detection for Frictionless Data packages and WACZ (TIKA-3696).
+   * Add MetadataWriteFilter capability to improve memory profile in
+     Metadata objects (TIKA-3695).
 
-   * Add a fetcher and emitter for Azure blob storage (TIKA-3707).
+   * Allow configurability of the ContentHandlerDecorator used
+     by the AutoDetectParser (TIKA-3723).
 
    * Allow configurability of the EmbeddedDocumentExtractor used
      by the AutoDetectParser (TIKA-3711).
 
+   * Add detection for Frictionless Data packages and WACZ (TIKA-3696).
+
+   * Add a fetcher and emitter for Azure blob storage (TIKA-3707).
+
    * Add detection for files encrypted by Microsoft's Rights Management Service
      (TIKA-3666).
 
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index f30809e6d..6d9ded05c 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -166,6 +166,8 @@ public class AutoDetectParser extends CompositeParser {
                 }
                 tis.reset();
             }
+            handler = autoDetectParserConfig.getContentHandlerDecoratorFactory()
+                    .decorate(handler, metadata);
             // TIKA-216: Zip bomb prevention
             SecureContentHandler sch =
                     handler != null ?
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
index d4702a512..267915b42 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
@@ -26,6 +26,7 @@ import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory;
 import org.apache.tika.metadata.writefilter.MetadataWriteFilterFactory;
+import org.apache.tika.sax.ContentHandlerDecoratorFactory;
 
 /**
  * This config object can be used to tune how conservative we want to be
@@ -35,8 +36,10 @@ import org.apache.tika.metadata.writefilter.MetadataWriteFilterFactory;
  */
 public class AutoDetectParserConfig extends ConfigBase implements Serializable {
 
-    public static AutoDetectParserConfig DEFAULT = new AutoDetectParserConfig();
+    private static ContentHandlerDecoratorFactory NOOP_CONTENT_HANDLER_DECORATOR_FACTORY
+            = (contentHandler, metadata) -> contentHandler;
 
+    public static AutoDetectParserConfig DEFAULT = new AutoDetectParserConfig();
 
     public static AutoDetectParserConfig load(Element element)
             throws TikaConfigException, IOException {
@@ -77,6 +80,9 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
     private EmbeddedDocumentExtractorFactory embeddedDocumentExtractorFactory =
             new ParsingEmbeddedDocumentExtractorFactory();
 
+    private ContentHandlerDecoratorFactory contentHandlerDecoratorFactory =
+            NOOP_CONTENT_HANDLER_DECORATOR_FACTORY;
+
     /**
      *  Creates a SecureContentHandlerConfig using the passed in parameters.
      *
@@ -156,5 +162,24 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
     public EmbeddedDocumentExtractorFactory getEmbeddedDocumentExtractorFactory() {
         return embeddedDocumentExtractorFactory;
     }
+
+    public void setContentHandlerDecoratorFactory(ContentHandlerDecoratorFactory contentHandlerDecoratorFactory) {
+        this.contentHandlerDecoratorFactory = contentHandlerDecoratorFactory;
+    }
+
+    public ContentHandlerDecoratorFactory getContentHandlerDecoratorFactory() {
+        return contentHandlerDecoratorFactory;
+    }
+
+    @Override
+    public String toString() {
+        return "AutoDetectParserConfig{" + "spoolToDisk=" + spoolToDisk + ", outputThreshold=" +
+                outputThreshold + ", maximumCompressionRatio=" + maximumCompressionRatio +
+                ", maximumDepth=" + maximumDepth + ", maximumPackageEntryDepth=" +
+                maximumPackageEntryDepth + ", metadataWriteFilterFactory=" +
+                metadataWriteFilterFactory + ", embeddedDocumentExtractorFactory=" +
+                embeddedDocumentExtractorFactory + ", contentHandlerDecoratorFactory=" +
+                contentHandlerDecoratorFactory + '}';
+    }
 }
 
diff --git a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java
new file mode 100644
index 000000000..05f7045f3
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import java.io.Serializable;
+
+import org.xml.sax.ContentHandler;
+
+import org.apache.tika.metadata.Metadata;
+
+public interface ContentHandlerDecoratorFactory extends Serializable {
+
+    ContentHandler decorate(ContentHandler contentHandler, Metadata metadata);
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
new file mode 100644
index 000000000..2e84fa5d4
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.InputStream;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParserTest;
+
+public class AutoDetectParserConfigTest extends TikaTest {
+
+    @Test
+    public void testConfiguringEmbeddedDocExtractor() throws Exception {
+
+        TikaConfig tikaConfig = null;
+        try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
+                "/configs/tika-config-no-names.xml")) {
+            tikaConfig = new TikaConfig(is);
+        }
+        Parser p = new AutoDetectParser(tikaConfig);
+        String xml = getXML("testEmbedded.zip", p).xml;
+        assertNotContained("<h1>image3.jpg</h1>", xml);
+
+        try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
+                "/configs/tika-config-with-names.xml")) {
+            tikaConfig = new TikaConfig(is);
+        }
+        p = new AutoDetectParser(tikaConfig);
+        xml = getXML("testPPT_EmbeddedPDF.pptx", p).xml;
+        assertContains("<h1>image3.jpg</h1>", xml);
+    }
+
+    @Test
+    public void testContentHandlerDecoratorFactory() throws Exception {
+        TikaConfig tikaConfig = null;
+        try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
+                "/configs/tika-config-upcasing-custom-handler-decorator.xml")) {
+            tikaConfig = new TikaConfig(is);
+        }
+        Parser p = new AutoDetectParser(tikaConfig);
+        List<Metadata> metadataList = getRecursiveMetadata("testPPT_EmbeddedPDF.pptx", p);
+        Metadata pdfMetadata1 = metadataList.get(4);
+        assertContains("APACHE TIKA", pdfMetadata1.get(TikaCoreProperties.TIKA_CONTENT));
+        Metadata pdfMetadata2 = metadataList.get(5);
+        assertContains("HELLO WORLD", pdfMetadata2.get(TikaCoreProperties.TIKA_CONTENT));
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
index 8ed307487..9f9f71357 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
@@ -28,14 +28,10 @@ import java.util.Set;
 import org.junit.jupiter.api.Test;
 import org.xml.sax.ContentHandler;
 
-import org.apache.tika.config.TikaConfig;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.microsoft.ooxml.OOXMLParserTest;
 import org.apache.tika.sax.BodyContentHandler;
 
 /**
@@ -94,27 +90,6 @@ public class ZipParserTest extends AbstractPkgTest {
         assertTrue(relIDs.allRelIDs.contains("test2.txt"));
     }
 
-    @Test
-    public void testConfiguringEmbeddedDocExtractor() throws Exception {
-
-        TikaConfig tikaConfig = null;
-        try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
-                "/configs/tika-config-no-names.xml")) {
-            tikaConfig = new TikaConfig(is);
-        }
-        Parser p = new AutoDetectParser(tikaConfig);
-        String xml = getXML("testEmbedded.zip", p).xml;
-        assertNotContained("<h1>image3.jpg</h1>", xml);
-
-        try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
-                "/configs/tika-config-with-names.xml")) {
-            tikaConfig = new TikaConfig(is);
-        }
-        p = new AutoDetectParser(tikaConfig);
-        xml = getXML("testPPT_EmbeddedPDF.pptx", p).xml;
-        assertContains("<h1>image3.jpg</h1>", xml);
-    }
-
     @Test
     public void testZipEncrypted() throws Exception {
         List<Metadata> metadataList = getRecursiveMetadata("testZipEncrypted.zip");
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/UpcasingContentHandlerDecoratorFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/UpcasingContentHandlerDecoratorFactory.java
new file mode 100644
index 000000000..feec91007
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/UpcasingContentHandlerDecoratorFactory.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import java.util.Locale;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.metadata.Metadata;
+
+public class UpcasingContentHandlerDecoratorFactory implements ContentHandlerDecoratorFactory {
+    @Override
+    public ContentHandler decorate(ContentHandler contentHandler, Metadata metadata) {
+        return new ContentHandlerDecorator(contentHandler) {
+            @Override
+            public void characters(char[] ch, int start, int length) throws SAXException {
+                String content = new String(ch, start, length).toUpperCase(Locale.US);
+                contentHandler.characters(content.toCharArray(), start, length);
+            }
+        };
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.xml
new file mode 100644
index 000000000..dabb47ded
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser"/>
+  </parsers>
+  <autoDetectParserConfig>
+    <params>
+      <spoolToDisk>123450</spoolToDisk>
+      <outputThreshold>678900</outputThreshold>
+    </params>
+    <contentHandlerDecoratorFactory class="org.apache.tika.sax.UpcasingContentHandlerDecoratorFactory"/>
+  </autoDetectParserConfig>
+</properties>