You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/04/20 18:55:22 UTC
[tika] branch main updated: TIKA-3723 -- Allow configurability of the ContentHandlerDecorator used by the AutoDetectParser.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 7232fd4a0 TIKA-3723 -- Allow configurability of the ContentHandlerDecorator used by the AutoDetectParser.
7232fd4a0 is described below
commit 7232fd4a0dc99c5407c319eba86b436ad1d606e1
Author: tallison <ta...@apache.org>
AuthorDate: Wed Apr 20 14:55:06 2022 -0400
TIKA-3723 -- Allow configurability of the ContentHandlerDecorator used by the AutoDetectParser.
---
CHANGES.txt | 13 +++--
.../org/apache/tika/parser/AutoDetectParser.java | 2 +
.../apache/tika/parser/AutoDetectParserConfig.java | 27 ++++++++-
.../tika/sax/ContentHandlerDecoratorFactory.java | 28 +++++++++
.../tika/parser/AutoDetectParserConfigTest.java | 67 ++++++++++++++++++++++
.../org/apache/tika/parser/pkg/ZipParserTest.java | 25 --------
.../UpcasingContentHandlerDecoratorFactory.java | 37 ++++++++++++
...ka-config-upcasing-custom-handler-decorator.xml | 29 ++++++++++
8 files changed, 197 insertions(+), 31 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 98be3ad79..08a37b892 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -6,21 +6,24 @@ Release 2.4.0 - ???
https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-ml/tika-dl/pom.xml
for the dependencies that must be provided at run-time (TIKA-3676).
- * Add MetadataWriteFilter capability to improve memory profile in
- Metadata objects (TIKA-3695).
-
* Allow specification of fetcherName and fetchKey via query parameters
in request URI in tika-server (TIKA-3714).
* Add basic parsers for WARC and WACZ in tika-parsers-standard (TIKA-3697).
- * Add detection for Frictionless Data packages and WACZ (TIKA-3696).
+ * Add MetadataWriteFilter capability to improve memory profile in
+ Metadata objects (TIKA-3695).
- * Add a fetcher and emitter for Azure blob storage (TIKA-3707).
+ * Allow configurability of the ContentHandlerDecorator used
+ by the AutoDetectParser (TIKA-3723).
* Allow configurability of the EmbeddedDocumentExtractor used
by the AutoDetectParser (TIKA-3711).
+ * Add detection for Frictionless Data packages and WACZ (TIKA-3696).
+
+ * Add a fetcher and emitter for Azure blob storage (TIKA-3707).
+
* Add detection for files encrypted by Microsoft's Rights Management Service
(TIKA-3666).
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index f30809e6d..6d9ded05c 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -166,6 +166,8 @@ public class AutoDetectParser extends CompositeParser {
}
tis.reset();
}
+ handler = autoDetectParserConfig.getContentHandlerDecoratorFactory()
+ .decorate(handler, metadata);
// TIKA-216: Zip bomb prevention
SecureContentHandler sch =
handler != null ?
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
index d4702a512..267915b42 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
@@ -26,6 +26,7 @@ import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory;
import org.apache.tika.metadata.writefilter.MetadataWriteFilterFactory;
+import org.apache.tika.sax.ContentHandlerDecoratorFactory;
/**
* This config object can be used to tune how conservative we want to be
@@ -35,8 +36,10 @@ import org.apache.tika.metadata.writefilter.MetadataWriteFilterFactory;
*/
public class AutoDetectParserConfig extends ConfigBase implements Serializable {
- public static AutoDetectParserConfig DEFAULT = new AutoDetectParserConfig();
+    // Default factory: hands the incoming handler back untouched, so content handling is
+    // unchanged unless a different factory is configured. Declared final because it is a
+    // shared constant that must never be reassigned.
+    private static final ContentHandlerDecoratorFactory NOOP_CONTENT_HANDLER_DECORATOR_FACTORY
+            = (contentHandler, metadata) -> contentHandler;
+ public static AutoDetectParserConfig DEFAULT = new AutoDetectParserConfig();
public static AutoDetectParserConfig load(Element element)
throws TikaConfigException, IOException {
@@ -77,6 +80,9 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
private EmbeddedDocumentExtractorFactory embeddedDocumentExtractorFactory =
new ParsingEmbeddedDocumentExtractorFactory();
+ private ContentHandlerDecoratorFactory contentHandlerDecoratorFactory =
+ NOOP_CONTENT_HANDLER_DECORATOR_FACTORY;
+
/**
* Creates a SecureContentHandlerConfig using the passed in parameters.
*
@@ -156,5 +162,24 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
public EmbeddedDocumentExtractorFactory getEmbeddedDocumentExtractorFactory() {
return embeddedDocumentExtractorFactory;
}
+
+    /**
+     * Sets the factory whose {@code decorate} result replaces the caller's ContentHandler
+     * during a parse.
+     *
+     * @param contentHandlerDecoratorFactory factory to use; {@code null} restores the
+     *        no-op factory
+     */
+    public void setContentHandlerDecoratorFactory(
+            ContentHandlerDecoratorFactory contentHandlerDecoratorFactory) {
+        // AutoDetectParser calls getContentHandlerDecoratorFactory().decorate(...) without a
+        // null check, so storing null here would only surface later as a
+        // NullPointerException in the middle of a parse.
+        this.contentHandlerDecoratorFactory = contentHandlerDecoratorFactory != null
+                ? contentHandlerDecoratorFactory
+                : NOOP_CONTENT_HANDLER_DECORATOR_FACTORY;
+    }
+
+    /**
+     * Returns the factory used to decorate the ContentHandler.
+     *
+     * @return the currently configured ContentHandlerDecoratorFactory
+     */
+    public ContentHandlerDecoratorFactory getContentHandlerDecoratorFactory() {
+        return this.contentHandlerDecoratorFactory;
+    }
+
+    /**
+     * Renders the configuration as {@code AutoDetectParserConfig{name=value, ...}}.
+     *
+     * @return a single-line description listing the fields concatenated below
+     */
+    @Override
+    public String toString() {
+        // Scalar limits first, then the pluggable factories, in the same order as before.
+        String limits = "spoolToDisk=" + spoolToDisk
+                + ", outputThreshold=" + outputThreshold
+                + ", maximumCompressionRatio=" + maximumCompressionRatio
+                + ", maximumDepth=" + maximumDepth
+                + ", maximumPackageEntryDepth=" + maximumPackageEntryDepth;
+        String factories = ", metadataWriteFilterFactory=" + metadataWriteFilterFactory
+                + ", embeddedDocumentExtractorFactory=" + embeddedDocumentExtractorFactory
+                + ", contentHandlerDecoratorFactory=" + contentHandlerDecoratorFactory;
+        return "AutoDetectParserConfig{" + limits + factories + '}';
+    }
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java
new file mode 100644
index 000000000..05f7045f3
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import java.io.Serializable;
+
+import org.xml.sax.ContentHandler;
+
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Factory that may wrap or replace the {@link ContentHandler} receiving a parse's SAX
+ * events.
+ * <p>
+ * The interface has a single abstract method, so an implementation can be written as a
+ * lambda, and it extends {@link Serializable} so that instances can be held by
+ * serializable configuration objects.
+ */
+@FunctionalInterface
+public interface ContentHandlerDecoratorFactory extends Serializable {
+
+    /**
+     * Returns the handler that should be used in place of {@code contentHandler}.
+     * An implementation that does not want to change anything returns the argument as is.
+     *
+     * NOTE(review): AutoDetectParser null-checks the handler after calling this method,
+     * which suggests {@code contentHandler} may be null here -- confirm with callers.
+     *
+     * @param contentHandler the handler supplied for the current parse
+     * @param metadata the metadata object passed to the same parse call
+     * @return the handler to use for the remainder of the parse
+     */
+    ContentHandler decorate(ContentHandler contentHandler, Metadata metadata);
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
new file mode 100644
index 000000000..2e84fa5d4
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import java.io.InputStream;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.microsoft.ooxml.OOXMLParserTest;
+
+/**
+ * Exercises AutoDetectParser settings that are loaded from tika-config XML resources.
+ */
+public class AutoDetectParserConfigTest extends TikaTest {
+
+    /** Builds a TikaConfig from a classpath resource, closing the stream afterwards. */
+    private static TikaConfig loadConfig(String resourcePath) throws Exception {
+        try (InputStream stream = OOXMLParserTest.class.getResourceAsStream(resourcePath)) {
+            return new TikaConfig(stream);
+        }
+    }
+
+    @Test
+    public void testConfiguringEmbeddedDocExtractor() throws Exception {
+        // With the "no-names" config the file-name heading must be absent from the output.
+        // NOTE(review): this negative check parses a different file than the positive
+        // check below -- confirm that is intended.
+        TikaConfig noNames = loadConfig("/configs/tika-config-no-names.xml");
+        Parser parser = new AutoDetectParser(noNames);
+        String output = getXML("testEmbedded.zip", parser).xml;
+        assertNotContained("<h1>image3.jpg</h1>", output);
+
+        // With the "with-names" config the same heading must be present.
+        TikaConfig withNames = loadConfig("/configs/tika-config-with-names.xml");
+        parser = new AutoDetectParser(withNames);
+        output = getXML("testPPT_EmbeddedPDF.pptx", parser).xml;
+        assertContains("<h1>image3.jpg</h1>", output);
+    }
+
+    @Test
+    public void testContentHandlerDecoratorFactory() throws Exception {
+        TikaConfig upcasingConfig =
+                loadConfig("/configs/tika-config-upcasing-custom-handler-decorator.xml");
+        Parser parser = new AutoDetectParser(upcasingConfig);
+        List<Metadata> extracted = getRecursiveMetadata("testPPT_EmbeddedPDF.pptx", parser);
+
+        // Entries 4 and 5 are expected to carry the upper-cased text asserted below.
+        String firstContent = extracted.get(4).get(TikaCoreProperties.TIKA_CONTENT);
+        assertContains("APACHE TIKA", firstContent);
+        String secondContent = extracted.get(5).get(TikaCoreProperties.TIKA_CONTENT);
+        assertContains("HELLO WORLD", secondContent);
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
index 8ed307487..9f9f71357 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/ZipParserTest.java
@@ -28,14 +28,10 @@ import java.util.Set;
import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;
-import org.apache.tika.config.TikaConfig;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.microsoft.ooxml.OOXMLParserTest;
import org.apache.tika.sax.BodyContentHandler;
/**
@@ -94,27 +90,6 @@ public class ZipParserTest extends AbstractPkgTest {
assertTrue(relIDs.allRelIDs.contains("test2.txt"));
}
- @Test
- public void testConfiguringEmbeddedDocExtractor() throws Exception {
-
- TikaConfig tikaConfig = null;
- try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
- "/configs/tika-config-no-names.xml")) {
- tikaConfig = new TikaConfig(is);
- }
- Parser p = new AutoDetectParser(tikaConfig);
- String xml = getXML("testEmbedded.zip", p).xml;
- assertNotContained("<h1>image3.jpg</h1>", xml);
-
- try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
- "/configs/tika-config-with-names.xml")) {
- tikaConfig = new TikaConfig(is);
- }
- p = new AutoDetectParser(tikaConfig);
- xml = getXML("testPPT_EmbeddedPDF.pptx", p).xml;
- assertContains("<h1>image3.jpg</h1>", xml);
- }
-
@Test
public void testZipEncrypted() throws Exception {
List<Metadata> metadataList = getRecursiveMetadata("testZipEncrypted.zip");
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/UpcasingContentHandlerDecoratorFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/UpcasingContentHandlerDecoratorFactory.java
new file mode 100644
index 000000000..feec91007
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/UpcasingContentHandlerDecoratorFactory.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.sax;
+
+import java.util.Locale;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * Test-only factory whose decorator upper-cases (with {@link Locale#US}) all character
+ * data before passing it to the wrapped handler.
+ */
+public class UpcasingContentHandlerDecoratorFactory implements ContentHandlerDecoratorFactory {
+
+    /**
+     * Wraps {@code contentHandler} in a decorator that upper-cases character events.
+     *
+     * @param contentHandler the handler that receives the upper-cased text
+     * @param metadata not used by this implementation
+     * @return a decorator around {@code contentHandler}
+     */
+    @Override
+    public ContentHandler decorate(ContentHandler contentHandler, Metadata metadata) {
+        return new ContentHandlerDecorator(contentHandler) {
+            @Override
+            public void characters(char[] ch, int start, int length) throws SAXException {
+                String upcased = new String(ch, start, length).toUpperCase(Locale.US);
+                // The upcased text lives in its own array: it begins at index 0, not at the
+                // caller's start offset, and its length can differ from the input's
+                // (upper-casing may expand a character, e.g. "ß" becomes "SS").
+                contentHandler.characters(upcased.toCharArray(), 0, upcased.length());
+            }
+        };
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.xml
new file mode 100644
index 000000000..dabb47ded
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ </parsers>
+ <autoDetectParserConfig>
+ <params>
+ <spoolToDisk>123450</spoolToDisk>
+ <outputThreshold>678900</outputThreshold>
+ </params>
+ <contentHandlerDecoratorFactory class="org.apache.tika.sax.UpcasingContentHandlerDecoratorFactory"/>
+ </autoDetectParserConfig>
+</properties>