You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/06/14 16:09:09 UTC
[tika] branch main updated: TIKA-3792 -- only apply the handler decorator once for legacy xhtml processing of embedded documents
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 0ea65717a TIKA-3792 -- only apply the handler decorator once for legacy xhtml processing of embedded documents
0ea65717a is described below
commit 0ea65717a449c5d248b4f6dd980f783069358fe0
Author: tallison <ta...@apache.org>
AuthorDate: Tue Jun 14 12:08:56 2022 -0400
TIKA-3792 -- only apply the handler decorator once for legacy xhtml processing of embedded documents
---
.../org/apache/tika/parser/AutoDetectParser.java | 21 ++++++++++++--
.../apache/tika/parser/AutoDetectParserConfig.java | 32 ++++++++++++++++------
.../apache/tika/parser/RecursiveParserWrapper.java | 2 +-
.../tika/sax/ContentHandlerDecoratorFactory.java | 11 ++++++++
.../tika/parser/AutoDetectParserConfigTest.java | 32 ++++++++++++++++++++++
...=> DoublingContentHandlerDecoratorFactory.java} | 13 ++++++++-
.../UpcasingContentHandlerDecoratorFactory.java | 7 +++++
...ka-config-doubling-custom-handler-decorator.xml | 29 ++++++++++++++++++++
8 files changed, 134 insertions(+), 13 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index 054e71ea3..9ae5b8522 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -154,8 +154,7 @@ public class AutoDetectParser extends CompositeParser {
}
tis.reset();
}
- handler = autoDetectParserConfig.getContentHandlerDecoratorFactory()
- .decorate(handler, metadata);
+ handler = decorateHandler(handler, metadata, context, autoDetectParserConfig);
// TIKA-216: Zip bomb prevention
SecureContentHandler sch =
handler != null ?
@@ -176,6 +175,24 @@ public class AutoDetectParser extends CompositeParser {
}
}
+ private ContentHandler decorateHandler(ContentHandler handler,
+ Metadata metadata, ParseContext context,
+ AutoDetectParserConfig autoDetectParserConfig) {
+ if (context.get(RecursiveParserWrapper.RecursivelySecureContentHandler.class) != null) {
+ // Using the RecursiveParserWrapper, so we should decorate this handler.
+ return autoDetectParserConfig
+ .getContentHandlerDecoratorFactory()
+ .decorate(handler, metadata, context);
+ }
+ ParseRecord parseRecord = context.get(ParseRecord.class);
+ if (parseRecord == null || parseRecord.getDepth() == 0) {
+ return autoDetectParserConfig.getContentHandlerDecoratorFactory()
+ .decorate(handler, metadata, context);
+ }
+ //else do not decorate
+ return handler;
+ }
+
private void maybeSpool(TikaInputStream tis, AutoDetectParserConfig autoDetectParserConfig,
Metadata metadata) throws IOException {
if (! tis.hasFile() && //if there's already a file, stop now
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
index 267915b42..8980dbce5 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
@@ -20,11 +20,13 @@ import java.io.IOException;
import java.io.Serializable;
import org.w3c.dom.Element;
+import org.xml.sax.ContentHandler;
import org.apache.tika.config.ConfigBase;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory;
+import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.writefilter.MetadataWriteFilterFactory;
import org.apache.tika.sax.ContentHandlerDecoratorFactory;
@@ -36,16 +38,26 @@ import org.apache.tika.sax.ContentHandlerDecoratorFactory;
*/
public class AutoDetectParserConfig extends ConfigBase implements Serializable {
- private static ContentHandlerDecoratorFactory NOOP_CONTENT_HANDLER_DECORATOR_FACTORY
- = (contentHandler, metadata) -> contentHandler;
+ private static ContentHandlerDecoratorFactory NOOP_CONTENT_HANDLER_DECORATOR_FACTORY =
+ new ContentHandlerDecoratorFactory() {
+ @Override
+ public ContentHandler decorate(ContentHandler contentHandler, Metadata metadata) {
+ return contentHandler;
+ }
+
+ @Override
+ public ContentHandler decorate(ContentHandler contentHandler, Metadata metadata,
+ ParseContext parseContext) {
+ return contentHandler;
+ }
+ };
public static AutoDetectParserConfig DEFAULT = new AutoDetectParserConfig();
public static AutoDetectParserConfig load(Element element)
throws TikaConfigException, IOException {
- return AutoDetectParserConfig.buildSingle(
- "autoDetectParserConfig", AutoDetectParserConfig.class, element,
- AutoDetectParserConfig.DEFAULT);
+ return AutoDetectParserConfig.buildSingle("autoDetectParserConfig",
+ AutoDetectParserConfig.class, element, AutoDetectParserConfig.DEFAULT);
}
/**
@@ -84,7 +96,7 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
NOOP_CONTENT_HANDLER_DECORATOR_FACTORY;
/**
- * Creates a SecureContentHandlerConfig using the passed in parameters.
+ * Creates a SecureContentHandlerConfig using the passed in parameters.
*
* @param spoolToDisk
* @param outputThreshold SecureContentHandler - character output threshold.
@@ -150,10 +162,11 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
return this.metadataWriteFilterFactory;
}
- public void setMetadataWriteFilterFactory(MetadataWriteFilterFactory metadataWriteFilterFactory) {
+ public void setMetadataWriteFilterFactory(
+ MetadataWriteFilterFactory metadataWriteFilterFactory) {
this.metadataWriteFilterFactory = metadataWriteFilterFactory;
}
-
+
public void setEmbeddedDocumentExtractorFactory(
EmbeddedDocumentExtractorFactory embeddedDocumentExtractorFactory) {
this.embeddedDocumentExtractorFactory = embeddedDocumentExtractorFactory;
@@ -163,7 +176,8 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
return embeddedDocumentExtractorFactory;
}
- public void setContentHandlerDecoratorFactory(ContentHandlerDecoratorFactory contentHandlerDecoratorFactory) {
+ public void setContentHandlerDecoratorFactory(
+ ContentHandlerDecoratorFactory contentHandlerDecoratorFactory) {
this.contentHandlerDecoratorFactory = contentHandlerDecoratorFactory;
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 59db9b3f9..d660cc175 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -287,7 +287,7 @@ public class RecursiveParserWrapper extends ParserDecorator {
}
}
- private static class RecursivelySecureContentHandler extends SecureContentHandler {
+ static class RecursivelySecureContentHandler extends SecureContentHandler {
private ContentHandler handler;
//total allowable chars across all handlers
diff --git a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java
index 05f7045f3..52a513560 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java
@@ -21,8 +21,19 @@ import java.io.Serializable;
import org.xml.sax.ContentHandler;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
public interface ContentHandlerDecoratorFactory extends Serializable {
+ /**
+ * @deprecated use {@link ContentHandlerDecoratorFactory#decorate(ContentHandler, Metadata, ParseContext)}
+ * instead. This method will be removed in 2.5.0.
+ * @param contentHandler
+ * @param metadata
+ * @return
+ */
+ @Deprecated
ContentHandler decorate(ContentHandler contentHandler, Metadata metadata);
+ ContentHandler decorate(ContentHandler contentHandler, Metadata metadata,
+ ParseContext parseContext);
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
index 2e84fa5d4..38c0ea95f 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
@@ -64,4 +64,36 @@ public class AutoDetectParserConfigTest extends TikaTest {
Metadata pdfMetadata2 = metadataList.get(5);
assertContains("HELLO WORLD", pdfMetadata2.get(TikaCoreProperties.TIKA_CONTENT));
}
+
+ @Test
+ public void testRecursiveContentHandlerDecoratorFactory() throws Exception {
+ TikaConfig tikaConfig = null;
+ try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
+ "/configs/tika-config-doubling-custom-handler-decorator.xml")) {
+ tikaConfig = new TikaConfig(is);
+ }
+ Parser p = new AutoDetectParser(tikaConfig);
+ List<Metadata> metadataList = getRecursiveMetadata("testPPT_EmbeddedPDF.pptx", p);
+ assertContainsCount("IMAGE2.EMF",
+ metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT), 2);
+ assertContainsCount("15.9.2007 11:02",
+ metadataList.get(4).get(TikaCoreProperties.TIKA_CONTENT), 2);
+ assertContainsCount("HELLO WORLD",
+ metadataList.get(5).get(TikaCoreProperties.TIKA_CONTENT), 4);
+ }
+
+ @Test
+ public void testXMLContentHandlerDecoratorFactory() throws Exception {
+ //test to make sure that the decorator is only applied once for
+ //legacy (i.e. not RecursiveParserWrapperHandler) parsing
+ TikaConfig tikaConfig = null;
+ try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
+ "/configs/tika-config-doubling-custom-handler-decorator.xml")) {
+ tikaConfig = new TikaConfig(is);
+ }
+ Parser p = new AutoDetectParser(tikaConfig);
+ String txt = getXML("testPPT_EmbeddedPDF.pptx", p).xml;
+ assertContainsCount("THE APACHE TIKA PROJECT WAS FORMALLY", txt, 2);
+ assertContainsCount("15.9.2007 11:02", txt, 2);
+ }
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/UpcasingContentHandlerDecoratorFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/DoublingContentHandlerDecoratorFactory.java
similarity index 72%
copy from tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/UpcasingContentHandlerDecoratorFactory.java
copy to tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/DoublingContentHandlerDecoratorFactory.java
index feec91007..cc71ca398 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/UpcasingContentHandlerDecoratorFactory.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/DoublingContentHandlerDecoratorFactory.java
@@ -22,15 +22,26 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+
+public class DoublingContentHandlerDecoratorFactory implements ContentHandlerDecoratorFactory {
+ private static final char[] NEWLINE = new char[]{'\n'};
-public class UpcasingContentHandlerDecoratorFactory implements ContentHandlerDecoratorFactory {
@Override
public ContentHandler decorate(ContentHandler contentHandler, Metadata metadata) {
+ return decorate(contentHandler, metadata, new ParseContext());
+ }
+
+ @Override
+ public ContentHandler decorate(ContentHandler contentHandler, Metadata metadata,
+ ParseContext parseContext) {
return new ContentHandlerDecorator(contentHandler) {
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
String content = new String(ch, start, length).toUpperCase(Locale.US);
contentHandler.characters(content.toCharArray(), start, length);
+ contentHandler.characters(NEWLINE, 0, 1);
+ contentHandler.characters(content.toCharArray(), start, length);
}
};
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/UpcasingContentHandlerDecoratorFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/UpcasingContentHandlerDecoratorFactory.java
index feec91007..f78aa29e7 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/UpcasingContentHandlerDecoratorFactory.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/UpcasingContentHandlerDecoratorFactory.java
@@ -22,10 +22,17 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
public class UpcasingContentHandlerDecoratorFactory implements ContentHandlerDecoratorFactory {
@Override
public ContentHandler decorate(ContentHandler contentHandler, Metadata metadata) {
+ return decorate(contentHandler, metadata, new ParseContext());
+ }
+
+ @Override
+ public ContentHandler decorate(ContentHandler contentHandler, Metadata metadata,
+ ParseContext parseContext) {
return new ContentHandlerDecorator(contentHandler) {
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-doubling-custom-handler-decorator.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-doubling-custom-handler-decorator.xml
new file mode 100644
index 000000000..a15cb86b6
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-doubling-custom-handler-decorator.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ </parsers>
+ <autoDetectParserConfig>
+ <params>
+ <spoolToDisk>123450</spoolToDisk>
+ <outputThreshold>678900</outputThreshold>
+ </params>
+ <contentHandlerDecoratorFactory class="org.apache.tika.sax.DoublingContentHandlerDecoratorFactory"/>
+ </autoDetectParserConfig>
+</properties>