You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2024/03/21 17:48:34 UTC
(tika) 02/02: TIKA-4207 -- allow users to configure include/exclude for attachment types and/or mime types
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-4207
in repository https://gitbox.apache.org/repos/asf/tika.git
commit 9ffc4df4a3d059d54e1e1851b8d024b24d2043f9
Author: tallison <ta...@apache.org>
AuthorDate: Thu Mar 21 13:48:16 2024 -0400
TIKA-4207 -- allow users to configure include/exclude for attachment types and/or mime types
---
.../tika/extractor/BasicEmbeddedBytesSelector.java | 77 ++++++++++++++++++++++
...ctorFactory.java => EmbeddedBytesSelector.java} | 24 +++----
.../ParsingEmbeddedDocumentExtractor.java | 28 +++++++-
.../ParsingEmbeddedDocumentExtractorFactory.java | 56 ++++++++++++++--
.../apache/tika/metadata/TikaCoreProperties.java | 4 ++
.../tika/parser/AutoDetectParserConfigTest.java | 72 ++++++++++++++++++++
.../config/TIKA-4207-embedded-bytes-config.xml | 38 +++++++++++
7 files changed, 277 insertions(+), 22 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java
new file mode 100644
index 000000000..1d5a239db
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.extractor;
+
+import java.util.Set;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.utils.StringUtils;
+
+/**
+ * Selects embedded files whose raw bytes should be stored, based on the embedded file's
+ * content type and embedded resource type.
+ * <p>
+ * Exclude sets always reject. An include set only restricts selection when it is non-empty;
+ * an empty include set accepts every value that has not been excluded.
+ */
+public class BasicEmbeddedBytesSelector implements EmbeddedBytesSelector {
+
+    //resource type assumed when the metadata does not carry one
+    private static final String DEFAULT_EMBEDDED_RESOURCE_TYPE = "ATTACHMENT";
+
+    private final Set<String> includeMimes;
+    private final Set<String> excludeMimes;
+    private final Set<String> includeEmbeddedResourceTypes;
+    private final Set<String> excludeEmbeddedResourceTypes;
+
+    /**
+     * @param includeMimes base mime types to accept; empty means "do not filter by include"
+     * @param excludeMimes base mime types to reject
+     * @param includeEmbeddedResourceTypes embedded resource types to accept; empty means
+     *                                     "do not filter by include"
+     * @param excludeEmbeddedResourceTypes embedded resource types to reject
+     */
+    public BasicEmbeddedBytesSelector(Set<String> includeMimes, Set<String> excludeMimes,
+                                      Set<String> includeEmbeddedResourceTypes,
+                                      Set<String> excludeEmbeddedResourceTypes) {
+        this.includeMimes = includeMimes;
+        this.excludeMimes = excludeMimes;
+        this.includeEmbeddedResourceTypes = includeEmbeddedResourceTypes;
+        this.excludeEmbeddedResourceTypes = excludeEmbeddedResourceTypes;
+    }
+
+    /**
+     * Decides whether the bytes of the embedded file described by the metadata are selected.
+     *
+     * @param metadata metadata of the embedded file; its content type and embedded resource
+     *                 type are read
+     * @return true only if both the mime filter and the embedded resource type filter accept
+     */
+    @Override
+    public boolean select(Metadata metadata) {
+        String mime = baseMime(metadata.get(Metadata.CONTENT_TYPE));
+        if (!isAllowed(mime, includeMimes, excludeMimes)) {
+            return false;
+        }
+        String embeddedResourceType = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
+        //if a parser doesn't specify the type, treat it as ATTACHMENT
+        if (StringUtils.isBlank(embeddedResourceType)) {
+            embeddedResourceType = DEFAULT_EMBEDDED_RESOURCE_TYPE;
+        }
+        return isAllowed(embeddedResourceType, includeEmbeddedResourceTypes,
+                excludeEmbeddedResourceTypes);
+    }
+
+    /**
+     * Normalizes a content type for comparison: null becomes the empty string and, when any
+     * mime filter is configured, parameters such as charset are dropped.
+     */
+    private String baseMime(String contentType) {
+        if (contentType == null) {
+            return "";
+        }
+        //only pay for parsing if mime matters at all
+        if (includeMimes.isEmpty() && excludeMimes.isEmpty()) {
+            return contentType;
+        }
+        MediaType mt = MediaType.parse(contentType);
+        return mt == null ? contentType : mt.getType() + "/" + mt.getSubtype();
+    }
+
+    /**
+     * Applies one include/exclude pair to a value. An empty include set must not reject
+     * everything; otherwise configuring only one kind of filter would select nothing.
+     */
+    private static boolean isAllowed(String value, Set<String> includes, Set<String> excludes) {
+        if (excludes.contains(value)) {
+            return false;
+        }
+        return includes.isEmpty() || includes.contains(value);
+    }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java
similarity index 55%
copy from tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
copy to tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java
index 9136228c4..2ec7df667 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java
@@ -16,25 +16,17 @@
*/
package org.apache.tika.extractor;
-import org.apache.tika.config.Field;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-public class ParsingEmbeddedDocumentExtractorFactory
- implements EmbeddedDocumentExtractorFactory {
+public interface EmbeddedBytesSelector {
- private boolean writeFileNameToContent = true;
-
- @Field
- public void setWriteFileNameToContent(boolean writeFileNameToContent) {
- this.writeFileNameToContent = writeFileNameToContent;
+ class AcceptAll implements EmbeddedBytesSelector {
+ @Override
+ public boolean select(Metadata metadata) {
+ return true;
+ }
}
+ EmbeddedBytesSelector ACCEPT_ALL = new AcceptAll();
- @Override
- public EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext) {
- ParsingEmbeddedDocumentExtractor ex =
- new ParsingEmbeddedDocumentExtractor(parseContext);
- ex.setWriteFileNameToContent(writeFileNameToContent);
- return ex;
- }
+ boolean select(Metadata metadata);
}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
index 46672838b..ee15c1e22 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
@@ -26,6 +26,8 @@ import java.nio.file.Files;
import java.nio.file.Path;
import org.apache.commons.io.input.CloseShieldInputStream;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
@@ -43,6 +45,7 @@ import org.apache.tika.parser.ParseRecord;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
+import org.apache.tika.utils.ExceptionUtils;
/**
* Helper class for parsers of package archives or other compound document
@@ -52,6 +55,9 @@ import org.apache.tika.sax.EmbeddedContentHandler;
*/
public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor {
+ private static final Logger LOGGER =
+ LoggerFactory.getLogger(ParsingEmbeddedDocumentExtractor.class);
+
private static final File ABSTRACT_PATH = new File("");
private static final Parser DELEGATING_PARSER = new DelegatingParser();
@@ -60,6 +66,8 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
private final ParseContext context;
+ private EmbeddedBytesSelector embeddedBytesSelector = EmbeddedBytesSelector.ACCEPT_ALL;
+
public ParsingEmbeddedDocumentExtractor(ParseContext context) {
this.context = context;
}
@@ -147,6 +155,14 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
}
private void storeEmbeddedBytes(Path p, Metadata metadata) {
+ if (! embeddedBytesSelector.select(metadata)) {
+ if (LOGGER.isDebugEnabled()) {
+ LOGGER.debug("skipping embedded bytes {} {}",
+ metadata.get(Metadata.CONTENT_TYPE),
+ metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
+ }
+ return;
+ }
EmbeddedDocumentByteStore embeddedDocumentByteStore =
context.get(EmbeddedDocumentByteStore.class);
int id = metadata.getInt(TikaCoreProperties.EMBEDDED_ID);
@@ -154,8 +170,8 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
try {
embeddedDocumentByteStore.add(id, metadata, Files.readAllBytes(p));
} catch (IOException e) {
- e.printStackTrace();
- //log, or better, store embdocstore exception
+ metadata.set(TikaCoreProperties.EMBEDDED_BYTES_EXCEPTION,
+ ExceptionUtils.getStackTrace(e));
}
}
@@ -175,4 +191,12 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
public void setWriteFileNameToContent(boolean writeFileNameToContent) {
this.writeFileNameToContent = writeFileNameToContent;
}
+
+ /**
+ * Sets the selector that {@code storeEmbeddedBytes} consults before storing an embedded
+ * file's bytes; the bytes are skipped when the selector's {@code select} returns false.
+ *
+ * @param embeddedBytesSelector selector to use in place of the current one
+ */
+ public void setEmbeddedBytesSelector(EmbeddedBytesSelector embeddedBytesSelector) {
+ this.embeddedBytesSelector = embeddedBytesSelector;
+ }
+
+ /**
+ * @return the selector currently used to decide whether an embedded file's bytes are stored
+ */
+ public EmbeddedBytesSelector getEmbeddedBytesSelector() {
+ return embeddedBytesSelector;
+ }
}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
index 9136228c4..7632ed49c 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
@@ -16,25 +16,73 @@
*/
package org.apache.tika.extractor;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
import org.apache.tika.config.Field;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
-public class ParsingEmbeddedDocumentExtractorFactory
- implements EmbeddedDocumentExtractorFactory {
+public class ParsingEmbeddedDocumentExtractorFactory implements EmbeddedDocumentExtractorFactory {
private boolean writeFileNameToContent = true;
+    //an empty set means that no filter of that kind has been configured;
+    //Collections.emptySet() is used instead of the raw EMPTY_SET to avoid an unchecked conversion
+    private Set<String> embeddedBytesIncludeMimeTypes = Collections.emptySet();
+    private Set<String> embeddedBytesExcludeMimeTypes = Collections.emptySet();
+    private Set<String> embeddedBytesIncludeEmbeddedResourceTypes = Collections.emptySet();
+    private Set<String> embeddedBytesExcludeEmbeddedResourceTypes = Collections.emptySet();
@Field
public void setWriteFileNameToContent(boolean writeFileNameToContent) {
this.writeFileNameToContent = writeFileNameToContent;
}
+    /**
+     * Replaces the mime types whose embedded bytes are to be included.
+     *
+     * @param includeMimeTypes mime types to include; copied into a new set
+     */
+    @Field
+    public void setEmbeddedBytesIncludeMimeTypes(List<String> includeMimeTypes) {
+        //copy first, assign second: a failing copy must not wipe the previous configuration
+        embeddedBytesIncludeMimeTypes = new HashSet<>(includeMimeTypes);
+    }
+
+    /**
+     * Replaces the mime types whose embedded bytes are to be excluded.
+     *
+     * @param excludeMimeTypes mime types to exclude; copied into a new set
+     */
+    @Field
+    public void setEmbeddedBytesExcludeMimeTypes(List<String> excludeMimeTypes) {
+        //copy first, assign second: a failing copy must not wipe the previous configuration
+        embeddedBytesExcludeMimeTypes = new HashSet<>(excludeMimeTypes);
+    }
+
+    /**
+     * Replaces the embedded resource types whose embedded bytes are to be included.
+     *
+     * @param includeAttachmentTypes embedded resource types to include; copied into a new set
+     */
+    @Field
+    public void setEmbeddedBytesIncludeEmbeddedResourceTypes(List<String> includeAttachmentTypes) {
+        //copy first, assign second: a failing copy must not wipe the previous configuration
+        embeddedBytesIncludeEmbeddedResourceTypes = new HashSet<>(includeAttachmentTypes);
+    }
+
+    /**
+     * Replaces the embedded resource types whose embedded bytes are to be excluded.
+     *
+     * @param excludeAttachmentTypes embedded resource types to exclude; copied into a new set
+     */
+    @Field
+    public void setEmbeddedBytesExcludeEmbeddedResourceTypes(List<String> excludeAttachmentTypes) {
+        //copy first, assign second: a failing copy must not wipe the previous configuration
+        embeddedBytesExcludeEmbeddedResourceTypes = new HashSet<>(excludeAttachmentTypes);
+    }
+
+
@Override
public EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext) {
- ParsingEmbeddedDocumentExtractor ex =
- new ParsingEmbeddedDocumentExtractor(parseContext);
+ ParsingEmbeddedDocumentExtractor ex = new ParsingEmbeddedDocumentExtractor(parseContext);
ex.setWriteFileNameToContent(writeFileNameToContent);
+ ex.setEmbeddedBytesSelector(createEmbeddedBytesSelector());
return ex;
}
+
+    /**
+     * Builds the selector handed to each new extractor.
+     *
+     * @return the shared accept-all selector when none of the four filter sets has entries,
+     *         otherwise a {@link BasicEmbeddedBytesSelector} over the configured sets
+     */
+    private EmbeddedBytesSelector createEmbeddedBytesSelector() {
+        boolean anyFilterConfigured = !embeddedBytesIncludeMimeTypes.isEmpty()
+                || !embeddedBytesExcludeMimeTypes.isEmpty()
+                || !embeddedBytesIncludeEmbeddedResourceTypes.isEmpty()
+                || !embeddedBytesExcludeEmbeddedResourceTypes.isEmpty();
+        if (anyFilterConfigured) {
+            return new BasicEmbeddedBytesSelector(embeddedBytesIncludeMimeTypes,
+                    embeddedBytesExcludeMimeTypes, embeddedBytesIncludeEmbeddedResourceTypes,
+                    embeddedBytesExcludeEmbeddedResourceTypes);
+        }
+        //nothing to filter on
+        return EmbeddedBytesSelector.ACCEPT_ALL;
+    }
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 6ff02c1cf..effa4a667 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -98,6 +98,10 @@ public interface TikaCoreProperties {
Property EMBEDDED_EXCEPTION =
Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_exception");
+ //exception handling the raw bytes of an embedded file by an EmbeddedDocumentByteStore
+ Property EMBEDDED_BYTES_EXCEPTION =
+ Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_bytes_exception");
+
//warning while parsing in an embedded file
Property EMBEDDED_WARNING =
Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_warning");
diff --git a/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java b/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
new file mode 100644
index 000000000..a0d5d4896
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+import java.io.InputStream;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.extractor.EmbeddedBytesSelector;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor;
+import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.utils.StringUtils;
+
+public class AutoDetectParserConfigTest {
+
+ @Test
+ public void testEmbeddedBytesSelector() throws Exception {
+ TikaConfig config;
+ try (InputStream is = TikaConfig.class.getResourceAsStream(
+ "TIKA-4207-embedded-bytes-config.xml")) {
+ config = new TikaConfig(is);
+ }
+ AutoDetectParserConfig c = config.getAutoDetectParserConfig();
+ ParsingEmbeddedDocumentExtractorFactory f =
+ (ParsingEmbeddedDocumentExtractorFactory) c.getEmbeddedDocumentExtractorFactory();
+
+ Metadata metadata = new Metadata();
+ ParseContext parseContext = new ParseContext();
+ ParsingEmbeddedDocumentExtractor ex = (ParsingEmbeddedDocumentExtractor) f.newInstance(metadata, parseContext);
+ EmbeddedBytesSelector selector = ex.getEmbeddedBytesSelector();
+ assertFalse(selector.select(getMetadata("", "")));
+ assertTrue(selector.select(getMetadata("application/pdf", "")));
+ assertTrue(selector.select(getMetadata("application/pdf", "ATTACHMENT")));
+ assertTrue(selector.select(getMetadata("application/pdf", "INLINE")));
+ assertTrue(selector.select(getMetadata("text/plain;charset=UTF-7", "INLINE")));
+
+ assertFalse(selector.select(getMetadata("application/pdf", "MACRO")));
+ assertFalse(selector.select(getMetadata("application/docx", "")));
+
+ }
+
+ private Metadata getMetadata(String mime, String embeddedResourceType) {
+ Metadata m = new Metadata();
+ if (!StringUtils.isBlank(mime)) {
+ m.set(Metadata.CONTENT_TYPE, mime);
+ }
+ if (!StringUtils.isBlank(embeddedResourceType)) {
+ m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, embeddedResourceType);
+ }
+ return m;
+ }
+}
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml
new file mode 100644
index 000000000..d60c6b1ca
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-4207-embedded-bytes-config.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ </parsers>
+ <autoDetectParserConfig>
+ <spoolToDisk>123450</spoolToDisk>
+ <outputThreshold>678900</outputThreshold>
+ <embeddedDocumentExtractorFactory class="org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory">
+ <writeFileNameToContent>false</writeFileNameToContent>
+ <embeddedBytesIncludeMimeTypes>
+ <mime>application/pdf</mime>
+ <mime>application/rtf</mime>
+ <mime>text/plain</mime>
+ </embeddedBytesIncludeMimeTypes>
+ <embeddedBytesIncludeEmbeddedResourceTypes>
+ <type>ATTACHMENT</type>
+ <type>INLINE</type>
+ </embeddedBytesIncludeEmbeddedResourceTypes>
+ </embeddedDocumentExtractorFactory>
+ </autoDetectParserConfig>
+</properties>
\ No newline at end of file