You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/07/17 17:47:05 UTC
[tika] 02/06: TIKA-3137 -- first pass, need to add unit tests for tika-batch
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit db4498d1de534f8348e94b0f27c641353a26b083
Author: tallison <ta...@apache.org>
AuthorDate: Thu Jul 16 15:58:00 2020 -0400
TIKA-3137 -- first pass, need to add unit tests for tika-batch
---
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 4 +-
.../batch/fs/RecursiveParserWrapperFSConsumer.java | 9 +-
.../tika/batch/fs/StreamOutRPWFSConsumer.java | 20 ++-
.../fs/builders/BasicTikaFSConsumersBuilder.java | 11 +-
.../RecursiveParserWrapperFSConsumerTest.java | 5 +-
.../java/org/apache/tika/config/TikaConfig.java | 108 ++++++++++++-
.../metadata/filter/ClearByMimeMetadataFilter.java | 74 +++++++++
.../metadata/filter/CompositeMetadataFilter.java | 38 +++++
.../metadata/filter/DefaultMetadataFilter.java | 46 ++++++
.../filter/ExcludeFieldMetadataFilter.java | 53 +++++++
.../filter/IncludeFieldMetadataFilter.java | 58 +++++++
.../tika/metadata/filter/MetadataFilter.java | 33 ++++
.../apache/tika/metadata/filter/NoOpFilter.java | 34 +++++
.../tika/sax/RecursiveParserWrapperHandler.java | 31 +++-
.../org.apache.tika.metadata.filter.MetadataFilter | 16 ++
.../org/apache/tika/config/TikaConfigTest.java | 2 +
.../tika/metadata/filter/MockUpperCaseFilter.java | 39 +++++
.../tika/metadata/filter/TestMetadataFilter.java | 170 +++++++++++++++++++++
.../org/apache/tika/config/TIKA-3137-exclude.xml | 26 ++++
.../apache/tika/config/TIKA-3137-include-uc.xml | 27 ++++
.../org/apache/tika/config/TIKA-3137-include.xml | 26 ++++
.../org/apache/tika/config/TIKA-3137-mimes-uc.xml | 27 ++++
.../tika/parser/RecursiveParserWrapperTest.java | 43 ++++++
.../org/apache/tika/parser/TIKA-3137-include.xml | 31 ++++
.../server/resource/RecursiveMetadataResource.java | 3 +-
.../java/org/apache/tika/server/CXFTestBase.java | 7 +-
.../tika/server/RecursiveMetadataFilterTest.java | 107 +++++++++++++
.../org/apache/tika/server/TIKA-3137-include.xml | 31 ++++
28 files changed, 1062 insertions(+), 17 deletions(-)
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 8077114..46f82ee 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -513,7 +513,9 @@ public class TikaCLI {
private void handleRecursiveJson(URL url, OutputStream output) throws IOException, SAXException, TikaException {
Metadata metadata = new Metadata();
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
- RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(getContentHandlerFactory(type), -1);
+ RecursiveParserWrapperHandler handler =
+ new RecursiveParserWrapperHandler(getContentHandlerFactory(type),
+ -1, config.getMetadataFilter());
try (InputStream input = TikaInputStream.get(url, metadata)) {
wrapper.parse(input, handler, metadata, context);
}
diff --git a/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java b/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
index 56b8b58..9732781 100644
--- a/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
+++ b/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
@@ -32,6 +32,8 @@ import org.apache.tika.batch.ParserFactory;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.filter.MetadataFilter;
+import org.apache.tika.metadata.filter.NoOpFilter;
import org.apache.tika.metadata.serialization.JsonMetadataList;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -50,6 +52,7 @@ public class RecursiveParserWrapperFSConsumer extends AbstractFSConsumer {
private final Parser parser;
private final ContentHandlerFactory contentHandlerFactory;
private final OutputStreamFactory fsOSFactory;
+ private final MetadataFilter metadataFilter;
private String outputEncoding = "UTF-8";
/**
@@ -62,11 +65,12 @@ public class RecursiveParserWrapperFSConsumer extends AbstractFSConsumer {
public RecursiveParserWrapperFSConsumer(ArrayBlockingQueue<FileResource> queue,
Parser parser,
ContentHandlerFactory contentHandlerFactory,
- OutputStreamFactory fsOSFactory) {
+ OutputStreamFactory fsOSFactory, MetadataFilter metadataFilter) {
super(queue);
this.contentHandlerFactory = contentHandlerFactory;
this.fsOSFactory = fsOSFactory;
this.parser = parser;
+ this.metadataFilter = metadataFilter;
}
@Override
@@ -95,7 +99,8 @@ public class RecursiveParserWrapperFSConsumer extends AbstractFSConsumer {
Throwable thrown = null;
List<Metadata> metadataList = null;
Metadata containerMetadata = fileResource.getMetadata();
- RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(contentHandlerFactory, -1);
+ RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(contentHandlerFactory,
+ -1, metadataFilter);
try {
parse(fileResource.getResourceId(), parser, is, handler,
containerMetadata, context);
diff --git a/tika-batch/src/main/java/org/apache/tika/batch/fs/StreamOutRPWFSConsumer.java b/tika-batch/src/main/java/org/apache/tika/batch/fs/StreamOutRPWFSConsumer.java
index 018c1a9..dd39a6c 100644
--- a/tika-batch/src/main/java/org/apache/tika/batch/fs/StreamOutRPWFSConsumer.java
+++ b/tika-batch/src/main/java/org/apache/tika/batch/fs/StreamOutRPWFSConsumer.java
@@ -20,12 +20,15 @@ package org.apache.tika.batch.fs;
import org.apache.commons.io.IOUtils;
+import org.apache.tika.Tika;
import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.OutputStreamFactory;
import org.apache.tika.batch.ParserFactory;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.metadata.serialization.JsonStreamingSerializer;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -53,17 +56,19 @@ public class StreamOutRPWFSConsumer extends AbstractFSConsumer {
private final Parser parser;
private final ContentHandlerFactory contentHandlerFactory;
private final OutputStreamFactory fsOSFactory;
+ private final MetadataFilter metadataFilter;
private String outputEncoding = "UTF-8";
public StreamOutRPWFSConsumer(ArrayBlockingQueue<FileResource> queue,
Parser parser,
ContentHandlerFactory contentHandlerFactory,
- OutputStreamFactory fsOSFactory) {
+ OutputStreamFactory fsOSFactory, MetadataFilter metadataFilter) {
super(queue);
this.contentHandlerFactory = contentHandlerFactory;
this.fsOSFactory = fsOSFactory;
this.parser = parser;
+ this.metadataFilter = metadataFilter;
}
@Override
@@ -93,7 +98,8 @@ public class StreamOutRPWFSConsumer extends AbstractFSConsumer {
JsonStreamingSerializer writer = new JsonStreamingSerializer(
new OutputStreamWriter(os, StandardCharsets.UTF_8));
- WriteoutRPWHandler handler = new WriteoutRPWHandler(contentHandlerFactory, writer);
+ WriteoutRPWHandler handler = new WriteoutRPWHandler(contentHandlerFactory,
+ writer, metadataFilter);
Throwable thrown = null;
try {
parse(fileResource.getResourceId(), parser, is, handler,
@@ -137,16 +143,24 @@ public class StreamOutRPWFSConsumer extends AbstractFSConsumer {
//be written straight to disk.
private class WriteoutRPWHandler extends AbstractRecursiveParserWrapperHandler {
private final JsonStreamingSerializer jsonWriter;
+ private final MetadataFilter metadataFilter;
- public WriteoutRPWHandler(ContentHandlerFactory contentHandlerFactory, JsonStreamingSerializer writer) {
+ public WriteoutRPWHandler(ContentHandlerFactory contentHandlerFactory, JsonStreamingSerializer writer,
+ MetadataFilter metadataFilter) {
super(contentHandlerFactory);
this.jsonWriter = writer;
+ this.metadataFilter = metadataFilter;
}
@Override
public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
metadata.add(RecursiveParserWrapperHandler.TIKA_CONTENT, contentHandler.toString());
try {
+ metadataFilter.filter(metadata);
+ } catch (TikaException e) {
+ throw new SAXException(e);
+ }
+ try {
jsonWriter.add(metadata);
} catch (IOException e) {
throw new SAXException(e);
diff --git a/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java b/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
index 88171ee..4f05324 100644
--- a/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
+++ b/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
@@ -42,6 +42,9 @@ import org.apache.tika.batch.fs.FSUtil;
import org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer;
import org.apache.tika.batch.fs.StreamOutRPWFSConsumer;
import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.filter.MetadataFilter;
+import org.apache.tika.metadata.filter.NoOpFilter;
+import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
@@ -145,15 +148,19 @@ public class BasicTikaFSConsumersBuilder extends AbstractConsumersBuilder {
contentHandlerFactory, recursiveParserWrapper);
Parser parser = parserFactory.getParser(config);
if (recursiveParserWrapper) {
+ MetadataFilter metadataFilter = config.getMetadataFilter();
parser = new RecursiveParserWrapper(parser);
+
for (int i = 0; i < numConsumers; i++) {
FileResourceConsumer c = null;
if (streamOut){
c = new StreamOutRPWFSConsumer(queue,
- parser, contentHandlerFactory, outputStreamFactory);
+ parser, contentHandlerFactory,
+ outputStreamFactory, metadataFilter);
} else {
c = new RecursiveParserWrapperFSConsumer(queue,
- parser, contentHandlerFactory, outputStreamFactory);
+ parser, contentHandlerFactory,
+ outputStreamFactory, metadataFilter);
}
consumers.add(c);
}
diff --git a/tika-batch/src/test/java/org/apache/tika/batch/RecursiveParserWrapperFSConsumerTest.java b/tika-batch/src/test/java/org/apache/tika/batch/RecursiveParserWrapperFSConsumerTest.java
index 7ebe564..6a61414 100644
--- a/tika-batch/src/test/java/org/apache/tika/batch/RecursiveParserWrapperFSConsumerTest.java
+++ b/tika-batch/src/test/java/org/apache/tika/batch/RecursiveParserWrapperFSConsumerTest.java
@@ -34,6 +34,7 @@ import org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.filter.NoOpFilter;
import org.apache.tika.metadata.serialization.JsonMetadataList;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
@@ -75,7 +76,7 @@ public class RecursiveParserWrapperFSConsumerTest extends TikaTest {
Parser p = new RecursiveParserWrapper(new AutoDetectParserFactory().getParser(new TikaConfig()));
RecursiveParserWrapperFSConsumer consumer = new RecursiveParserWrapperFSConsumer(
queue, p, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
- mockOSFactory);
+ mockOSFactory, NoOpFilter.NOOP_FILTER);
IFileProcessorFutureResult result = consumer.call();
mockOSFactory.getStreams().get(0).flush();
@@ -123,7 +124,7 @@ public class RecursiveParserWrapperFSConsumerTest extends TikaTest {
Parser p = new RecursiveParserWrapper(new AutoDetectParserFactory().getParser(new TikaConfig()));
RecursiveParserWrapperFSConsumer consumer = new RecursiveParserWrapperFSConsumer(
queue, p, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
- mockOSFactory);
+ mockOSFactory, NoOpFilter.NOOP_FILTER);
IFileProcessorFutureResult result = consumer.call();
mockOSFactory.getStreams().get(0).flush();
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index 92485d3..a0cc102 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -50,6 +50,9 @@ import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.language.translate.DefaultTranslator;
import org.apache.tika.language.translate.Translator;
+import org.apache.tika.metadata.filter.CompositeMetadataFilter;
+import org.apache.tika.metadata.filter.DefaultMetadataFilter;
+import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.mime.MimeTypeException;
@@ -104,6 +107,10 @@ public class TikaConfig {
return new SimpleThreadPoolExecutor();
}
+ private static MetadataFilter getDefaultMetadataFilter(ServiceLoader loader) {
+ return new DefaultMetadataFilter(loader);
+ }
+
//use this to look for unneeded instantiations of TikaConfig
protected static AtomicInteger TIMES_INSTANTIATED = new AtomicInteger();
@@ -115,6 +122,7 @@ public class TikaConfig {
private final MimeTypes mimeTypes;
private final ExecutorService executorService;
private final EncodingDetector encodingDetector;
+ private final MetadataFilter metadataFilter;
public TikaConfig(String file)
throws TikaException, IOException, SAXException {
@@ -180,6 +188,7 @@ public class TikaConfig {
TranslatorXmlLoader translatorLoader = new TranslatorXmlLoader();
ExecutorServiceXmlLoader executorLoader = new ExecutorServiceXmlLoader();
EncodingDetectorXmlLoader encodingDetectorXmlLoader = new EncodingDetectorXmlLoader();
+ MetadataFilterXmlLoader metadataFilterXmlLoader = new MetadataFilterXmlLoader();
updateXMLReaderUtils(element);
this.mimeTypes = typesFromDomElement(element);
this.detector = detectorLoader.loadOverall(element, mimeTypes, loader);
@@ -189,6 +198,7 @@ public class TikaConfig {
this.parser = parserLoader.loadOverall(element, mimeTypes, loader);
this.translator = translatorLoader.loadOverall(element, mimeTypes, loader);
this.executorService = executorLoader.loadOverall(element, mimeTypes, loader);
+ this.metadataFilter = metadataFilterXmlLoader.loadOverall(element, mimeTypes, loader);
this.serviceLoader = loader;
TIMES_INSTANTIATED.incrementAndGet();
}
@@ -214,6 +224,7 @@ public class TikaConfig {
this.parser = getDefaultParser(mimeTypes, serviceLoader, encodingDetector);
this.translator = getDefaultTranslator(serviceLoader);
this.executorService = getDefaultExecutorService();
+ this.metadataFilter = getDefaultMetadataFilter(serviceLoader);
TIMES_INSTANTIATED.incrementAndGet();
}
@@ -249,6 +260,7 @@ public class TikaConfig {
this.detector = getDefaultDetector(mimeTypes, serviceLoader);
this.translator = getDefaultTranslator(serviceLoader);
this.executorService = getDefaultExecutorService();
+ this.metadataFilter = getDefaultMetadataFilter(serviceLoader);
} else {
ServiceLoader tmpServiceLoader = new ServiceLoader();
try (InputStream stream = getConfigInputStream(config, tmpServiceLoader)) {
@@ -259,7 +271,8 @@ public class TikaConfig {
EncodingDetectorXmlLoader encodingDetectorLoader = new EncodingDetectorXmlLoader();
TranslatorXmlLoader translatorLoader = new TranslatorXmlLoader();
ExecutorServiceXmlLoader executorLoader = new ExecutorServiceXmlLoader();
-
+ MetadataFilterXmlLoader metadataFilterXmlLoader = new MetadataFilterXmlLoader();
+
this.mimeTypes = typesFromDomElement(element);
this.encodingDetector = encodingDetectorLoader.loadOverall(element, mimeTypes, serviceLoader);
@@ -269,6 +282,7 @@ public class TikaConfig {
this.detector = detectorLoader.loadOverall(element, mimeTypes, serviceLoader);
this.translator = translatorLoader.loadOverall(element, mimeTypes, serviceLoader);
this.executorService = executorLoader.loadOverall(element, mimeTypes, serviceLoader);
+ this.metadataFilter = metadataFilterXmlLoader.loadOverall(element, mimeTypes, serviceLoader);
} catch (SAXException e) {
throw new TikaException(
"Specified Tika configuration has syntax errors: "
@@ -393,6 +407,9 @@ public class TikaConfig {
return serviceLoader;
}
+ public MetadataFilter getMetadataFilter() {
+ return metadataFilter;
+ }
/**
* Provides a default configuration (TikaConfig). Currently creates a
* new instance each time it's called; we may be able to have it
@@ -1101,7 +1118,8 @@ public class TikaConfig {
}
@Override
- CompositeEncodingDetector createComposite(List<EncodingDetector> encodingDetectors, MimeTypes mimeTypes, ServiceLoader loader) {
+ CompositeEncodingDetector createComposite(List<EncodingDetector> encodingDetectors,
+ MimeTypes mimeTypes, ServiceLoader loader) {
return new CompositeEncodingDetector(encodingDetectors);
}
@@ -1142,5 +1160,91 @@ public class TikaConfig {
}
}
+ private static class MetadataFilterXmlLoader extends
+ XmlLoader<MetadataFilter, MetadataFilter> {
+
+ boolean supportsComposite() {
+ return true;
+ }
+
+ String getParentTagName() {
+ return "metadataFilters";
+ }
+
+ String getLoaderTagName() {
+ return "metadataFilter";
+ }
+
+ @Override
+ Class<? extends MetadataFilter> getLoaderClass() {
+ return MetadataFilter.class;
+ }
+
+
+ @Override
+ boolean isComposite(MetadataFilter loaded) {
+ return loaded instanceof CompositeMetadataFilter;
+ }
+
+ @Override
+ boolean isComposite(Class<? extends MetadataFilter> loadedClass) {
+ return CompositeMetadataFilter.class.isAssignableFrom(loadedClass);
+ }
+
+ @Override
+ MetadataFilter preLoadOne(Class<? extends MetadataFilter> loadedClass,
+ String classname, MimeTypes mimeTypes) throws TikaException {
+ // Check for classes which can't be set in config
+ // Continue with normal loading
+ return null;
+ }
+
+ @Override
+ MetadataFilter createDefault(MimeTypes mimeTypes, ServiceLoader loader) {
+ return getDefaultMetadataFilter(loader);
+ }
+
+ //this ignores the service loader
+ @Override
+ MetadataFilter createComposite(List<MetadataFilter> loaded, MimeTypes mimeTypes, ServiceLoader loader) {
+ return new DefaultMetadataFilter(loaded);
+ }
+
+ @Override
+ MetadataFilter createComposite(Class<? extends MetadataFilter> metadataFilterClass,
+ List<MetadataFilter> childMetadataFilters,
+ Set<Class<? extends MetadataFilter>> excludeFilters,
+ Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
+ throws InvocationTargetException, IllegalAccessException,
+ InstantiationException {
+ MetadataFilter metadataFilter = null;
+ Constructor<? extends MetadataFilter> c;
+
+ // Try the possible default and composite metadata filter constructors
+ if (metadataFilter == null) {
+ try {
+ c = metadataFilterClass.getConstructor(ServiceLoader.class, Collection.class);
+ metadataFilter = c.newInstance(loader, excludeFilters);
+ } catch (NoSuchMethodException me) {
+ me.printStackTrace();
+ }
+ }
+ if (metadataFilter == null) {
+ try {
+ c = metadataFilterClass.getConstructor(List.class);
+ metadataFilter = c.newInstance(childMetadataFilters);
+ } catch (NoSuchMethodException me) {
+ me.printStackTrace();
+ }
+ }
+
+ return metadataFilter;
+ }
+
+ @Override
+ MetadataFilter decorate(MetadataFilter created, Element element) {
+ return created; // No decoration of MetadataFilters
+ }
+ }
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java
new file mode 100644
index 0000000..05324f2
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.filter;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * This class clears the entire metadata object if the
+ * mime matches the mime filter. The idea is that you might not want
+ * to store/transmit metadata for images or specific file types.
+ */
+public class ClearByMimeMetadataFilter implements MetadataFilter {
+ private final Set<String> mimes;
+
+ public ClearByMimeMetadataFilter() {
+ this(new HashSet<>());
+ }
+
+ public ClearByMimeMetadataFilter(Set<String> mimes) {
+ this.mimes = mimes;
+ }
+
+ @Override
+ public void filter(Metadata metadata) throws TikaException {
+ String mimeString = metadata.get(Metadata.CONTENT_TYPE);
+ if (mimeString == null) {
+ return;
+ }
+ MediaType mt = MediaType.parse(mimeString);
+ if (mt != null) {
+ mimeString = mt.getBaseType().toString();
+ }
+ if (mimes.contains(mimeString)) {
+ for (String n : metadata.names()) {
+ metadata.remove(n);
+ }
+
+ }
+ }
+
+ /**
+ *
+ * @param mimesString comma-delimited list of mimes that will trigger complete removal of metadata
+ */
+ @Field
+ public void setMimes(String mimesString) {
+ for (String include : mimesString.split(",")) {
+ mimes.add(include);
+ }
+ }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java
new file mode 100644
index 0000000..4d592c9
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.filter;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+import java.util.List;
+
+public class CompositeMetadataFilter implements MetadataFilter {
+
+ private final List<MetadataFilter> filters;
+
+ public CompositeMetadataFilter(List<MetadataFilter> filters) {
+ this.filters = filters;
+ }
+
+ @Override
+ public void filter(Metadata metadata) throws TikaException {
+ for (MetadataFilter filter : filters) {
+ filter.filter(metadata);
+ }
+ }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/DefaultMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/DefaultMetadataFilter.java
new file mode 100644
index 0000000..7671f50
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/DefaultMetadataFilter.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.filter;
+
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.utils.ServiceLoaderUtils;
+
+import java.util.List;
+
+public class DefaultMetadataFilter extends CompositeMetadataFilter {
+
+ private static List<MetadataFilter> getDefaultFilters(
+ ServiceLoader loader) {
+ List<MetadataFilter> detectors = loader.loadStaticServiceProviders(MetadataFilter.class);
+ ServiceLoaderUtils.sortLoadedClasses(detectors);
+
+ return detectors;
+ }
+
+ public DefaultMetadataFilter(ServiceLoader serviceLoader) {
+ super(getDefaultFilters(serviceLoader));
+ }
+
+ public DefaultMetadataFilter(List<MetadataFilter> metadataFilters) {
+ super(metadataFilters);
+ }
+
+ public DefaultMetadataFilter() {
+ this(new ServiceLoader());
+ }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java
new file mode 100644
index 0000000..3b6e2a0
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.filter;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+import java.util.HashSet;
+import java.util.Set;
+
+public class ExcludeFieldMetadataFilter implements MetadataFilter {
+ private final Set<String> exclude;
+
+ public ExcludeFieldMetadataFilter() {
+ this(new HashSet<>());
+ }
+ public ExcludeFieldMetadataFilter(Set<String> exclude) {
+ this.exclude = exclude;
+ }
+
+ @Override
+ public void filter(Metadata metadata) throws TikaException {
+ for (String field : exclude) {
+ metadata.remove(field);
+ }
+ }
+
+ /**
+ *
+ * @param excludeString comma-delimited list of fields to exclude
+ */
+ @Field
+ public void setExclude(String excludeString) {
+ for (String include : excludeString.split(",")) {
+ exclude.add(include);
+ }
+ }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java
new file mode 100644
index 0000000..4bc6c9e
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.filter;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+public class IncludeFieldMetadataFilter implements MetadataFilter {
+ private final Set<String> includeSet;
+
+ public IncludeFieldMetadataFilter() {
+ this(new HashSet<>());
+ }
+
+ public IncludeFieldMetadataFilter(Set<String> fields) {
+ this.includeSet = fields;
+ }
+
+ /**
+ *
+ * @param includeString comma-delimited list of fields to include
+ */
+ @Field
+ public void setInclude(String includeString) {
+ for (String include : includeString.split(",")) {
+ includeSet.add(include);
+ }
+ }
+
+ @Override
+ public void filter(Metadata metadata) throws TikaException {
+
+ for (String n : metadata.names()) {
+ if (! includeSet.contains(n)) {
+ metadata.remove(n);
+ }
+ }
+ }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java
new file mode 100644
index 0000000..7a8f345
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.metadata.filter;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+import java.io.Serializable;
+
+/**
+ * Filters the metadata in place
+ *
+ * @since Apache Tika 1.25
+ */
+public interface MetadataFilter extends Serializable {
+
+ void filter(Metadata metadata) throws TikaException;
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java
new file mode 100644
index 0000000..9cd1ec3
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.filter;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * This filter performs no operations on the metadata
+ * and leaves it untouched.
+ */
+public class NoOpFilter implements MetadataFilter {
+    /** Shared instance; final so that no caller can reassign the default filter. */
+    public static final NoOpFilter NOOP_FILTER = new NoOpFilter();
+
+    @Override
+    public void filter(Metadata metadata) throws TikaException {
+        //no op: the metadata is deliberately left exactly as it was passed in
+    }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
index 408598f..50f0fb8 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
@@ -16,7 +16,10 @@
*/
package org.apache.tika.sax;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.filter.MetadataFilter;
+import org.apache.tika.metadata.filter.NoOpFilter;
import org.apache.tika.utils.ParserUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -40,12 +43,13 @@ import java.util.List;
public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrapperHandler {
protected final List<Metadata> metadataList = new LinkedList<>();
+ private final MetadataFilter metadataFilter;
/**
* Create a handler with no limit on the number of embedded resources
*/
public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory) {
- super(contentHandlerFactory);
+ this(contentHandlerFactory, -1, NoOpFilter.NOOP_FILTER);
}
/**
@@ -54,7 +58,13 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe
* @param maxEmbeddedResources number of embedded resources that will be parsed
*/
public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources) {
+ this(contentHandlerFactory, maxEmbeddedResources, NoOpFilter.NOOP_FILTER);
+ }
+
+ public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources,
+ MetadataFilter metadataFilter) {
super(contentHandlerFactory, maxEmbeddedResources);
+ this.metadataFilter = metadataFilter;
}
/**
@@ -79,7 +89,15 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe
public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
super.endEmbeddedDocument(contentHandler, metadata);
addContent(contentHandler, metadata);
- metadataList.add(ParserUtils.cloneMetadata(metadata));
+ try {
+ metadataFilter.filter(metadata);
+ } catch (TikaException e) {
+ throw new SAXException(e);
+ }
+
+ if (metadata.size() > 0) {
+ metadataList.add(ParserUtils.cloneMetadata(metadata));
+ }
}
/**
@@ -92,8 +110,15 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe
public void endDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
super.endDocument(contentHandler, metadata);
addContent(contentHandler, metadata);
+ try {
+ metadataFilter.filter(metadata);
+ } catch (TikaException e) {
+ throw new SAXException(e);
+ }
- metadataList.add(0, ParserUtils.cloneMetadata(metadata));
+ if (metadata.size() > 0) {
+ metadataList.add(0, ParserUtils.cloneMetadata(metadata));
+ }
}
/**
diff --git a/tika-core/src/main/resources/META-INF/services/org.apache.tika.metadata.filter.MetadataFilter b/tika-core/src/main/resources/META-INF/services/org.apache.tika.metadata.filter.MetadataFilter
new file mode 100644
index 0000000..604a480
--- /dev/null
+++ b/tika-core/src/main/resources/META-INF/services/org.apache.tika.metadata.filter.MetadataFilter
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+org.apache.tika.metadata.filter.NoOpFilter
\ No newline at end of file
diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
index 5c406cd..1b8722d 100644
--- a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
+++ b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
@@ -327,4 +327,6 @@ public class TikaConfigTest extends AbstractTikaConfigTest {
getConfig("TIKA-2732-xmlreaderutils-exc.xml");
}
+
+
}
\ No newline at end of file
diff --git a/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java
new file mode 100644
index 0000000..0632dd4
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.filter;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+import java.util.Locale;
+
+/**
+ * Mock Filter for testing uppercasing of all values
+ */
+public class MockUpperCaseFilter implements MetadataFilter {
+    /** Rewrites every value of every field as its upper-case form under {@code Locale.US}. */
+    @Override
+    public void filter(Metadata metadata) throws TikaException {
+        for (String fieldName : metadata.names()) {
+            String[] originalValues = metadata.getValues(fieldName);
+            metadata.remove(fieldName);
+            for (String value : originalValues) {
+                metadata.add(fieldName, value.toUpperCase(Locale.US));
+            }
+        }
+    }
+}
diff --git a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
new file mode 100644
index 0000000..e933d0c
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.metadata.filter;
+
+import org.apache.tika.config.AbstractTikaConfigTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.junit.Test;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+/**
+ * Exercises the {@link MetadataFilter} implementations, both constructed directly
+ * and loaded through TIKA-3137-*.xml config files.
+ */
+public class TestMetadataFilter extends AbstractTikaConfigTest {
+
+    @Test
+    public void testDefault() throws Exception {
+        // the default filter is expected to leave both fields untouched
+        Metadata metadata = new Metadata();
+        metadata.set("title", "title");
+        metadata.set("author", "author");
+
+        MetadataFilter defaultFilter = new DefaultMetadataFilter();
+        defaultFilter.filter(metadata);
+
+        assertEquals(2, metadata.names().length);
+        assertEquals("title", metadata.get("title"));
+        assertEquals("author", metadata.get("author"));
+    }
+
+    @Test
+    public void testIncludeFilter() throws Exception {
+        // only the included field ("title") may survive
+        Metadata metadata = new Metadata();
+        metadata.set("title", "title");
+        metadata.set("author", "author");
+
+        MetadataFilter filter = new IncludeFieldMetadataFilter(set("title"));
+        filter.filter(metadata);
+        assertEquals(1, metadata.names().length);
+        assertEquals("title", metadata.get("title"));
+        assertNull(metadata.get("author"));
+    }
+
+    @Test
+    public void testExcludeFilter() throws Exception {
+        // only the excluded field ("title") may be dropped
+        Metadata metadata = new Metadata();
+        metadata.set("title", "title");
+        metadata.set("author", "author");
+
+        MetadataFilter filter = new ExcludeFieldMetadataFilter(set("title"));
+        filter.filter(metadata);
+        assertEquals(1, metadata.names().length);
+        assertEquals("author", metadata.get("author"));
+        assertNull(metadata.get("title"));
+    }
+
+    @Test
+    public void testConfigIncludeFilter() throws Exception {
+        // the config includes "title,author", so "content" must be removed
+        TikaConfig config = getConfig("TIKA-3137-include.xml");
+        Metadata metadata = new Metadata();
+        metadata.set("title", "title");
+        metadata.set("author", "author");
+        metadata.set("content", "content");
+
+        config.getMetadataFilter().filter(metadata);
+
+        assertEquals(2, metadata.size());
+        assertEquals("title", metadata.get("title"));
+        assertEquals("author", metadata.get("author"));
+    }
+
+    @Test
+    public void testConfigExcludeFilter() throws Exception {
+        // the config excludes "title,author", so only "content" must remain
+        TikaConfig config = getConfig("TIKA-3137-exclude.xml");
+        Metadata metadata = new Metadata();
+        metadata.set("title", "title");
+        metadata.set("author", "author");
+        metadata.set("content", "content");
+
+        config.getMetadataFilter().filter(metadata);
+
+        assertEquals(1, metadata.size());
+        assertEquals("content", metadata.get("content"));
+    }
+
+    @Test
+    public void testConfigIncludeAndUCFilter() throws Exception {
+        // the config chains the include filter with MockUpperCaseFilter;
+        // multi-valued fields must keep their value order
+        TikaConfig config = getConfig("TIKA-3137-include-uc.xml");
+        String[] expectedTitles = new String[]{
+                "TITLE1", "TITLE2", "TITLE3"
+        };
+        Metadata metadata = new Metadata();
+        metadata.add("title", "title1");
+        metadata.add("title", "title2");
+        metadata.add("title", "title3");
+        metadata.set("author", "author");
+        metadata.set("content", "content");
+
+        config.getMetadataFilter().filter(metadata);
+
+        assertEquals(2, metadata.size());
+        assertArrayEquals(expectedTitles, metadata.getValues("title"));
+        assertEquals("AUTHOR", metadata.get("author"));
+    }
+
+    @Test
+    public void testMimeClearingFilter() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, MediaType.image("jpeg").toString());
+        metadata.set("author", "author");
+
+        // image/jpeg is listed, so the whole metadata object must be emptied
+        MetadataFilter filter = new ClearByMimeMetadataFilter(set("image/jpeg","application/pdf"));
+        filter.filter(metadata);
+        assertEquals(0, metadata.size());
+
+        // text/plain is not listed, so the same filter must leave it alone
+        metadata.set(Metadata.CONTENT_TYPE, MediaType.text("plain").toString());
+        metadata.set("author", "author");
+        filter.filter(metadata);
+        assertEquals(2, metadata.size());
+        assertEquals("author", metadata.get("author"));
+    }
+
+    @Test
+    public void testMimeClearingFilterConfig() throws Exception {
+        // the config chains ClearByMimeMetadataFilter with MockUpperCaseFilter
+        TikaConfig config = getConfig("TIKA-3137-mimes-uc.xml");
+
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, MediaType.image("jpeg").toString());
+        metadata.set("author", "author");
+
+        MetadataFilter filter = config.getMetadataFilter();
+        filter.filter(metadata);
+        assertEquals(0, metadata.size());
+
+        // an unlisted mime type is kept, and its values are then upper-cased
+        metadata.set(Metadata.CONTENT_TYPE, MediaType.text("plain").toString());
+        metadata.set("author", "author");
+        filter.filter(metadata);
+        assertEquals(2, metadata.size());
+        assertEquals("AUTHOR", metadata.get("author"));
+    }
+
+    /** Builds a mutable set from the given items. */
+    private static Set<String> set(String ... items) {
+        Set<String> set = new HashSet<>();
+        for (String item : items) {
+            set.add(item);
+        }
+        return set;
+    }
+}
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-exclude.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-exclude.xml
new file mode 100644
index 0000000..27517f6
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-exclude.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <metadataFilters>
+ <metadataFilter class="org.apache.tika.metadata.filter.ExcludeFieldMetadataFilter">
+ <params>
+ <param name="exclude" type="string">title,author</param>
+ </params>
+ </metadataFilter>
+ </metadataFilters>
+</properties>
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include-uc.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include-uc.xml
new file mode 100644
index 0000000..e0df476
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include-uc.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <metadataFilters>
+ <metadataFilter class="org.apache.tika.metadata.filter.IncludeFieldMetadataFilter">
+ <params>
+ <param name="include" type="string">title,author</param>
+ </params>
+ </metadataFilter>
+ <metadataFilter class="org.apache.tika.metadata.filter.MockUpperCaseFilter"/>
+ </metadataFilters>
+</properties>
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include.xml
new file mode 100644
index 0000000..e92dff8
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <metadataFilters>
+ <metadataFilter class="org.apache.tika.metadata.filter.IncludeFieldMetadataFilter">
+ <params>
+ <param name="include" type="string">title,author</param>
+ </params>
+ </metadataFilter>
+ </metadataFilters>
+</properties>
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-mimes-uc.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-mimes-uc.xml
new file mode 100644
index 0000000..486280c
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-mimes-uc.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <metadataFilters>
+ <metadataFilter class="org.apache.tika.metadata.filter.ClearByMimeMetadataFilter">
+ <params>
+ <param name="mimes" type="string">image/jpeg,application/pdf</param>
+ </params>
+ </metadataFilter>
+ <metadataFilter class="org.apache.tika.metadata.filter.MockUpperCaseFilter"/>
+ </metadataFilters>
+</properties>
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index a5182c6..349f271 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -21,6 +21,7 @@ package org.apache.tika.parser;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
import java.io.IOException;
import java.io.InputStream;
@@ -30,6 +31,7 @@ import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.ClosedInputStream;
import org.apache.tika.io.ProxyInputStream;
@@ -365,6 +367,47 @@ public class RecursiveParserWrapperTest extends TikaTest {
}
+ @Test
+ public void testIncludeFilter() throws Exception {
+     //TIKA-3137
+     ParseContext context = new ParseContext();
+     Metadata metadata = new Metadata();
+     TikaConfig tikaConfig;
+     // close the config stream once the TikaConfig has been built from it
+     try (InputStream configStream = getClass().getResourceAsStream("TIKA-3137-include.xml")) {
+         tikaConfig = new TikaConfig(configStream);
+     }
+     Parser p = new AutoDetectParser(tikaConfig);
+     RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, true);
+     String path = "/test-documents/test_recursive_embedded.docx";
+     ContentHandlerFactory contentHandlerFactory =
+             new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+                     -1);
+
+     RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(contentHandlerFactory,
+             -1, tikaConfig.getMetadataFilter());
+     try (InputStream is = getClass().getResourceAsStream(path)) {
+         wrapper.parse(is, handler, metadata, context);
+     }
+     List<Metadata> metadataList = handler.getMetadataList();
+     assertEquals(5, metadataList.size());
+
+     // the only keys the include filter in the config lets through
+     Set<String> expectedKeys = new HashSet<>();
+     expectedKeys.add("X-TIKA:content");
+     expectedKeys.add("extended-properties:Application");
+     expectedKeys.add("Content-Type");
+     for (Metadata m : metadataList) {
+         String contentType = m.get(Metadata.CONTENT_TYPE);
+         // report a missing Content-Type as a test failure, not a NullPointerException
+         assertTrue("Content-Type is on the include list and should be present",
+                 contentType != null);
+         if (contentType.equals("image/emf")) {
+             fail("emf should have been filtered out");
+         }
+         if (contentType.startsWith("text/plain")) {
+             fail("text/plain should have been filtered out");
+         }
+         assertTrue(m.names().length >= 2);
+         for (String n : m.names()) {
+             if (! expectedKeys.contains(n)) {
+                 fail("didn't expect "+n);
+             }
+         }
+     }
+ }
+
private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory,
boolean catchEmbeddedExceptions,
DigestingParser.Digester digester) throws Exception {
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/TIKA-3137-include.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/TIKA-3137-include.xml
new file mode 100644
index 0000000..765bc11
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/TIKA-3137-include.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <metadataFilters>
+ <metadataFilter class="org.apache.tika.metadata.filter.IncludeFieldMetadataFilter">
+ <params>
+ <param name="include" type="string">X-TIKA:content,extended-properties:Application,Content-Type</param>
+ </params>
+ </metadataFilter>
+ <metadataFilter class="org.apache.tika.metadata.filter.ClearByMimeMetadataFilter">
+ <params>
+ <param name="mimes" type="string">image/emf,text/plain</param>
+ </params>
+ </metadataFilter>
+ </metadataFilters>
+</properties>
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
index 07d20c5..71e7180 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
@@ -152,7 +152,8 @@ public class RecursiveMetadataResource {
BasicContentHandlerFactory.HANDLER_TYPE type =
BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE);
RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
- new BasicContentHandlerFactory(type, writeLimit), maxEmbeddedResources);
+ new BasicContentHandlerFactory(type, writeLimit), maxEmbeddedResources,
+ TikaResource.getConfig().getMetadataFilter());
try {
TikaResource.parse(wrapper, LOG, info.getPath(), is, handler, metadata, context);
} catch (SecurityException e) {
diff --git a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
index 92c9d34..8b5f153 100644
--- a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
+++ b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
@@ -90,7 +90,8 @@ public abstract class CXFTestBase {
@Before
public void setUp() throws Exception {
- this.tika = new TikaConfig(getClass().getResourceAsStream("tika-config-for-server-tests.xml"));
+
+ this.tika = new TikaConfig(getTikaConfigInputStream());
TikaResource.init(tika,
new CommonsDigester(DIGESTER_READ_LIMIT, "md5,sha1:32"),
new DefaultInputStreamFactory(), new ServerStatus(true));
@@ -120,6 +121,10 @@ public abstract class CXFTestBase {
server = sf.create();
}
+ /**
+  * Supplies the Tika config stream that {@code setUp()} loads; subclasses may
+  * override this to run the server tests against a different config resource.
+  */
+ protected InputStream getTikaConfigInputStream() {
+     final String configResource = "tika-config-for-server-tests.xml";
+     return getClass().getResourceAsStream(configResource);
+ }
+
/**
* Have the test do {@link JAXRSServerFactoryBean#setResourceClasses(Class...)}
* and {@link JAXRSServerFactoryBean#setResourceProvider(Class, org.apache.cxf.jaxrs.lifecycle.ResourceProvider)}
diff --git a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataFilterTest.java b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataFilterTest.java
new file mode 100644
index 0000000..748ee77
--- /dev/null
+++ b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataFilterTest.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server;
+
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
+import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.server.resource.RecursiveMetadataResource;
+import org.apache.tika.server.writer.MetadataListMessageBodyWriter;
+import org.junit.Test;
+
+import javax.ws.rs.core.Response;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.apache.tika.TikaTest.assertNotContained;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+/**
+ * Checks that the /rmeta endpoint applies the metadata filters configured in
+ * TIKA-3137-include.xml to the metadata list it returns.
+ */
+public class RecursiveMetadataFilterTest extends CXFTestBase {
+
+    private static final String META_PATH = "/rmeta";
+
+    private static final String TEST_RECURSIVE_DOC = "test_recursive_embedded.docx";
+
+    /** Swaps in the config that declares the include and clear-by-mime filters. */
+    @Override
+    protected InputStream getTikaConfigInputStream() {
+        return getClass().getResourceAsStream("TIKA-3137-include.xml");
+    }
+
+    @Override
+    protected void setUpResources(JAXRSServerFactoryBean sf) {
+        sf.setResourceClasses(RecursiveMetadataResource.class);
+        sf.setResourceProvider(RecursiveMetadataResource.class,
+                new SingletonResourceProvider(new RecursiveMetadataResource()));
+    }
+
+    @Override
+    protected void setUpProviders(JAXRSServerFactoryBean sf) {
+        List<Object> providers = new ArrayList<>();
+        providers.add(new MetadataListMessageBodyWriter());
+        sf.setProviders(providers);
+    }
+
+    @Test
+    public void testBasicFilter() throws Exception {
+        Response response = WebClient
+                .create(endPoint + META_PATH)
+                .accept("application/json")
+                .acceptEncoding("gzip")
+                .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+
+        List<Metadata> metadataList;
+        // close the gzip/JSON reader as soon as the response body has been parsed
+        try (Reader reader = new InputStreamReader(
+                new GzipCompressorInputStream((InputStream) response.getEntity()), UTF_8)) {
+            metadataList = JsonMetadataList.fromJson(reader);
+        }
+        assertEquals(5, metadataList.size());
+
+        // the only keys the include filter in the config lets through
+        Set<String> expectedKeys = new HashSet<>();
+        expectedKeys.add("X-TIKA:content");
+        expectedKeys.add("extended-properties:Application");
+        expectedKeys.add("Content-Type");
+        for (Metadata m : metadataList) {
+            String contentType = m.get(Metadata.CONTENT_TYPE);
+            // report a missing Content-Type as a test failure, not a NullPointerException
+            assertNotNull("Content-Type is on the include list and should be present", contentType);
+            if (contentType.equals("image/emf")) {
+                fail("emf should have been filtered out");
+            }
+            if (contentType.startsWith("text/plain")) {
+                fail("text/plain should have been filtered out");
+            }
+            assertTrue(m.names().length >= 2);
+            for (String n : m.names()) {
+                if (! expectedKeys.contains(n)) {
+                    fail("didn't expect "+n);
+                }
+            }
+        }
+    }
+}
diff --git a/tika-server/src/test/resources/org/apache/tika/server/TIKA-3137-include.xml b/tika-server/src/test/resources/org/apache/tika/server/TIKA-3137-include.xml
new file mode 100644
index 0000000..765bc11
--- /dev/null
+++ b/tika-server/src/test/resources/org/apache/tika/server/TIKA-3137-include.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <metadataFilters>
+ <metadataFilter class="org.apache.tika.metadata.filter.IncludeFieldMetadataFilter">
+ <params>
+ <param name="include" type="string">X-TIKA:content,extended-properties:Application,Content-Type</param>
+ </params>
+ </metadataFilter>
+ <metadataFilter class="org.apache.tika.metadata.filter.ClearByMimeMetadataFilter">
+ <params>
+ <param name="mimes" type="string">image/emf,text/plain</param>
+ </params>
+ </metadataFilter>
+ </metadataFilters>
+</properties>