Posted to commits@tika.apache.org by ta...@apache.org on 2020/07/17 17:47:03 UTC

[tika] branch branch_1x updated (6686a6f -> d2aa1ac)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 6686a6f  fix for TIKA-3139 contributed by wiwi (#328)
     new 9f441f5  TIKA-3129 -- add a status endpoint to report server status.  Users must turn it on via the commandline -status option.
     new db4498d  TIKA-3137 -- first pass, need to add unit tests for tika-batch
     new 3388d28  TIKA-3140 -- initial commit
     new ae21558  fix merge conflicts
     new 096a4ad  TIKA-3137 add a list type for Param/configuration to avoid the comma-delimited lists which will get huge and ugly and were a bad idea.
     new d2aa1ac  fix merge conflicts

The 6 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |   4 +-
 .../batch/fs/RecursiveParserWrapperFSConsumer.java |   9 +-
 .../tika/batch/fs/StreamOutRPWFSConsumer.java      |  20 ++-
 .../fs/builders/BasicTikaFSConsumersBuilder.java   |  11 +-
 .../RecursiveParserWrapperFSConsumerTest.java      |   5 +-
 .../main/java/org/apache/tika/config/Param.java    |  89 +++++++++--
 .../java/org/apache/tika/config/TikaConfig.java    | 139 +++++++++++++++--
 .../metadata/filter/ClearByMimeMetadataFilter.java |  72 +++++++++
 .../metadata/filter/CompositeMetadataFilter.java   |  23 +--
 .../filter/DefaultMetadataFilter.java}             |  38 +++--
 .../filter/ExcludeFieldMetadataFilter.java         |  41 +++--
 .../filter/IncludeFieldMetadataFilter.java}        |  54 +++----
 .../tika/metadata/filter/MetadataFilter.java       |  15 +-
 .../apache/tika/metadata/filter/NoOpFilter.java    |  21 ++-
 .../tika/sax/RecursiveParserWrapperHandler.java    |  31 +++-
 .../org.apache.tika.metadata.filter.MetadataFilter |   2 +-
 .../java/org/apache/tika/config/ParamTest.java     |   7 +
 .../org/apache/tika/config/TikaConfigTest.java     |   2 +
 .../tika/metadata/filter/MockUpperCaseFilter.java  |  27 ++--
 .../tika/metadata/filter/TestMetadataFilter.java   | 170 +++++++++++++++++++++
 .../tika/parser/ParameterizedParserTest.java       |   3 +-
 ...3-vowel-parser-ae.xml => TIKA-3137-exclude.xml} |  13 +-
 ...owel-parser-ae.xml => TIKA-3137-include-uc.xml} |  14 +-
 ...3-vowel-parser-ae.xml => TIKA-3137-include.xml} |  13 +-
 ...-vowel-parser-ae.xml => TIKA-3137-mimes-uc.xml} |  14 +-
 .../tika/eval/metadata/TikaEvalMetadataFilter.java | 104 +++++++++++++
 .../eval/metadata/TikaEvalMetadataFilterTest.java  |  51 +++++++
 .../tika/parser/RecursiveParserWrapperTest.java    |  43 ++++++
 .../org/apache/tika/parser/TIKA-3137-include.xml   |  22 ++-
 .../java/org/apache/tika/server/ServerStatus.java  |   9 +-
 .../java/org/apache/tika/server/TikaServerCli.java |   9 ++
 .../server/resource/RecursiveMetadataResource.java |   3 +-
 .../{TikaVersion.java => TikaServerStatus.java}    |  27 ++--
 ...ONMessageBodyWriter.java => JSONObjWriter.java} |  33 ++--
 .../java/org/apache/tika/server/CXFTestBase.java   |   7 +-
 .../tika/server/RecursiveMetadataFilterTest.java   | 108 +++++++++++++
 .../apache/tika/server/TikaServerStatusTest.java   |  56 +++++++
 .../org/apache/tika/server/TIKA-3137-include.xml   |  22 ++-
 38 files changed, 1121 insertions(+), 210 deletions(-)
 create mode 100644 tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java
 copy tika-server/src/main/java/org/apache/tika/server/MetadataList.java => tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java (65%)
 copy tika-core/src/main/java/org/apache/tika/{mime/OrClause.java => metadata/filter/DefaultMetadataFilter.java} (51%)
 copy tika-fuzzing/src/main/java/org/apache/tika/fuzzing/Transformer.java => tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java (53%)
 copy tika-core/src/main/java/org/apache/tika/{detect/NonDetectingEncodingDetector.java => metadata/filter/IncludeFieldMetadataFilter.java} (51%)
 copy tika-parsers/src/main/java/org/apache/tika/parser/utils/DataURISchemeParseException.java => tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java (74%)
 copy tika-server/src/main/java/org/apache/tika/server/MetadataList.java => tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java (69%)
 copy tika-parsers/src/main/resources/org/apache/tika/parser/ner/nltk/NLTKServer.properties => tika-core/src/main/resources/META-INF/services/org.apache.tika.metadata.filter.MetadataFilter (94%)
 copy tika-server/src/main/java/org/apache/tika/server/DefaultInputStreamFactory.java => tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java (61%)
 create mode 100644 tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
 copy tika-core/src/test/resources/org/apache/tika/config/{TIKA-2653-vowel-parser-ae.xml => TIKA-3137-exclude.xml} (76%)
 copy tika-core/src/test/resources/org/apache/tika/config/{TIKA-2653-vowel-parser-ae.xml => TIKA-3137-include-uc.xml} (70%)
 copy tika-core/src/test/resources/org/apache/tika/config/{TIKA-2653-vowel-parser-ae.xml => TIKA-3137-include.xml} (76%)
 copy tika-core/src/test/resources/org/apache/tika/config/{TIKA-2653-vowel-parser-ae.xml => TIKA-3137-mimes-uc.xml} (70%)
 create mode 100644 tika-eval/src/main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java
 create mode 100644 tika-eval/src/test/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilterTest.java
 copy tika-core/src/test/resources/org/apache/tika/config/TIKA-2653-vowel-parser-ae.xml => tika-parsers/src/test/resources/org/apache/tika/parser/TIKA-3137-include.xml (58%)
 copy tika-server/src/main/java/org/apache/tika/server/resource/{TikaVersion.java => TikaServerStatus.java} (58%)
 copy tika-server/src/main/java/org/apache/tika/server/writer/{JSONMessageBodyWriter.java => JSONObjWriter.java} (74%)
 create mode 100644 tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataFilterTest.java
 create mode 100644 tika-server/src/test/java/org/apache/tika/server/TikaServerStatusTest.java
 copy tika-core/src/test/resources/org/apache/tika/config/TIKA-2653-vowel-parser-ae.xml => tika-server/src/test/resources/org/apache/tika/server/TIKA-3137-include.xml (58%)


[tika] 06/06: fix merge conflicts

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d2aa1ac2613e1ac6b7ed0d5abb7db5d8650a275c
Author: tallison <ta...@apache.org>
AuthorDate: Fri Jul 17 13:46:44 2020 -0400

    fix merge conflicts
---
 .../main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tika-eval/src/main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java b/tika-eval/src/main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java
index 2c69801..b33b041 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java
@@ -38,7 +38,7 @@ import java.util.Map;
 
 public class TikaEvalMetadataFilter implements MetadataFilter {
 
-    public static String TIKA_EVAL_NS = "tika-eval"+ TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+    public static String TIKA_EVAL_NS = "tika-eval" + Metadata.NAMESPACE_PREFIX_DELIMITER;
 
     public static Property NUM_TOKENS = Property.externalInteger(TIKA_EVAL_NS+"numTokens");
 


[tika] 01/06: TIKA-3129 -- add a status endpoint to report server status. Users must turn it on via the commandline -status option.

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 9f441f51667b9d404650034227c758c68712352d
Author: tallison <ta...@apache.org>
AuthorDate: Fri Jul 17 13:01:43 2020 -0400

    TIKA-3129 -- add a status endpoint to report server status.  Users must turn it on
    via the commandline -status option.
---
 .../java/org/apache/tika/server/ServerStatus.java  |  9 ++-
 .../java/org/apache/tika/server/TikaServerCli.java |  9 +++
 .../tika/server/resource/TikaServerStatus.java     | 44 +++++++++++++++
 .../apache/tika/server/writer/JSONObjWriter.java   | 64 ++++++++++++++++++++++
 .../apache/tika/server/TikaServerStatusTest.java   | 56 +++++++++++++++++++
 5 files changed, 181 insertions(+), 1 deletion(-)

diff --git a/tika-server/src/main/java/org/apache/tika/server/ServerStatus.java b/tika-server/src/main/java/org/apache/tika/server/ServerStatus.java
index 255ce70..32d74cf 100644
--- a/tika-server/src/main/java/org/apache/tika/server/ServerStatus.java
+++ b/tika-server/src/main/java/org/apache/tika/server/ServerStatus.java
@@ -81,6 +81,8 @@ public class ServerStatus {
     private final boolean isLegacy;
     private STATUS status = STATUS.OPERATING;
 
+    private volatile long lastStarted = Instant.now().toEpochMilli();
+
     public ServerStatus() {
         isLegacy = false;
     }
@@ -91,7 +93,9 @@ public class ServerStatus {
 
     public synchronized long start(TASK task, String fileName) {
         long taskId = counter.incrementAndGet();
-        tasks.put(taskId, new TaskStatus(task, Instant.now(), fileName));
+        Instant now = Instant.now();
+        lastStarted = now.toEpochMilli();
+        tasks.put(taskId, new TaskStatus(task, now, fileName));
         return taskId;
     }
 
@@ -126,6 +130,9 @@ public class ServerStatus {
         return counter.get();
     }
 
+    public long getMillisSinceLastParseStarted() {
+        return Instant.now().toEpochMilli()-lastStarted;
+    }
     /**
      *
      * @return true if this is legacy, otherwise whether or not status == OPERATING.
diff --git a/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java b/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
index 10616cd..d1b6baf 100644
--- a/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
+++ b/tika-server/src/main/java/org/apache/tika/server/TikaServerCli.java
@@ -53,12 +53,14 @@ import org.apache.tika.server.resource.TikaDetectors;
 import org.apache.tika.server.resource.TikaMimeTypes;
 import org.apache.tika.server.resource.TikaParsers;
 import org.apache.tika.server.resource.TikaResource;
+import org.apache.tika.server.resource.TikaServerStatus;
 import org.apache.tika.server.resource.TikaVersion;
 import org.apache.tika.server.resource.TikaWelcome;
 import org.apache.tika.server.resource.TranslateResource;
 import org.apache.tika.server.resource.UnpackerResource;
 import org.apache.tika.server.writer.CSVMessageBodyWriter;
 import org.apache.tika.server.writer.JSONMessageBodyWriter;
+import org.apache.tika.server.writer.JSONObjWriter;
 import org.apache.tika.server.writer.MetadataListMessageBodyWriter;
 import org.apache.tika.server.writer.TarWriter;
 import org.apache.tika.server.writer.TextMessageBodyWriter;
@@ -102,6 +104,7 @@ public class TikaServerCli {
         options.addOption("dml", "digestMarkLimit", true, "max number of bytes to mark on stream for digest");
         options.addOption("l", "log", true, "request URI log level ('debug' or 'info')");
         options.addOption("s", "includeStack", false, "whether or not to return a stack trace\nif there is an exception during 'parse'");
+        options.addOption("status", false, "enable the status endpoint");
         options.addOption("?", "help", false, "this help message");
         options.addOption("enableUnsecureFeatures", false, "this is required to enable fileUrl.");
         options.addOption("enableFileUrl", false, "allows user to pass in fileUrl instead of InputStream.");
@@ -305,6 +308,9 @@ public class TikaServerCli {
             rCoreProviders.add(new SingletonResourceProvider(new TikaDetectors()));
             rCoreProviders.add(new SingletonResourceProvider(new TikaParsers()));
             rCoreProviders.add(new SingletonResourceProvider(new TikaVersion()));
+            if (line.hasOption("status")) {
+                rCoreProviders.add(new SingletonResourceProvider(new TikaServerStatus(serverStatus)));
+            }
             List<ResourceProvider> rAllProviders = new ArrayList<>(rCoreProviders);
             rAllProviders.add(new SingletonResourceProvider(new TikaWelcome(rCoreProviders)));
             sf.setResourceProviders(rAllProviders);
@@ -318,6 +324,9 @@ public class TikaServerCli {
             providers.add(new XMPMessageBodyWriter());
             providers.add(new TextMessageBodyWriter());
             providers.add(new TikaServerParseExceptionMapper(returnStackTrace));
+            if (line.hasOption("status")) {
+                providers.add(new JSONObjWriter());
+            }
             if (logFilter != null) {
                 providers.add(logFilter);
             }
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaServerStatus.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaServerStatus.java
new file mode 100644
index 0000000..2e55221
--- /dev/null
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaServerStatus.java
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.server.resource;
+
+import org.apache.tika.server.ServerStatus;
+
+import javax.ws.rs.GET;
+import javax.ws.rs.Path;
+import javax.ws.rs.Produces;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+@Path("/status")
+public class TikaServerStatus {
+    private final ServerStatus serverStatus;
+
+    public TikaServerStatus(ServerStatus serverStatus) {
+        this.serverStatus = serverStatus;
+    }
+
+    @GET
+    @Produces("application/json")
+    public Map<String, Object> getStatus() {
+        Map<String, Object> map = new LinkedHashMap<>();
+        map.put("status", serverStatus.getStatus());
+        map.put("millis_since_last_parse_started", serverStatus.getMillisSinceLastParseStarted());
+        map.put("files_processed", serverStatus.getFilesProcessed());
+        return map;
+    }
+}
diff --git a/tika-server/src/main/java/org/apache/tika/server/writer/JSONObjWriter.java b/tika-server/src/main/java/org/apache/tika/server/writer/JSONObjWriter.java
new file mode 100644
index 0000000..08851d6
--- /dev/null
+++ b/tika-server/src/main/java/org/apache/tika/server/writer/JSONObjWriter.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server.writer;
+
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.serialization.JsonMetadata;
+
+import javax.ws.rs.Produces;
+import javax.ws.rs.WebApplicationException;
+import javax.ws.rs.core.MediaType;
+import javax.ws.rs.core.MultivaluedMap;
+import javax.ws.rs.ext.MessageBodyWriter;
+import javax.ws.rs.ext.Provider;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.lang.annotation.Annotation;
+import java.lang.reflect.Type;
+import java.util.Map;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+@Provider
+@Produces(MediaType.APPLICATION_JSON)
+public class JSONObjWriter implements MessageBodyWriter<Map<String, Object>> {
+    private static Gson GSON = new GsonBuilder().setPrettyPrinting().create();
+
+    public boolean isWriteable(Class<?> type, Type genericType, Annotation[] annotations, MediaType mediaType) {
+        return Map.class.isAssignableFrom(type);
+    }
+
+    public long getSize(Metadata data, Class<?> type, Type genericType, Annotation[] annotations, MediaType mediaType) {
+        return -1;
+    }
+
+    @Override
+    public void writeTo(Map<String, Object> map, Class<?> type, Type genericType, Annotation[] annotations,
+                        MediaType mediaType, MultivaluedMap<String, Object> httpHeaders, OutputStream entityStream)
+            throws IOException, WebApplicationException {
+        Writer writer = new OutputStreamWriter(entityStream, UTF_8);
+        GSON.toJson(map, writer);
+        writer.flush();
+        entityStream.flush();
+    }
+}
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaServerStatusTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaServerStatusTest.java
new file mode 100644
index 0000000..28e62e7
--- /dev/null
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaServerStatusTest.java
@@ -0,0 +1,56 @@
+package org.apache.tika.server;
+
+import com.google.gson.JsonObject;
+import com.google.gson.JsonParser;
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.apache.tika.server.resource.RecursiveMetadataResource;
+import org.apache.tika.server.resource.TikaResource;
+import org.apache.tika.server.resource.TikaServerStatus;
+import org.apache.tika.server.writer.JSONMessageBodyWriter;
+import org.apache.tika.server.writer.JSONObjWriter;
+import org.apache.tika.server.writer.MetadataListMessageBodyWriter;
+import org.junit.Test;
+
+import javax.ws.rs.core.Response;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class TikaServerStatusTest extends CXFTestBase {
+
+    private final static String STATUS_PATH = "/status";
+
+    @Override
+    protected void setUpResources(JAXRSServerFactoryBean sf) {
+        sf.setResourceClasses(TikaServerStatus.class);
+        sf.setResourceProvider(TikaServerStatus.class,
+                new SingletonResourceProvider(new TikaServerStatus(new ServerStatus())));
+    }
+
+    @Override
+    protected void setUpProviders(JAXRSServerFactoryBean sf) {
+        List<Object> providers = new ArrayList<>();
+        providers.add(new JSONObjWriter());
+        sf.setProviders(providers);
+    }
+
+    @Test
+    public void testBasic() throws Exception {
+        Response response = WebClient.create(endPoint + STATUS_PATH).get();
+        String jsonString =
+                getStringFromInputStream((InputStream) response.getEntity());
+        JsonObject root = JsonParser.parseString(jsonString).getAsJsonObject();
+        assertTrue(root.has("status"));
+        assertTrue(root.has("millis_since_last_parse_started"));
+        assertTrue(root.has("files_processed"));
+        assertEquals("OPERATING", root.getAsJsonPrimitive("status").getAsString());
+        assertEquals(0, root.getAsJsonPrimitive("files_processed").getAsInt());
+        long millis = root.getAsJsonPrimitive("millis_since_last_parse_started").getAsInt();
+        assertTrue(millis > 0 && millis < 360000);
+    }
+}
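
[editor's note] For orientation only, a minimal sketch of calling the endpoint this commit adds, assuming tika-server was started with the new -status option and is listening on its usual default port 9998 (the class name and port here are assumptions; the JSON field names mirror TikaServerStatusTest above):

    import java.io.InputStream;
    import java.net.URL;
    import java.nio.charset.StandardCharsets;

    public class TikaStatusCheck {
        public static void main(String[] args) throws Exception {
            // Assumes the server was started with, e.g.:
            //   java -jar tika-server-1.x.jar -status
            URL url = new URL("http://localhost:9998/status");
            try (InputStream is = url.openStream()) {
                // Per TikaServerStatus.getStatus() above, the body is a JSON object with
                // "status", "millis_since_last_parse_started" and "files_processed".
                System.out.println(new String(is.readAllBytes(), StandardCharsets.UTF_8));
            }
        }
    }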


[tika] 04/06: fix merge conflicts

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit ae21558ad30a64480fdcf935495ae5c9389fc57a
Author: tallison <ta...@apache.org>
AuthorDate: Fri Jul 17 11:55:32 2020 -0400

    fix merge conflicts
---
 .../main/java/org/apache/tika/config/Param.java    | 89 ++++++++++++++++++----
 .../java/org/apache/tika/config/TikaConfig.java    | 31 +++++---
 .../metadata/filter/ClearByMimeMetadataFilter.java |  8 +-
 .../filter/ExcludeFieldMetadataFilter.java         | 15 ++--
 .../filter/IncludeFieldMetadataFilter.java         |  8 +-
 .../java/org/apache/tika/config/ParamTest.java     |  7 ++
 .../tika/parser/ParameterizedParserTest.java       |  3 +-
 .../org/apache/tika/config/TIKA-3137-exclude.xml   |  5 +-
 .../apache/tika/config/TIKA-3137-include-uc.xml    |  5 +-
 .../org/apache/tika/config/TIKA-3137-include.xml   |  5 +-
 .../org/apache/tika/config/TIKA-3137-mimes-uc.xml  |  5 +-
 .../org/apache/tika/parser/TIKA-3137-include.xml   |  5 +-
 12 files changed, 137 insertions(+), 49 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/config/Param.java b/tika-core/src/main/java/org/apache/tika/config/Param.java
index 112955b..73e2bd9 100644
--- a/tika-core/src/main/java/org/apache/tika/config/Param.java
+++ b/tika-core/src/main/java/org/apache/tika/config/Param.java
@@ -21,6 +21,7 @@ import org.apache.tika.utils.XMLReaderUtils;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
 import org.xml.sax.SAXException;
 
 import javax.xml.parsers.DocumentBuilder;
@@ -39,7 +40,9 @@ import java.lang.reflect.InvocationTargetException;
 import java.math.BigInteger;
 import java.net.URI;
 import java.net.URL;
+import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 
 
@@ -51,8 +54,10 @@ import java.util.Map;
  */
 public class Param<T> implements Serializable {
 
+    private static final String LIST = "list";
     private static final Map<Class<?>, String> map = new HashMap<>();
     private static final Map<String, Class<?>> reverseMap = new HashMap<>();
+    private static final Map<String, Class<?>> wellKnownMap = new HashMap<>();
 
     static {
         map.put(Boolean.class, "bool");
@@ -67,26 +72,36 @@ public class Param<T> implements Serializable {
         map.put(File.class, "file");
         map.put(URI.class, "uri");
         map.put(URL.class, "url");
+        map.put(ArrayList.class, LIST);
         for (Map.Entry<Class<?>, String> entry : map.entrySet()) {
             reverseMap.put(entry.getValue(), entry.getKey());
         }
+        //wellKnownMap.put("metadataPolicy", AbstractMultipleParser.MetadataPolicy.class);
     }
 
     private Class<T> type;
 
     private String name;
 
-    private String value;
+    private List<String> valueStrings = new ArrayList<>();
 
     private T actualValue;
 
     public Param(){
     }
 
-    public Param(String name, Class<T> type, T value){
+    public Param(String name, Class<T> type, T value) {
         this.name = name;
         this.type = type;
-        this.value = value.toString();
+        this.actualValue = value;
+        if (List.class.isAssignableFrom(value.getClass())) {
+            this.valueStrings.addAll((List)value);
+        } else {
+            this.valueStrings.add(value.toString());
+        }
+        if (this.type == null) {
+            this.type = (Class<T>)wellKnownMap.get(name);
+        }
     }
 
     public Param(String name, T value){
@@ -113,6 +128,9 @@ public class Param<T> implements Serializable {
         if (type == null) {
             return null;
         }
+        if (List.class.isAssignableFrom(type)) {
+            return LIST;
+        }
         if (map.containsKey(type)){
             return map.get(type);
         }
@@ -129,9 +147,6 @@ public class Param<T> implements Serializable {
     }
 
     public T getValue(){
-        if (actualValue == null) {
-            actualValue = getTypedValue(type, value);
-        }
         return actualValue;
     }
 
@@ -139,7 +154,7 @@ public class Param<T> implements Serializable {
     public String toString() {
         return "Param{" +
                 "name='" + name + '\'' +
-                ", value='" + value + '\'' +
+                ", valueStrings='" + valueStrings + '\'' +
                 ", actualValue=" + actualValue +
                 '}';
     }
@@ -152,13 +167,13 @@ public class Param<T> implements Serializable {
         Element paramEl = doc.createElement("param");
         doc.appendChild(paramEl);
         
-        save(paramEl);
+        save(doc, paramEl);
         
         Transformer transformer = XMLReaderUtils.getTransformer();
         transformer.transform(new DOMSource(paramEl), new StreamResult(stream));
     }
 
-    public void save(Node node) {
+    public void save(Document doc, Node node) {
 
         if ( !(node instanceof Element) ) {
             throw new IllegalArgumentException("Not an Element : " + node);
@@ -168,7 +183,17 @@ public class Param<T> implements Serializable {
         
         el.setAttribute("name",  getName());
         el.setAttribute("type", getTypeString());
-        el.setTextContent(value);
+        if (List.class.isAssignableFrom(actualValue.getClass())) {
+            for (int i = 0; i < valueStrings.size(); i++) {
+                String val = valueStrings.get(i);
+                String typeString = map.get(((List)actualValue).get(i).getClass());
+                Node item = doc.createElement(typeString);
+                item.setTextContent(val);
+                el.appendChild(item);
+            }
+        } else {
+            el.setTextContent(valueStrings.get(0));
+        }
     }
 
     public static <T> Param<T> load(InputStream stream) throws SAXException, IOException, TikaException {
@@ -179,20 +204,49 @@ public class Param<T> implements Serializable {
         return load(document.getFirstChild());
     }
 
-    public static <T> Param<T> load(Node node)  {
+    public static <T> Param<T> load(Node node) {
         
         Node nameAttr = node.getAttributes().getNamedItem("name");
         Node typeAttr = node.getAttributes().getNamedItem("type");
+        Node valueAttr = node.getAttributes().getNamedItem("value");
         Node value = node.getFirstChild();
+        if (value instanceof NodeList && valueAttr != null) {
+            throw new IllegalArgumentException("can't specify a value attr _and_ a node list");
+        }
+        if (valueAttr != null && (value == null || value.getTextContent() == null)) {
+            value = valueAttr;
+        }
         
         Param<T> ret = new Param<T>();
         ret.name  = nameAttr.getTextContent();
-        ret.setTypeString(typeAttr.getTextContent());
-        ret.value = value.getTextContent();
-        
+        if (typeAttr != null) {
+            ret.setTypeString(typeAttr.getTextContent());
+        } else {
+            ret.type = (Class<T>)wellKnownMap.get(ret.name);
+        }
+
+        if (List.class.isAssignableFrom(ret.type)) {
+            loadList(ret, node);
+        } else {
+            ret.actualValue = getTypedValue(ret.type, value.getTextContent());
+            ret.valueStrings.add(value.getTextContent());
+        }
         return ret;
     }
-    
+
+    private static <T> void loadList(Param<T> ret, Node root) {
+        Node child = root.getFirstChild();
+        ret.actualValue = (T)new ArrayList<>();
+        while (child != null) {
+            if (child.getNodeType() == Node.ELEMENT_NODE) {
+                Class type = classFromType(child.getLocalName());
+                ((List) ret.actualValue).add(getTypedValue(type, child.getTextContent()));
+                ret.valueStrings.add(child.getTextContent());
+            }
+            child = child.getNextSibling();
+        }
+    }
+
     private static <T> Class<T> classFromType(String type) {
         if (reverseMap.containsKey(type)){
             return (Class<T>) reverseMap.get(type);
@@ -205,6 +259,11 @@ public class Param<T> implements Serializable {
     
     private static <T> T getTypedValue(Class<T> type, String value) {
         try {
+            if (type.isEnum()) {
+                Object val = Enum.valueOf((Class)type, value);
+                return (T)val;
+            }
+            
             Constructor<T> constructor = type.getConstructor(String.class);
             constructor.setAccessible(true);
             return constructor.newInstance(value);
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index a0cc102..18b3add 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -573,8 +573,8 @@ public class TikaConfig {
         abstract CT createDefault(MimeTypes mimeTypes, ServiceLoader loader);
         abstract CT createComposite(List<T> loaded, MimeTypes mimeTypes, ServiceLoader loader);
         abstract T createComposite(Class<? extends T> compositeClass, 
-                List<T> children, Set<Class<? extends T>> excludeChildren, 
-                MimeTypes mimeTypes, ServiceLoader loader) 
+                List<T> children, Set<Class<? extends T>> excludeChildren,
+                Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
                 throws InvocationTargetException, IllegalAccessException, InstantiationException;
         abstract T decorate(T created, Element element) 
                 throws IOException, TikaException; // eg explicit mime types 
@@ -632,6 +632,14 @@ public class TikaConfig {
                 loaded = preLoadOne(loadedClass, name, mimeTypes);
                 if (loaded != null) return loaded;
                 
+                // Get any parameters / settings for the parser
+                Map<String, Param> params = null;
+                try {
+                    params = getParams(element);
+                } catch (Exception e) {
+                    throw new TikaConfigException(e.getMessage(), e);
+                }
+                
                 // Is this a composite or decorated class? If so, support recursion
                 if (isComposite(loadedClass)) {
                     // Get the child objects for it
@@ -657,7 +665,7 @@ public class TikaConfig {
                     }
                     
                     // Create the Composite
-                    loaded = createComposite(loadedClass, children, excludeChildren, mimeTypes, loader);
+                    loaded = createComposite(loadedClass, children, excludeChildren, params, mimeTypes, loader);
 
                     // Default constructor fallback
                     if (loaded == null) {
@@ -670,7 +678,6 @@ public class TikaConfig {
                     // See the thread "Configuring parsers and translators" for details 
                 }
 
-                Map<String, Param> params = getParams(element);
                 //Assigning the params to bean fields/setters
                 AnnotationUtils.assignFieldParams(loaded, params);
                 if (loaded instanceof Initializable) {
@@ -791,7 +798,7 @@ public class TikaConfig {
         @Override
         Parser createComposite(Class<? extends Parser> parserClass,
                 List<Parser> childParsers, Set<Class<? extends Parser>> excludeParsers,
-                MimeTypes mimeTypes, ServiceLoader loader) 
+                Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader) 
                 throws InvocationTargetException, IllegalAccessException, InstantiationException {
             Parser parser = null;
             Constructor<? extends Parser> c = null;
@@ -821,6 +828,12 @@ public class TikaConfig {
             }
             if (parser == null) {
                 try {
+                    c = parserClass.getConstructor(MediaTypeRegistry.class, Collection.class, Map.class);
+                    parser = c.newInstance(registry, childParsers, params);
+                } catch (NoSuchMethodException me) {}
+            }
+            if (parser == null) {
+                try {
                     c = parserClass.getConstructor(MediaTypeRegistry.class, List.class);
                     parser = c.newInstance(registry, childParsers);
                 } catch (NoSuchMethodException me) {}
@@ -914,7 +927,7 @@ public class TikaConfig {
         Detector createComposite(Class<? extends Detector> detectorClass,
                 List<Detector> childDetectors,
                 Set<Class<? extends Detector>> excludeDetectors,
-                MimeTypes mimeTypes, ServiceLoader loader)
+                Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
                 throws InvocationTargetException, IllegalAccessException,
                 InstantiationException {
             Detector detector = null;
@@ -987,7 +1000,7 @@ public class TikaConfig {
         Translator createComposite(Class<? extends Translator> compositeClass,
                 List<Translator> children,
                 Set<Class<? extends Translator>> excludeChildren,
-                MimeTypes mimeTypes, ServiceLoader loader)
+                Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
                 throws InvocationTargetException, IllegalAccessException,
                 InstantiationException {
             throw new InstantiationException("Only one translator supported");
@@ -1004,7 +1017,7 @@ public class TikaConfig {
                 Class<? extends ConfigurableThreadPoolExecutor> compositeClass,
                 List<ConfigurableThreadPoolExecutor> children,
                 Set<Class<? extends ConfigurableThreadPoolExecutor>> excludeChildren,
-                MimeTypes mimeTypes, ServiceLoader loader)
+                Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
                 throws InvocationTargetException, IllegalAccessException,
                 InstantiationException {
             throw new InstantiationException("Only one executor service supported");
@@ -1127,7 +1140,7 @@ public class TikaConfig {
         EncodingDetector createComposite(Class<? extends EncodingDetector> encodingDetectorClass,
                                          List<EncodingDetector> childEncodingDetectors,
                                          Set<Class<? extends EncodingDetector>> excludeDetectors,
-                                         MimeTypes mimeTypes, ServiceLoader loader)
+                                         Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
                 throws InvocationTargetException, IllegalAccessException,
                 InstantiationException {
             EncodingDetector encodingDetector = null;
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java
index 05324f2..80c3c86 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java
@@ -63,12 +63,10 @@ public class ClearByMimeMetadataFilter implements MetadataFilter {
 
     /**
      *
-     * @param mimesString comma-delimited list of mimes that will trigger complete removal of metadata
+     * @param mimes list of mimes that will trigger complete removal of metadata
      */
     @Field
-    public void setMimes(String mimesString) {
-        for (String include : mimesString.split(",")) {
-            mimes.add(include);
-        }
+    public void setMimes(List<String> mimes) {
+            this.mimes.addAll(mimes);
     }
 }
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java
index 3b6e2a0..71dc55b 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java
@@ -21,33 +21,32 @@ import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 
 public class ExcludeFieldMetadataFilter implements MetadataFilter {
-    private final Set<String> exclude;
+    private final Set<String> excludeSet;
 
     public ExcludeFieldMetadataFilter() {
         this(new HashSet<>());
     }
     public ExcludeFieldMetadataFilter(Set<String> exclude) {
-        this.exclude = exclude;
+        this.excludeSet = exclude;
     }
 
     @Override
     public void filter(Metadata metadata) throws TikaException {
-        for (String field : exclude) {
+        for (String field : excludeSet) {
             metadata.remove(field);
         }
     }
 
     /**
      *
-     * @param excludeString comma-delimited list of fields to exclude
+     * @param exclude list of fields to exclude
      */
     @Field
-    public void setExclude(String excludeString) {
-        for (String include : excludeString.split(",")) {
-            exclude.add(include);
-        }
+    public void setExclude(List<String> exclude) {
+        this.excludeSet.addAll(exclude);
     }
 }
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java
index 4bc6c9e..d518ce5 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java
@@ -37,13 +37,11 @@ public class IncludeFieldMetadataFilter implements MetadataFilter {
 
     /**
      *
-     * @param includeString comma-delimited list of fields to include
+     * @param include comma-delimited list of fields to include
      */
     @Field
-    public void setInclude(String includeString) {
-        for (String include : includeString.split(",")) {
-            includeSet.add(include);
-        }
+    public void setInclude(List<String> include) {
+        includeSet.addAll(include);
     }
 
     @Override
diff --git a/tika-core/src/test/java/org/apache/tika/config/ParamTest.java b/tika-core/src/test/java/org/apache/tika/config/ParamTest.java
index 7c9007e..416cd4a 100644
--- a/tika-core/src/test/java/org/apache/tika/config/ParamTest.java
+++ b/tika-core/src/test/java/org/apache/tika/config/ParamTest.java
@@ -24,8 +24,10 @@ import java.io.File;
 import java.math.BigInteger;
 import java.net.URI;
 import java.net.URL;
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.List;
 
 import static org.junit.Assert.*;
 
@@ -34,7 +36,12 @@ public class ParamTest {
     @Test
     public void testSaveAndLoad() throws Exception {
 
+        List<String> list = new ArrayList<>();
+        list.add("quick");
+        list.add("brown");
+        list.add("fox");
         Object objects [] =  {
+                list,
                 Integer.MAX_VALUE,
                 2.5f,
                 4000.57576,
diff --git a/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
index a9c9a41..8b3b599 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
@@ -99,12 +99,11 @@ public class ParameterizedParserTest {
 
     @Test
     public void testBadType() throws Exception {
-        //TODO: should this be a TikaConfigException instead of Runtime?
         boolean ex = false;
         try {
             Metadata m = getMetadata("TIKA-1986-bad-types.xml");
             fail("should have thrown exception");
-        } catch (RuntimeException e) {
+        } catch (TikaConfigException e) {
             ex = true;
         }
         assertTrue("No RuntimeException", ex);
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-exclude.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-exclude.xml
index 27517f6..96dac44 100644
--- a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-exclude.xml
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-exclude.xml
@@ -19,7 +19,10 @@
   <metadataFilters>
     <metadataFilter class="org.apache.tika.metadata.filter.ExcludeFieldMetadataFilter">
       <params>
-        <param name="exclude" type="string">title,author</param>
+        <param name="exclude" type="list">
+          <string>title</string>
+          <string>author</string>
+        </param>
       </params>
     </metadataFilter>
   </metadataFilters>
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include-uc.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include-uc.xml
index e0df476..f960e94 100644
--- a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include-uc.xml
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include-uc.xml
@@ -19,7 +19,10 @@
   <metadataFilters>
     <metadataFilter class="org.apache.tika.metadata.filter.IncludeFieldMetadataFilter">
       <params>
-        <param name="include" type="string">title,author</param>
+        <param name="include" type="list">
+          <string>title</string>
+          <string>author</string>
+        </param>
       </params>
     </metadataFilter>
     <metadataFilter class="org.apache.tika.metadata.filter.MockUpperCaseFilter"/>
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include.xml
index e92dff8..8832915 100644
--- a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include.xml
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include.xml
@@ -19,7 +19,10 @@
   <metadataFilters>
     <metadataFilter class="org.apache.tika.metadata.filter.IncludeFieldMetadataFilter">
       <params>
-        <param name="include" type="string">title,author</param>
+        <param name="include" type="list">
+          <string>title</string>
+          <string>author</string>
+        </param>
       </params>
     </metadataFilter>
   </metadataFilters>
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-mimes-uc.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-mimes-uc.xml
index 486280c..a151665 100644
--- a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-mimes-uc.xml
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-mimes-uc.xml
@@ -19,7 +19,10 @@
   <metadataFilters>
     <metadataFilter class="org.apache.tika.metadata.filter.ClearByMimeMetadataFilter">
       <params>
-        <param name="mimes" type="string">image/jpeg,application/pdf</param>
+        <param name="mimes" type="list">
+          <string>image/jpeg</string>
+          <string>application/pdf</string>
+        </param>
       </params>
     </metadataFilter>
     <metadataFilter class="org.apache.tika.metadata.filter.MockUpperCaseFilter"/>
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/TIKA-3137-include.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/TIKA-3137-include.xml
index 765bc11..aae2f43 100644
--- a/tika-parsers/src/test/resources/org/apache/tika/parser/TIKA-3137-include.xml
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/TIKA-3137-include.xml
@@ -24,7 +24,10 @@
     </metadataFilter>
     <metadataFilter class="org.apache.tika.metadata.filter.ClearByMimeMetadataFilter">
       <params>
-        <param name="mimes" type="string">image/emf,text/plain</param>
+        <param name="mimes" type="list">
+          <string>image/emf</string>
+          <string>text/plain</string>
+        </param>
       </params>
     </metadataFilter>
   </metadataFilters>
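
[editor's note] As a rough illustration of what the refactored List-based setters above enable (the metadata keys and values are made up for the example and are not part of the commit):

    import java.util.Arrays;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.metadata.filter.ExcludeFieldMetadataFilter;

    public class ExcludeFilterSketch {
        public static void main(String[] args) throws Exception {
            Metadata metadata = new Metadata();
            metadata.set("title", "quick brown fox");
            metadata.set("author", "somebody");
            metadata.set("Content-Type", "text/plain");

            // setExclude now takes a List<String> rather than a comma-delimited String
            ExcludeFieldMetadataFilter filter = new ExcludeFieldMetadataFilter();
            filter.setExclude(Arrays.asList("title", "author"));
            filter.filter(metadata);

            // Only Content-Type should remain
            System.out.println(Arrays.toString(metadata.names()));
        }
    }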


[tika] 05/06: TIKA-3137 add a list type for Param/configuration to avoid the comma-delimited lists which will get huge and ugly and were a bad idea.

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 096a4ad7f6ca7098a513138f5fc6338858efe07f
Author: tallison <ta...@apache.org>
AuthorDate: Fri Jul 17 12:56:35 2020 -0400

    TIKA-3137 add a list type for Param/configuration to avoid the comma-delimited lists
    which will get huge and ugly and were a bad idea.
---
 .../resources/org/apache/tika/parser/TIKA-3137-include.xml    |  6 +++++-
 .../org/apache/tika/server/RecursiveMetadataFilterTest.java   |  1 +
 .../resources/org/apache/tika/server/TIKA-3137-include.xml    | 11 +++++++++--
 3 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/TIKA-3137-include.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/TIKA-3137-include.xml
index aae2f43..b99af0b 100644
--- a/tika-parsers/src/test/resources/org/apache/tika/parser/TIKA-3137-include.xml
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/TIKA-3137-include.xml
@@ -19,7 +19,11 @@
   <metadataFilters>
     <metadataFilter class="org.apache.tika.metadata.filter.IncludeFieldMetadataFilter">
       <params>
-        <param name="include" type="string">X-TIKA:content,extended-properties:Application,Content-Type</param>
+        <param name="include" type="list">
+          <string>X-TIKA:content</string>
+          <string>extended-properties:Application</string>
+          <string>Content-Type</string>
+        </param>
       </params>
     </metadataFilter>
     <metadataFilter class="org.apache.tika.metadata.filter.ClearByMimeMetadataFilter">
diff --git a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataFilterTest.java b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataFilterTest.java
index 748ee77..9799f8b 100644
--- a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataFilterTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataFilterTest.java
@@ -59,6 +59,7 @@ public class RecursiveMetadataFilterTest extends CXFTestBase {
     protected InputStream getTikaConfigInputStream() {
         return getClass().getResourceAsStream("TIKA-3137-include.xml");
     }
+
     @Override
     protected void setUpResources(JAXRSServerFactoryBean sf) {
         sf.setResourceClasses(RecursiveMetadataResource.class);
diff --git a/tika-server/src/test/resources/org/apache/tika/server/TIKA-3137-include.xml b/tika-server/src/test/resources/org/apache/tika/server/TIKA-3137-include.xml
index 765bc11..b99af0b 100644
--- a/tika-server/src/test/resources/org/apache/tika/server/TIKA-3137-include.xml
+++ b/tika-server/src/test/resources/org/apache/tika/server/TIKA-3137-include.xml
@@ -19,12 +19,19 @@
   <metadataFilters>
     <metadataFilter class="org.apache.tika.metadata.filter.IncludeFieldMetadataFilter">
       <params>
-        <param name="include" type="string">X-TIKA:content,extended-properties:Application,Content-Type</param>
+        <param name="include" type="list">
+          <string>X-TIKA:content</string>
+          <string>extended-properties:Application</string>
+          <string>Content-Type</string>
+        </param>
       </params>
     </metadataFilter>
     <metadataFilter class="org.apache.tika.metadata.filter.ClearByMimeMetadataFilter">
       <params>
-        <param name="mimes" type="string">image/emf,text/plain</param>
+        <param name="mimes" type="list">
+          <string>image/emf</string>
+          <string>text/plain</string>
+        </param>
       </params>
     </metadataFilter>
   </metadataFilters>
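
[editor's note] To make the new list param type concrete, a small sketch of loading such a param programmatically, assuming the Param.load(InputStream) API shown in commit 04/06; the XML mirrors the test configs above and the class name is illustrative, not part of the commit:

    import java.io.ByteArrayInputStream;
    import java.nio.charset.StandardCharsets;
    import java.util.List;

    import org.apache.tika.config.Param;

    public class ListParamSketch {
        public static void main(String[] args) throws Exception {
            String xml = "<param name=\"include\" type=\"list\">"
                    + "<string>X-TIKA:content</string>"
                    + "<string>Content-Type</string>"
                    + "</param>";
            Param<List<String>> param = Param.load(
                    new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
            // getValue() returns the typed ArrayList built in loadList()
            System.out.println(param.getName() + " -> " + param.getValue());
        }
    }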


[tika] 02/06: TIKA-3137 -- first pass, need to add unit tests for tika-batch

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit db4498d1de534f8348e94b0f27c641353a26b083
Author: tallison <ta...@apache.org>
AuthorDate: Thu Jul 16 15:58:00 2020 -0400

    TIKA-3137 -- first pass, need to add unit tests for tika-batch
---
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |   4 +-
 .../batch/fs/RecursiveParserWrapperFSConsumer.java |   9 +-
 .../tika/batch/fs/StreamOutRPWFSConsumer.java      |  20 ++-
 .../fs/builders/BasicTikaFSConsumersBuilder.java   |  11 +-
 .../RecursiveParserWrapperFSConsumerTest.java      |   5 +-
 .../java/org/apache/tika/config/TikaConfig.java    | 108 ++++++++++++-
 .../metadata/filter/ClearByMimeMetadataFilter.java |  74 +++++++++
 .../metadata/filter/CompositeMetadataFilter.java   |  38 +++++
 .../metadata/filter/DefaultMetadataFilter.java     |  46 ++++++
 .../filter/ExcludeFieldMetadataFilter.java         |  53 +++++++
 .../filter/IncludeFieldMetadataFilter.java         |  58 +++++++
 .../tika/metadata/filter/MetadataFilter.java       |  33 ++++
 .../apache/tika/metadata/filter/NoOpFilter.java    |  34 +++++
 .../tika/sax/RecursiveParserWrapperHandler.java    |  31 +++-
 .../org.apache.tika.metadata.filter.MetadataFilter |  16 ++
 .../org/apache/tika/config/TikaConfigTest.java     |   2 +
 .../tika/metadata/filter/MockUpperCaseFilter.java  |  39 +++++
 .../tika/metadata/filter/TestMetadataFilter.java   | 170 +++++++++++++++++++++
 .../org/apache/tika/config/TIKA-3137-exclude.xml   |  26 ++++
 .../apache/tika/config/TIKA-3137-include-uc.xml    |  27 ++++
 .../org/apache/tika/config/TIKA-3137-include.xml   |  26 ++++
 .../org/apache/tika/config/TIKA-3137-mimes-uc.xml  |  27 ++++
 .../tika/parser/RecursiveParserWrapperTest.java    |  43 ++++++
 .../org/apache/tika/parser/TIKA-3137-include.xml   |  31 ++++
 .../server/resource/RecursiveMetadataResource.java |   3 +-
 .../java/org/apache/tika/server/CXFTestBase.java   |   7 +-
 .../tika/server/RecursiveMetadataFilterTest.java   | 107 +++++++++++++
 .../org/apache/tika/server/TIKA-3137-include.xml   |  31 ++++
 28 files changed, 1062 insertions(+), 17 deletions(-)

diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index 8077114..46f82ee 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -513,7 +513,9 @@ public class TikaCLI {
     private void handleRecursiveJson(URL url, OutputStream output) throws IOException, SAXException, TikaException {
         Metadata metadata = new Metadata();
         RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
-        RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(getContentHandlerFactory(type), -1);
+        RecursiveParserWrapperHandler handler =
+                new RecursiveParserWrapperHandler(getContentHandlerFactory(type),
+                        -1, config.getMetadataFilter());
         try (InputStream input = TikaInputStream.get(url, metadata)) {
             wrapper.parse(input, handler, metadata, context);
         }
diff --git a/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java b/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
index 56b8b58..9732781 100644
--- a/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
+++ b/tika-batch/src/main/java/org/apache/tika/batch/fs/RecursiveParserWrapperFSConsumer.java
@@ -32,6 +32,8 @@ import org.apache.tika.batch.ParserFactory;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.filter.MetadataFilter;
+import org.apache.tika.metadata.filter.NoOpFilter;
 import org.apache.tika.metadata.serialization.JsonMetadataList;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
@@ -50,6 +52,7 @@ public class RecursiveParserWrapperFSConsumer extends AbstractFSConsumer {
     private final Parser parser;
     private final ContentHandlerFactory contentHandlerFactory;
     private final OutputStreamFactory fsOSFactory;
+    private final MetadataFilter metadataFilter;
     private String outputEncoding = "UTF-8";
 
     /**
@@ -62,11 +65,12 @@ public class RecursiveParserWrapperFSConsumer extends AbstractFSConsumer {
     public RecursiveParserWrapperFSConsumer(ArrayBlockingQueue<FileResource> queue,
                                             Parser parser,
                                             ContentHandlerFactory contentHandlerFactory,
-                                            OutputStreamFactory fsOSFactory) {
+                                            OutputStreamFactory fsOSFactory, MetadataFilter metadataFilter) {
         super(queue);
         this.contentHandlerFactory = contentHandlerFactory;
         this.fsOSFactory = fsOSFactory;
         this.parser = parser;
+        this.metadataFilter = metadataFilter;
     }
 
     @Override
@@ -95,7 +99,8 @@ public class RecursiveParserWrapperFSConsumer extends AbstractFSConsumer {
         Throwable thrown = null;
         List<Metadata> metadataList = null;
         Metadata containerMetadata = fileResource.getMetadata();
-        RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(contentHandlerFactory, -1);
+        RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(contentHandlerFactory,
+                -1, metadataFilter);
         try {
             parse(fileResource.getResourceId(), parser, is, handler,
                     containerMetadata, context);
diff --git a/tika-batch/src/main/java/org/apache/tika/batch/fs/StreamOutRPWFSConsumer.java b/tika-batch/src/main/java/org/apache/tika/batch/fs/StreamOutRPWFSConsumer.java
index 018c1a9..dd39a6c 100644
--- a/tika-batch/src/main/java/org/apache/tika/batch/fs/StreamOutRPWFSConsumer.java
+++ b/tika-batch/src/main/java/org/apache/tika/batch/fs/StreamOutRPWFSConsumer.java
@@ -20,12 +20,15 @@ package org.apache.tika.batch.fs;
 
 
 import org.apache.commons.io.IOUtils;
+import org.apache.tika.Tika;
 import org.apache.tika.batch.FileResource;
 import org.apache.tika.batch.OutputStreamFactory;
 import org.apache.tika.batch.ParserFactory;
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.filter.MetadataFilter;
 import org.apache.tika.metadata.serialization.JsonStreamingSerializer;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
@@ -53,17 +56,19 @@ public class StreamOutRPWFSConsumer extends AbstractFSConsumer {
     private final Parser parser;
     private final ContentHandlerFactory contentHandlerFactory;
     private final OutputStreamFactory fsOSFactory;
+    private final MetadataFilter metadataFilter;
     private String outputEncoding = "UTF-8";
 
 
     public StreamOutRPWFSConsumer(ArrayBlockingQueue<FileResource> queue,
                                   Parser parser,
                                   ContentHandlerFactory contentHandlerFactory,
-                                  OutputStreamFactory fsOSFactory) {
+                                  OutputStreamFactory fsOSFactory, MetadataFilter metadataFilter) {
         super(queue);
         this.contentHandlerFactory = contentHandlerFactory;
         this.fsOSFactory = fsOSFactory;
         this.parser = parser;
+        this.metadataFilter = metadataFilter;
     }
 
     @Override
@@ -93,7 +98,8 @@ public class StreamOutRPWFSConsumer extends AbstractFSConsumer {
         JsonStreamingSerializer writer = new JsonStreamingSerializer(
                 new OutputStreamWriter(os, StandardCharsets.UTF_8));
 
-        WriteoutRPWHandler handler = new WriteoutRPWHandler(contentHandlerFactory, writer);
+        WriteoutRPWHandler handler = new WriteoutRPWHandler(contentHandlerFactory,
+                writer, metadataFilter);
         Throwable thrown = null;
         try {
             parse(fileResource.getResourceId(), parser, is, handler,
@@ -137,16 +143,24 @@ public class StreamOutRPWFSConsumer extends AbstractFSConsumer {
     //be written straight to disk.
     private class WriteoutRPWHandler extends AbstractRecursiveParserWrapperHandler {
         private final JsonStreamingSerializer jsonWriter;
+        private final MetadataFilter metadataFilter;
 
-        public WriteoutRPWHandler(ContentHandlerFactory contentHandlerFactory, JsonStreamingSerializer writer) {
+        public WriteoutRPWHandler(ContentHandlerFactory contentHandlerFactory, JsonStreamingSerializer writer,
+                                  MetadataFilter metadataFilter) {
             super(contentHandlerFactory);
             this.jsonWriter = writer;
+            this.metadataFilter = metadataFilter;
         }
 
         @Override
         public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
             metadata.add(RecursiveParserWrapperHandler.TIKA_CONTENT, contentHandler.toString());
             try {
+                metadataFilter.filter(metadata);
+            } catch (TikaException e) {
+                throw new SAXException(e);
+            }
+            try {
                 jsonWriter.add(metadata);
             } catch (IOException e) {
                 throw new SAXException(e);
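
Condensed, the per-embedded-document flow above is: attach the extracted text, run the filter, then stream whatever survives to the JSON writer; a filter failure surfaces as a SAXException just like a serialization failure. A compressed sketch (not a change to the patch):

    // Condensed from WriteoutRPWHandler.endEmbeddedDocument() above
    metadata.add(RecursiveParserWrapperHandler.TIKA_CONTENT, contentHandler.toString());
    try {
        metadataFilter.filter(metadata);   // may remove fields or clear the object entirely
        jsonWriter.add(metadata);          // only the filtered view reaches the output stream
    } catch (TikaException | IOException e) {
        throw new SAXException(e);
    }
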
diff --git a/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java b/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
index 88171ee..4f05324 100644
--- a/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
+++ b/tika-batch/src/main/java/org/apache/tika/batch/fs/builders/BasicTikaFSConsumersBuilder.java
@@ -42,6 +42,9 @@ import org.apache.tika.batch.fs.FSUtil;
 import org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer;
 import org.apache.tika.batch.fs.StreamOutRPWFSConsumer;
 import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.filter.MetadataFilter;
+import org.apache.tika.metadata.filter.NoOpFilter;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.RecursiveParserWrapper;
 import org.apache.tika.sax.BasicContentHandlerFactory;
@@ -145,15 +148,19 @@ public class BasicTikaFSConsumersBuilder extends AbstractConsumersBuilder {
                 contentHandlerFactory, recursiveParserWrapper);
         Parser parser = parserFactory.getParser(config);
         if (recursiveParserWrapper) {
+            MetadataFilter metadataFilter = config.getMetadataFilter();
             parser = new RecursiveParserWrapper(parser);
+
             for (int i = 0; i < numConsumers; i++) {
                 FileResourceConsumer c = null;
                 if (streamOut){
                     c = new StreamOutRPWFSConsumer(queue,
-                            parser, contentHandlerFactory, outputStreamFactory);
+                            parser, contentHandlerFactory,
+                            outputStreamFactory, metadataFilter);
                 } else {
                     c = new RecursiveParserWrapperFSConsumer(queue,
-                            parser, contentHandlerFactory, outputStreamFactory);
+                            parser, contentHandlerFactory,
+                            outputStreamFactory, metadataFilter);
                 }
                 consumers.add(c);
             }
diff --git a/tika-batch/src/test/java/org/apache/tika/batch/RecursiveParserWrapperFSConsumerTest.java b/tika-batch/src/test/java/org/apache/tika/batch/RecursiveParserWrapperFSConsumerTest.java
index 7ebe564..6a61414 100644
--- a/tika-batch/src/test/java/org/apache/tika/batch/RecursiveParserWrapperFSConsumerTest.java
+++ b/tika-batch/src/test/java/org/apache/tika/batch/RecursiveParserWrapperFSConsumerTest.java
@@ -34,6 +34,7 @@ import org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.filter.NoOpFilter;
 import org.apache.tika.metadata.serialization.JsonMetadataList;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.RecursiveParserWrapper;
@@ -75,7 +76,7 @@ public class RecursiveParserWrapperFSConsumerTest extends TikaTest {
         Parser p = new RecursiveParserWrapper(new AutoDetectParserFactory().getParser(new TikaConfig()));
         RecursiveParserWrapperFSConsumer consumer = new RecursiveParserWrapperFSConsumer(
                 queue, p, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
-                mockOSFactory);
+                mockOSFactory, NoOpFilter.NOOP_FILTER);
 
         IFileProcessorFutureResult result = consumer.call();
         mockOSFactory.getStreams().get(0).flush();
@@ -123,7 +124,7 @@ public class RecursiveParserWrapperFSConsumerTest extends TikaTest {
         Parser p = new RecursiveParserWrapper(new AutoDetectParserFactory().getParser(new TikaConfig()));
         RecursiveParserWrapperFSConsumer consumer = new RecursiveParserWrapperFSConsumer(
                 queue, p, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
-                mockOSFactory);
+                mockOSFactory, NoOpFilter.NOOP_FILTER);
 
         IFileProcessorFutureResult result = consumer.call();
         mockOSFactory.getStreams().get(0).flush();
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index 92485d3..a0cc102 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -50,6 +50,9 @@ import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.language.translate.DefaultTranslator;
 import org.apache.tika.language.translate.Translator;
+import org.apache.tika.metadata.filter.CompositeMetadataFilter;
+import org.apache.tika.metadata.filter.DefaultMetadataFilter;
+import org.apache.tika.metadata.filter.MetadataFilter;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MediaTypeRegistry;
 import org.apache.tika.mime.MimeTypeException;
@@ -104,6 +107,10 @@ public class TikaConfig {
         return new SimpleThreadPoolExecutor();
     }
 
+    private static MetadataFilter getDefaultMetadataFilter(ServiceLoader loader) {
+        return new DefaultMetadataFilter(loader);
+    }
+
     //use this to look for unneeded instantiations of TikaConfig
     protected static AtomicInteger TIMES_INSTANTIATED = new AtomicInteger();
 
@@ -115,6 +122,7 @@ public class TikaConfig {
     private final MimeTypes mimeTypes;
     private final ExecutorService executorService;
     private final EncodingDetector encodingDetector;
+    private final MetadataFilter metadataFilter;
 
     public TikaConfig(String file)
             throws TikaException, IOException, SAXException {
@@ -180,6 +188,7 @@ public class TikaConfig {
         TranslatorXmlLoader translatorLoader = new TranslatorXmlLoader();
         ExecutorServiceXmlLoader executorLoader = new ExecutorServiceXmlLoader();
         EncodingDetectorXmlLoader encodingDetectorXmlLoader = new EncodingDetectorXmlLoader();
+        MetadataFilterXmlLoader metadataFilterXmlLoader = new MetadataFilterXmlLoader();
         updateXMLReaderUtils(element);
         this.mimeTypes = typesFromDomElement(element);
         this.detector = detectorLoader.loadOverall(element, mimeTypes, loader);
@@ -189,6 +198,7 @@ public class TikaConfig {
         this.parser = parserLoader.loadOverall(element, mimeTypes, loader);
         this.translator = translatorLoader.loadOverall(element, mimeTypes, loader);
         this.executorService = executorLoader.loadOverall(element, mimeTypes, loader);
+        this.metadataFilter = metadataFilterXmlLoader.loadOverall(element, mimeTypes, loader);
         this.serviceLoader = loader;
         TIMES_INSTANTIATED.incrementAndGet();
     }
@@ -214,6 +224,7 @@ public class TikaConfig {
         this.parser = getDefaultParser(mimeTypes, serviceLoader, encodingDetector);
         this.translator = getDefaultTranslator(serviceLoader);
         this.executorService = getDefaultExecutorService();
+        this.metadataFilter = getDefaultMetadataFilter(serviceLoader);
         TIMES_INSTANTIATED.incrementAndGet();
     }
 
@@ -249,6 +260,7 @@ public class TikaConfig {
             this.detector = getDefaultDetector(mimeTypes, serviceLoader);
             this.translator = getDefaultTranslator(serviceLoader);
             this.executorService = getDefaultExecutorService();
+            this.metadataFilter = getDefaultMetadataFilter(serviceLoader);
         } else {
             ServiceLoader tmpServiceLoader = new ServiceLoader();
             try (InputStream stream = getConfigInputStream(config, tmpServiceLoader)) {
@@ -259,7 +271,8 @@ public class TikaConfig {
                 EncodingDetectorXmlLoader encodingDetectorLoader = new EncodingDetectorXmlLoader();
                 TranslatorXmlLoader translatorLoader = new TranslatorXmlLoader();
                 ExecutorServiceXmlLoader executorLoader = new ExecutorServiceXmlLoader();
-                
+                MetadataFilterXmlLoader metadataFilterXmlLoader = new MetadataFilterXmlLoader();
+
                 this.mimeTypes = typesFromDomElement(element);
                 this.encodingDetector = encodingDetectorLoader.loadOverall(element, mimeTypes, serviceLoader);
 
@@ -269,6 +282,7 @@ public class TikaConfig {
                 this.detector = detectorLoader.loadOverall(element, mimeTypes, serviceLoader);
                 this.translator = translatorLoader.loadOverall(element, mimeTypes, serviceLoader);
                 this.executorService = executorLoader.loadOverall(element, mimeTypes, serviceLoader);
+                this.metadataFilter = metadataFilterXmlLoader.loadOverall(element, mimeTypes, serviceLoader);
             } catch (SAXException e) {
                 throw new TikaException(
                         "Specified Tika configuration has syntax errors: "
@@ -393,6 +407,9 @@ public class TikaConfig {
         return serviceLoader;
     }
 
+    public MetadataFilter getMetadataFilter() {
+        return metadataFilter;
+    }
     /**
      * Provides a default configuration (TikaConfig).  Currently creates a
      * new instance each time it's called; we may be able to have it
@@ -1101,7 +1118,8 @@ public class TikaConfig {
         }
 
         @Override
-        CompositeEncodingDetector createComposite(List<EncodingDetector> encodingDetectors, MimeTypes mimeTypes, ServiceLoader loader) {
+        CompositeEncodingDetector createComposite(List<EncodingDetector> encodingDetectors,
+                                                  MimeTypes mimeTypes, ServiceLoader loader) {
             return new CompositeEncodingDetector(encodingDetectors);
         }
 
@@ -1142,5 +1160,91 @@ public class TikaConfig {
         }
     }
 
+    private static class MetadataFilterXmlLoader extends
+            XmlLoader<MetadataFilter, MetadataFilter> {
+
+        boolean supportsComposite() {
+            return true;
+        }
+
+        String getParentTagName() {
+            return "metadataFilters";
+        }
+
+        String getLoaderTagName() {
+            return "metadataFilter";
+        }
+
+        @Override
+        Class<? extends MetadataFilter> getLoaderClass() {
+            return MetadataFilter.class;
+        }
+
+
+        @Override
+        boolean isComposite(MetadataFilter loaded) {
+            return loaded instanceof CompositeMetadataFilter;
+        }
+
+        @Override
+        boolean isComposite(Class<? extends MetadataFilter> loadedClass) {
+            return CompositeMetadataFilter.class.isAssignableFrom(loadedClass);
+        }
+
+        @Override
+        MetadataFilter preLoadOne(Class<? extends MetadataFilter> loadedClass,
+                                    String classname, MimeTypes mimeTypes) throws TikaException {
+            // Check for classes which can't be set in config
+            // Continue with normal loading
+            return null;
+        }
+
+        @Override
+        MetadataFilter createDefault(MimeTypes mimeTypes, ServiceLoader loader) {
+            return getDefaultMetadataFilter(loader);
+        }
+
+        //this ignores the service loader
+        @Override
+        MetadataFilter createComposite(List<MetadataFilter> loaded, MimeTypes mimeTypes, ServiceLoader loader) {
+            return new DefaultMetadataFilter(loaded);
+        }
+
+        @Override
+        MetadataFilter createComposite(Class<? extends MetadataFilter> metadataFilterClass,
+                                         List<MetadataFilter> childMetadataFilters,
+                                         Set<Class<? extends MetadataFilter>> excludeFilters,
+                                         Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
+                throws InvocationTargetException, IllegalAccessException,
+                InstantiationException {
+            MetadataFilter metadataFilter = null;
+            Constructor<? extends MetadataFilter> c;
+
+            // Try the possible default and composite metadata filter constructors
+            if (metadataFilter == null) {
+                try {
+                    c = metadataFilterClass.getConstructor(ServiceLoader.class, Collection.class);
+                    metadataFilter = c.newInstance(loader, excludeFilters);
+                } catch (NoSuchMethodException me) {
+                    me.printStackTrace();
+                }
+            }
+            if (metadataFilter == null) {
+                try {
+                    c = metadataFilterClass.getConstructor(List.class);
+                    metadataFilter = c.newInstance(childMetadataFilters);
+                } catch (NoSuchMethodException me) {
+                    me.printStackTrace();
+                }
+            }
+
+            return metadataFilter;
+        }
+
+        @Override
+        MetadataFilter decorate(MetadataFilter created, Element element) {
+            return created; // No decoration of MetadataFilters
+        }
+    }
 
 }
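
A minimal usage sketch of the new configuration hook ("my-tika-config.xml" is a hypothetical file containing a <metadataFilters> element; the XML format is shown in the test resources further below):

    import org.apache.tika.config.TikaConfig;
    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.metadata.filter.MetadataFilter;

    TikaConfig config = new TikaConfig("my-tika-config.xml");
    MetadataFilter filter = config.getMetadataFilter();   // should be effectively a no-op when nothing is configured

    Metadata metadata = new Metadata();
    metadata.set("title", "title");
    metadata.set("author", "author");
    filter.filter(metadata);   // mutates the Metadata object in place
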
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java
new file mode 100644
index 0000000..05324f2
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.filter;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * This class clears the entire metadata object if the metadata's
+ * Content-Type matches one of the configured mimes.  Use this when you do not
+ * want to store or transmit metadata for images or other specific file types.
+ */
+public class ClearByMimeMetadataFilter implements MetadataFilter {
+    private final Set<String> mimes;
+
+    public ClearByMimeMetadataFilter() {
+        this(new HashSet<>());
+    }
+
+    public ClearByMimeMetadataFilter(Set<String> mimes) {
+        this.mimes = mimes;
+    }
+
+    @Override
+    public void filter(Metadata metadata) throws TikaException {
+        String mimeString = metadata.get(Metadata.CONTENT_TYPE);
+        if (mimeString == null) {
+            return;
+        }
+        MediaType mt = MediaType.parse(mimeString);
+        if (mt != null) {
+            mimeString = mt.getBaseType().toString();
+        }
+        if (mimes.contains(mimeString)) {
+            for (String n : metadata.names()) {
+                metadata.remove(n);
+            }
+
+        }
+    }
+
+    /**
+     *
+     * @param mimesString comma-delimited list of mimes that will trigger complete removal of metadata
+     */
+    @Field
+    public void setMimes(String mimesString) {
+        for (String include : mimesString.split(",")) {
+            mimes.add(include);
+        }
+    }
+}
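
A small illustrative sketch of this filter in isolation, mirroring the unit test further below:

    import java.util.HashSet;
    import java.util.Set;
    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.metadata.filter.ClearByMimeMetadataFilter;

    Set<String> mimes = new HashSet<>();
    mimes.add("image/jpeg");

    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
    metadata.set("author", "author");

    new ClearByMimeMetadataFilter(mimes).filter(metadata);
    // metadata.size() == 0 -- everything, including Content-Type, has been removed
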
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java
new file mode 100644
index 0000000..4d592c9
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.filter;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+import java.util.List;
+
+public class CompositeMetadataFilter implements MetadataFilter {
+
+    private final List<MetadataFilter> filters;
+
+    public CompositeMetadataFilter(List<MetadataFilter> filters) {
+        this.filters = filters;
+    }
+
+    @Override
+    public void filter(Metadata metadata) throws TikaException {
+        for (MetadataFilter filter : filters) {
+            filter.filter(metadata);
+        }
+    }
+}
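
Filters run in list order, so a later filter only sees what earlier filters left behind; the TIKA-3137-include-uc.xml test config below relies on this (include first, then uppercase). A rough programmatic sketch with example field and mime values:

    import java.util.Arrays;
    import java.util.Collections;
    import org.apache.tika.metadata.filter.ClearByMimeMetadataFilter;
    import org.apache.tika.metadata.filter.CompositeMetadataFilter;
    import org.apache.tika.metadata.filter.ExcludeFieldMetadataFilter;
    import org.apache.tika.metadata.filter.MetadataFilter;

    // Order matters: fields are whittled down first, then jpeg metadata is cleared entirely.
    MetadataFilter composite = new CompositeMetadataFilter(Arrays.asList(
            new ExcludeFieldMetadataFilter(Collections.singleton("X-TIKA:content")),
            new ClearByMimeMetadataFilter(Collections.singleton("image/jpeg"))));
    composite.filter(metadata);   // 'metadata' is an assumed, already-populated Metadata object
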
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/DefaultMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/DefaultMetadataFilter.java
new file mode 100644
index 0000000..7671f50
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/DefaultMetadataFilter.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.filter;
+
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.mime.MimeTypes;
+import org.apache.tika.utils.ServiceLoaderUtils;
+
+import java.util.List;
+
+public class DefaultMetadataFilter extends CompositeMetadataFilter {
+
+    private static List<MetadataFilter> getDefaultFilters(
+            ServiceLoader loader) {
+        List<MetadataFilter> filters = loader.loadStaticServiceProviders(MetadataFilter.class);
+        ServiceLoaderUtils.sortLoadedClasses(filters);
+
+        return filters;
+    }
+
+    public DefaultMetadataFilter(ServiceLoader serviceLoader) {
+        super(getDefaultFilters(serviceLoader));
+    }
+
+    public DefaultMetadataFilter(List<MetadataFilter> metadataFilters) {
+        super(metadataFilters);
+    }
+
+    public DefaultMetadataFilter() {
+        this(new ServiceLoader());
+    }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java
new file mode 100644
index 0000000..3b6e2a0
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.filter;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+import java.util.HashSet;
+import java.util.Set;
+
+public class ExcludeFieldMetadataFilter implements MetadataFilter {
+    private final Set<String> exclude;
+
+    public ExcludeFieldMetadataFilter() {
+        this(new HashSet<>());
+    }
+    public ExcludeFieldMetadataFilter(Set<String> exclude) {
+        this.exclude = exclude;
+    }
+
+    @Override
+    public void filter(Metadata metadata) throws TikaException {
+        for (String field : exclude) {
+            metadata.remove(field);
+        }
+    }
+
+    /**
+     *
+     * @param excludeString comma-delimited list of fields to exclude
+     */
+    @Field
+    public void setExclude(String excludeString) {
+        for (String field : excludeString.split(",")) {
+            exclude.add(field);
+        }
+    }
+}
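
The @Field setter above is what lets the field list be supplied from tika-config XML as a single comma-delimited param (see TIKA-3137-exclude.xml below); the same setter can also be called programmatically, e.g. (field names here are examples only):

    ExcludeFieldMetadataFilter filter = new ExcludeFieldMetadataFilter();
    filter.setExclude("X-TIKA:content,X-TIKA:parse_time_millis");   // example field names
    filter.filter(metadata);   // 'metadata' is an assumed, already-populated Metadata object
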
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java
new file mode 100644
index 0000000..4bc6c9e
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.filter;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+public class IncludeFieldMetadataFilter implements MetadataFilter {
+    private final Set<String> includeSet;
+
+    public IncludeFieldMetadataFilter() {
+        this(new HashSet<>());
+    }
+
+    public IncludeFieldMetadataFilter(Set<String> fields) {
+        this.includeSet = fields;
+    }
+
+    /**
+     *
+     * @param includeString comma-delimited list of fields to include
+     */
+    @Field
+    public void setInclude(String includeString) {
+        for (String include : includeString.split(",")) {
+            includeSet.add(include);
+        }
+    }
+
+    @Override
+    public void filter(Metadata metadata) throws TikaException {
+
+        for (String n : metadata.names()) {
+            if (! includeSet.contains(n)) {
+                metadata.remove(n);
+            }
+        }
+    }
+}
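
Note that the include filter is a strict whitelist over exact field names: anything not listed is removed, including Content-Type and X-TIKA:content, so those usually need to be listed explicitly (as the parser-level config further below does). A tiny sketch:

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.metadata.filter.IncludeFieldMetadataFilter;

    IncludeFieldMetadataFilter filter = new IncludeFieldMetadataFilter();
    filter.setInclude("title,author");   // comma-delimited, exactly as it would appear in a <param>

    Metadata metadata = new Metadata();
    metadata.set("title", "some title");
    metadata.set("dc:creator", "somebody");
    filter.filter(metadata);
    // only "title" survives; "dc:creator" was not in the include set
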
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java
new file mode 100644
index 0000000..7a8f345
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.metadata.filter;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+import java.io.Serializable;
+
+/**
+ * Filters the metadata in place
+ *
+ * @since Apache Tika 1.25
+ */
+public interface MetadataFilter extends Serializable {
+
+    void filter(Metadata metadata) throws TikaException;
+}
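
Implementations mutate the Metadata object directly and can be wired in via the <metadataFilters> config element or SPI. As a sketch, a hypothetical custom filter (not part of this patch), modeled on the bundled ones:

    package org.example.tika;   // hypothetical package

    import org.apache.tika.exception.TikaException;
    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.metadata.filter.MetadataFilter;

    /** Hypothetical filter that redacts a single field rather than removing it. */
    public class RedactAuthorFilter implements MetadataFilter {
        @Override
        public void filter(Metadata metadata) throws TikaException {
            if (metadata.get("author") != null) {
                metadata.set("author", "REDACTED");
            }
        }
    }
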
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java
new file mode 100644
index 0000000..9cd1ec3
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.filter;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+/**
+ * This filter performs no operations on the metadata
+ * and leaves it untouched.
+ */
+public class NoOpFilter implements MetadataFilter {
+
+    public static final NoOpFilter NOOP_FILTER = new NoOpFilter();
+
+    @Override
+    public void filter(Metadata metadata) throws TikaException {
+        //no op
+    }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
index 408598f..50f0fb8 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
@@ -16,7 +16,10 @@
  */
 package org.apache.tika.sax;
 
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.filter.MetadataFilter;
+import org.apache.tika.metadata.filter.NoOpFilter;
 import org.apache.tika.utils.ParserUtils;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -40,12 +43,13 @@ import java.util.List;
 public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrapperHandler {
 
     protected final List<Metadata> metadataList = new LinkedList<>();
+    private final MetadataFilter metadataFilter;
 
     /**
      * Create a handler with no limit on the number of embedded resources
      */
     public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory) {
-        super(contentHandlerFactory);
+        this(contentHandlerFactory, -1, NoOpFilter.NOOP_FILTER);
     }
 
     /**
@@ -54,7 +58,13 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe
      * @param maxEmbeddedResources number of embedded resources that will be parsed
      */
     public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources) {
+        this(contentHandlerFactory, maxEmbeddedResources, NoOpFilter.NOOP_FILTER);
+    }
+
+    public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources,
+                                         MetadataFilter metadataFilter) {
         super(contentHandlerFactory, maxEmbeddedResources);
+        this.metadataFilter = metadataFilter;
     }
 
     /**
@@ -79,7 +89,15 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe
     public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
         super.endEmbeddedDocument(contentHandler, metadata);
         addContent(contentHandler, metadata);
-        metadataList.add(ParserUtils.cloneMetadata(metadata));
+        try {
+            metadataFilter.filter(metadata);
+        } catch (TikaException e) {
+            throw new SAXException(e);
+        }
+
+        if (metadata.size() > 0) {
+            metadataList.add(ParserUtils.cloneMetadata(metadata));
+        }
     }
 
     /**
@@ -92,8 +110,15 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe
     public void endDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
         super.endDocument(contentHandler, metadata);
         addContent(contentHandler, metadata);
+        try {
+            metadataFilter.filter(metadata);
+        } catch (TikaException e) {
+            throw new SAXException(e);
+        }
 
-        metadataList.add(0, ParserUtils.cloneMetadata(metadata));
+        if (metadata.size() > 0) {
+            metadataList.add(0, ParserUtils.cloneMetadata(metadata));
+        }
     }
 
     /**
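
One consequence of the new size check above: if a configured filter (ClearByMimeMetadataFilter, for instance) empties a metadata object, that document is simply omitted from getMetadataList(), so filtered parses can return fewer entries than unfiltered ones. A short construction sketch, with 'filter' assumed to come from TikaConfig.getMetadataFilter():

    import org.apache.tika.sax.BasicContentHandlerFactory;
    import org.apache.tika.sax.RecursiveParserWrapperHandler;

    RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
            -1,        // no limit on embedded resources
            filter);   // assumed MetadataFilter, e.g. tikaConfig.getMetadataFilter()
    // after parsing, handler.getMetadataList() contains only the metadata objects
    // that still had at least one field after filtering
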
diff --git a/tika-core/src/main/resources/META-INF/services/org.apache.tika.metadata.filter.MetadataFilter b/tika-core/src/main/resources/META-INF/services/org.apache.tika.metadata.filter.MetadataFilter
new file mode 100644
index 0000000..604a480
--- /dev/null
+++ b/tika-core/src/main/resources/META-INF/services/org.apache.tika.metadata.filter.MetadataFilter
@@ -0,0 +1,16 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.tika.metadata.filter.NoOpFilter
\ No newline at end of file
diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
index 5c406cd..1b8722d 100644
--- a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
+++ b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
@@ -327,4 +327,6 @@ public class TikaConfigTest extends AbstractTikaConfigTest {
         getConfig("TIKA-2732-xmlreaderutils-exc.xml");
     }
 
+
+
 }
\ No newline at end of file
diff --git a/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java
new file mode 100644
index 0000000..0632dd4
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata.filter;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+
+import java.util.Locale;
+
+/**
+ * Mock filter that uppercases all metadata values; used to test filter chaining.
+ */
+public class MockUpperCaseFilter implements MetadataFilter {
+
+    @Override
+    public void filter(Metadata metadata) throws TikaException {
+        for (String n : metadata.names()) {
+            String[] vals = metadata.getValues(n);
+            metadata.remove(n);
+            for (int i = 0; i < vals.length; i++) {
+                metadata.add(n, vals[i].toUpperCase(Locale.US));
+            }
+        }
+    }
+}
diff --git a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
new file mode 100644
index 0000000..e933d0c
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.metadata.filter;
+
+import org.apache.tika.config.AbstractTikaConfigTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.junit.Test;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+
+public class TestMetadataFilter extends AbstractTikaConfigTest {
+
+    @Test
+    public void testDefault() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set("title", "title");
+        metadata.set("author", "author");
+
+        MetadataFilter defaultFilter = new DefaultMetadataFilter();
+        defaultFilter.filter(metadata);
+
+        assertEquals(2, metadata.names().length);
+        assertEquals("title", metadata.get("title"));
+        assertEquals("author", metadata.get("author"));
+    }
+
+    @Test
+    public void testIncludeFilter() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set("title", "title");
+        metadata.set("author", "author");
+
+        MetadataFilter filter = new IncludeFieldMetadataFilter(set("title"));
+        filter.filter(metadata);
+        assertEquals(1, metadata.names().length);
+        assertEquals("title", metadata.get("title"));
+        assertNull(metadata.get("author"));
+    }
+
+    @Test
+    public void testExcludeFilter() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set("title", "title");
+        metadata.set("author", "author");
+
+        MetadataFilter filter = new ExcludeFieldMetadataFilter(set("title"));
+        filter.filter(metadata);
+        assertEquals(1, metadata.names().length);
+        assertEquals("author", metadata.get("author"));
+        assertNull(metadata.get("title"));
+    }
+
+    @Test
+    public void testConfigIncludeFilter() throws Exception {
+        TikaConfig config = getConfig("TIKA-3137-include.xml");
+        Metadata metadata = new Metadata();
+        metadata.set("title", "title");
+        metadata.set("author", "author");
+        metadata.set("content", "content");
+
+        config.getMetadataFilter().filter(metadata);
+
+        assertEquals(2, metadata.size());
+        assertEquals("title", metadata.get("title"));
+        assertEquals("author", metadata.get("author"));
+    }
+
+    @Test
+    public void testConfigExcludeFilter() throws Exception {
+        TikaConfig config = getConfig("TIKA-3137-exclude.xml");
+        Metadata metadata = new Metadata();
+        metadata.set("title", "title");
+        metadata.set("author", "author");
+        metadata.set("content", "content");
+
+        config.getMetadataFilter().filter(metadata);
+
+        assertEquals(1, metadata.size());
+        assertEquals("content", metadata.get("content"));
+    }
+
+    @Test
+    public void testConfigIncludeAndUCFilter() throws Exception {
+        TikaConfig config = getConfig("TIKA-3137-include-uc.xml");
+        String[] expectedTitles = new String[]{
+                "TITLE1", "TITLE2", "TITLE3"
+        };
+        Metadata metadata = new Metadata();
+        metadata.add("title", "title1");
+        metadata.add("title", "title2");
+        metadata.add("title", "title3");
+        metadata.set("author", "author");
+        metadata.set("content", "content");
+
+        config.getMetadataFilter().filter(metadata);
+
+        assertEquals(2, metadata.size());
+        assertArrayEquals(expectedTitles, metadata.getValues("title"));
+        assertEquals("AUTHOR", metadata.get("author"));
+    }
+
+    @Test
+    public void testMimeClearingFilter() throws Exception {
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, MediaType.image("jpeg").toString());
+        metadata.set("author", "author");
+
+        MetadataFilter filter = new ClearByMimeMetadataFilter(set("image/jpeg","application/pdf"));
+        filter.filter(metadata);
+        assertEquals(0, metadata.size());
+
+        metadata.set(Metadata.CONTENT_TYPE, MediaType.text("plain").toString());
+        metadata.set("author", "author");
+        filter.filter(metadata);
+        assertEquals(2, metadata.size());
+        assertEquals("author", metadata.get("author"));
+
+    }
+
+    @Test
+    public void testMimeClearingFilterConfig() throws Exception {
+        TikaConfig config = getConfig("TIKA-3137-mimes-uc.xml");
+
+        Metadata metadata = new Metadata();
+        metadata.set(Metadata.CONTENT_TYPE, MediaType.image("jpeg").toString());
+        metadata.set("author", "author");
+
+        MetadataFilter filter = config.getMetadataFilter();
+        filter.filter(metadata);
+        debug(metadata);
+        assertEquals(0, metadata.size());
+
+        metadata.set(Metadata.CONTENT_TYPE, MediaType.text("plain").toString());
+        metadata.set("author", "author");
+        filter.filter(metadata);
+        assertEquals(2, metadata.size());
+        assertEquals("AUTHOR", metadata.get("author"));
+
+    }
+
+    private static Set<String> set(String ... items) {
+        Set<String> set = new HashSet<>();
+        for (String item : items) {
+            set.add(item);
+        }
+        return set;
+    }
+}
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-exclude.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-exclude.xml
new file mode 100644
index 0000000..27517f6
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-exclude.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <metadataFilters>
+    <metadataFilter class="org.apache.tika.metadata.filter.ExcludeFieldMetadataFilter">
+      <params>
+        <param name="exclude" type="string">title,author</param>
+      </params>
+    </metadataFilter>
+  </metadataFilters>
+</properties>
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include-uc.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include-uc.xml
new file mode 100644
index 0000000..e0df476
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include-uc.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <metadataFilters>
+    <metadataFilter class="org.apache.tika.metadata.filter.IncludeFieldMetadataFilter">
+      <params>
+        <param name="include" type="string">title,author</param>
+      </params>
+    </metadataFilter>
+    <metadataFilter class="org.apache.tika.metadata.filter.MockUpperCaseFilter"/>
+  </metadataFilters>
+</properties>
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include.xml
new file mode 100644
index 0000000..e92dff8
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-include.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <metadataFilters>
+    <metadataFilter class="org.apache.tika.metadata.filter.IncludeFieldMetadataFilter">
+      <params>
+        <param name="include" type="string">title,author</param>
+      </params>
+    </metadataFilter>
+  </metadataFilters>
+</properties>
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-mimes-uc.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-mimes-uc.xml
new file mode 100644
index 0000000..486280c
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3137-mimes-uc.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <metadataFilters>
+    <metadataFilter class="org.apache.tika.metadata.filter.ClearByMimeMetadataFilter">
+      <params>
+        <param name="mimes" type="string">image/jpeg,application/pdf</param>
+      </params>
+    </metadataFilter>
+    <metadataFilter class="org.apache.tika.metadata.filter.MockUpperCaseFilter"/>
+  </metadataFilters>
+</properties>
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index a5182c6..349f271 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -21,6 +21,7 @@ package org.apache.tika.parser;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
 
 import java.io.IOException;
 import java.io.InputStream;
@@ -30,6 +31,7 @@ import java.util.Set;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.ClosedInputStream;
 import org.apache.tika.io.ProxyInputStream;
@@ -365,6 +367,47 @@ public class RecursiveParserWrapperTest extends TikaTest {
 
     }
 
+    @Test
+    public void testIncludeFilter() throws Exception {
+        //TIKA-3137
+        ParseContext context = new ParseContext();
+        Metadata metadata = new Metadata();
+        TikaConfig tikaConfig = new TikaConfig(getClass().getResourceAsStream("TIKA-3137-include.xml"));
+        Parser p = new AutoDetectParser(tikaConfig);
+        RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, true);
+        String path = "/test-documents/test_recursive_embedded.docx";
+        ContentHandlerFactory contentHandlerFactory =
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
+                        -1);
+
+        RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(contentHandlerFactory,
+                -1, tikaConfig.getMetadataFilter());
+        try (InputStream is = getClass().getResourceAsStream(path)) {
+            wrapper.parse(is, handler, metadata, context);
+        }
+        List<Metadata> metadataList = handler.getMetadataList();
+        assertEquals(5, metadataList.size());
+
+        Set<String> expectedKeys = new HashSet<>();
+        expectedKeys.add("X-TIKA:content");
+        expectedKeys.add("extended-properties:Application");
+        expectedKeys.add("Content-Type");
+        for (Metadata m : metadataList) {
+            if (m.get(Metadata.CONTENT_TYPE).equals("image/emf")) {
+                fail("emf should have been filtered out");
+            }
+            if (m.get(Metadata.CONTENT_TYPE).startsWith("text/plain")) {
+                fail("text/plain should have been filtered out");
+            }
+            assertTrue(m.names().length >= 2);
+            for (String n : m.names()) {
+                if (! expectedKeys.contains(n)) {
+                    fail("didn't expect "+n);
+                }
+            }
+        }
+    }
+
     private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory,
                                        boolean catchEmbeddedExceptions,
                                        DigestingParser.Digester digester) throws Exception {
diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/TIKA-3137-include.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/TIKA-3137-include.xml
new file mode 100644
index 0000000..765bc11
--- /dev/null
+++ b/tika-parsers/src/test/resources/org/apache/tika/parser/TIKA-3137-include.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <metadataFilters>
+    <metadataFilter class="org.apache.tika.metadata.filter.IncludeFieldMetadataFilter">
+      <params>
+        <param name="include" type="string">X-TIKA:content,extended-properties:Application,Content-Type</param>
+      </params>
+    </metadataFilter>
+    <metadataFilter class="org.apache.tika.metadata.filter.ClearByMimeMetadataFilter">
+      <params>
+        <param name="mimes" type="string">image/emf,text/plain</param>
+      </params>
+    </metadataFilter>
+  </metadataFilters>
+</properties>
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
index 07d20c5..71e7180 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/RecursiveMetadataResource.java
@@ -152,7 +152,8 @@ public class RecursiveMetadataResource {
         BasicContentHandlerFactory.HANDLER_TYPE type =
                 BasicContentHandlerFactory.parseHandlerType(handlerTypeName, DEFAULT_HANDLER_TYPE);
 		RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
-		        new BasicContentHandlerFactory(type, writeLimit), maxEmbeddedResources);
+		        new BasicContentHandlerFactory(type, writeLimit), maxEmbeddedResources,
+                TikaResource.getConfig().getMetadataFilter());
 		try {
             TikaResource.parse(wrapper, LOG, info.getPath(), is, handler, metadata, context);
         } catch (SecurityException e) {
diff --git a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
index 92c9d34..8b5f153 100644
--- a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
+++ b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
@@ -90,7 +90,8 @@ public abstract class CXFTestBase {
 
     @Before
     public void setUp() throws Exception {
-        this.tika = new TikaConfig(getClass().getResourceAsStream("tika-config-for-server-tests.xml"));
+
+        this.tika = new TikaConfig(getTikaConfigInputStream());
         TikaResource.init(tika,
                 new CommonsDigester(DIGESTER_READ_LIMIT, "md5,sha1:32"),
                 new DefaultInputStreamFactory(), new ServerStatus(true));
@@ -120,6 +121,10 @@ public abstract class CXFTestBase {
         server = sf.create();
     }
 
+    protected InputStream getTikaConfigInputStream() {
+        return getClass().getResourceAsStream("tika-config-for-server-tests.xml");
+    }
+
     /**
      * Have the test do {@link JAXRSServerFactoryBean#setResourceClasses(Class...)}
      * and {@link JAXRSServerFactoryBean#setResourceProvider(Class, org.apache.cxf.jaxrs.lifecycle.ResourceProvider)}
diff --git a/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataFilterTest.java b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataFilterTest.java
new file mode 100644
index 0000000..748ee77
--- /dev/null
+++ b/tika-server/src/test/java/org/apache/tika/server/RecursiveMetadataFilterTest.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server;
+
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.ext.multipart.Attachment;
+import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.apache.tika.server.resource.RecursiveMetadataResource;
+import org.apache.tika.server.writer.MetadataListMessageBodyWriter;
+import org.junit.Test;
+
+import javax.ws.rs.core.Response;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.apache.tika.TikaTest.assertNotContained;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+public class RecursiveMetadataFilterTest extends CXFTestBase {
+
+    private static final String META_PATH = "/rmeta";
+
+    private static final String TEST_RECURSIVE_DOC = "test_recursive_embedded.docx";
+
+    @Override
+    protected InputStream getTikaConfigInputStream() {
+        return getClass().getResourceAsStream("TIKA-3137-include.xml");
+    }
+    @Override
+    protected void setUpResources(JAXRSServerFactoryBean sf) {
+        sf.setResourceClasses(RecursiveMetadataResource.class);
+        sf.setResourceProvider(RecursiveMetadataResource.class,
+                new SingletonResourceProvider(new RecursiveMetadataResource()));
+    }
+
+    @Override
+    protected void setUpProviders(JAXRSServerFactoryBean sf) {
+        List<Object> providers = new ArrayList<>();
+        providers.add(new MetadataListMessageBodyWriter());
+        sf.setProviders(providers);
+    }
+
+    @Test
+    public void testBasicFilter() throws Exception {
+        Response response = WebClient
+                .create(endPoint + META_PATH)
+                .accept("application/json")
+                .acceptEncoding("gzip")
+                .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+
+        Reader reader = new InputStreamReader(new GzipCompressorInputStream((InputStream) response.getEntity()), UTF_8);
+        List<Metadata> metadataList = JsonMetadataList.fromJson(reader);
+        assertEquals(5, metadataList.size());
+
+        Set<String> expectedKeys = new HashSet<>();
+        expectedKeys.add("X-TIKA:content");
+        expectedKeys.add("extended-properties:Application");
+        expectedKeys.add("Content-Type");
+        for (Metadata m : metadataList) {
+            if (m.get(Metadata.CONTENT_TYPE).equals("image/emf")) {
+                fail("emf should have been filtered out");
+            }
+            if (m.get(Metadata.CONTENT_TYPE).startsWith("text/plain")) {
+                fail("text/plain should have been filtered out");
+            }
+            assertTrue(m.names().length >= 2);
+            for (String n : m.names()) {
+                if (! expectedKeys.contains(n)) {
+                    fail("didn't expect "+n);
+                }
+            }
+        }
+    }
+}
diff --git a/tika-server/src/test/resources/org/apache/tika/server/TIKA-3137-include.xml b/tika-server/src/test/resources/org/apache/tika/server/TIKA-3137-include.xml
new file mode 100644
index 0000000..765bc11
--- /dev/null
+++ b/tika-server/src/test/resources/org/apache/tika/server/TIKA-3137-include.xml
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <metadataFilters>
+    <metadataFilter class="org.apache.tika.metadata.filter.IncludeFieldMetadataFilter">
+      <params>
+        <param name="include" type="string">X-TIKA:content,extended-properties:Application,Content-Type</param>
+      </params>
+    </metadataFilter>
+    <metadataFilter class="org.apache.tika.metadata.filter.ClearByMimeMetadataFilter">
+      <params>
+        <param name="mimes" type="string">image/emf,text/plain</param>
+      </params>
+    </metadataFilter>
+  </metadataFilters>
+</properties>
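
The config above chains two filters for the server tests: IncludeFieldMetadataFilter keeps
only the three listed fields, and ClearByMimeMetadataFilter then clears the metadata for
image/emf and text/plain attachments, which is exactly what RecursiveMetadataFilterTest
asserts against the /rmeta output. Below is a minimal sketch of exercising the same chain
directly, assuming this file is on the classpath; the class name and sample field values
are illustrative only.

    // Sketch only, not part of this patch: applies the configured filter chain by hand.
    import org.apache.tika.config.TikaConfig;
    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.metadata.filter.MetadataFilter;

    public class IncludeFilterSketch {
        public static void main(String[] args) throws Exception {
            TikaConfig config = new TikaConfig(
                    IncludeFilterSketch.class.getResourceAsStream(
                            "/org/apache/tika/server/TIKA-3137-include.xml"));
            MetadataFilter filter = config.getMetadataFilter();

            Metadata metadata = new Metadata();
            metadata.set(Metadata.CONTENT_TYPE,
                    "application/vnd.openxmlformats-officedocument.wordprocessingml.document");
            metadata.set("extended-properties:Application", "Microsoft Office Word");
            metadata.set("dc:creator", "someone");   // not in the include list

            filter.filter(metadata);
            // only Content-Type and extended-properties:Application should survive
        }
    }
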


[tika] 03/06: TIKA-3140 -- initial commit

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 3388d28f0276b50ba5accb2fe9daad3cc2152d6d
Author: tallison <ta...@apache.org>
AuthorDate: Thu Jul 16 17:27:06 2020 -0400

    TIKA-3140 -- initial commit
---
 .../tika/eval/metadata/TikaEvalMetadataFilter.java | 104 +++++++++++++++++++++
 .../eval/metadata/TikaEvalMetadataFilterTest.java  |  51 ++++++++++
 2 files changed, 155 insertions(+)

diff --git a/tika-eval/src/main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java b/tika-eval/src/main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java
new file mode 100644
index 0000000..2c69801
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilter.java
@@ -0,0 +1,104 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.metadata;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.tika.eval.langid.Language;
+import org.apache.tika.eval.langid.LanguageIDWrapper;
+import org.apache.tika.eval.textstats.BasicTokenCountStatsCalculator;
+import org.apache.tika.eval.textstats.CommonTokens;
+import org.apache.tika.eval.textstats.CompositeTextStatsCalculator;
+import org.apache.tika.eval.textstats.TextStatsCalculator;
+import org.apache.tika.eval.tokens.CommonTokenResult;
+import org.apache.tika.eval.tokens.TokenCounts;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.filter.MetadataFilter;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+public class TikaEvalMetadataFilter implements MetadataFilter {
+
+    public static String TIKA_EVAL_NS = "tika-eval"+ TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+
+    public static Property NUM_TOKENS = Property.externalInteger(TIKA_EVAL_NS+"numTokens");
+
+    public static Property NUM_UNIQUE_TOKENS =
+            Property.externalInteger(TIKA_EVAL_NS+"numUniqueTokens");
+
+    public static Property NUM_ALPHA_TOKENS = Property.externalInteger(TIKA_EVAL_NS+"numAlphaTokens");
+
+    public static Property NUM_UNIQUE_ALPHA_TOKENS =
+            Property.externalInteger(TIKA_EVAL_NS+"numUniqueAlphaTokens");
+
+    public static Property LANGUAGE = Property.externalText(TIKA_EVAL_NS+"lang");
+
+    public static Property LANGUAGE_CONFIDENCE = Property.externalReal(TIKA_EVAL_NS+"langConfidence");
+
+    public static Property OUT_OF_VOCABULARY = Property.externalReal(TIKA_EVAL_NS+"oov");
+
+
+    static CompositeTextStatsCalculator TEXT_STATS_CALCULATOR;
+    static {
+        List<TextStatsCalculator> calcs = new ArrayList<>();
+        calcs.add(new BasicTokenCountStatsCalculator());
+        calcs.add(new CommonTokens());
+        TEXT_STATS_CALCULATOR = new CompositeTextStatsCalculator(calcs);
+    }
+
+
+    @Override
+    public void filter(Metadata metadata) throws TikaException {
+        String content = metadata.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT);
+        if (StringUtils.isAllBlank(content)) {
+            return;
+        }
+        calcStats(content, metadata);
+    }
+
+    private void calcStats(String content, Metadata metadata) {
+        Map<Class, Object> results = TEXT_STATS_CALCULATOR.calculate(content);
+
+        TokenCounts tokenCounts = (TokenCounts)results.get(BasicTokenCountStatsCalculator.class);
+        metadata.set(NUM_TOKENS, tokenCounts.getTotalTokens());
+        metadata.set(NUM_UNIQUE_TOKENS, tokenCounts.getTotalUniqueTokens());
+
+
+        //common token results
+        CommonTokenResult commonTokenResult = (CommonTokenResult)results.get(CommonTokens.class);
+        metadata.set(NUM_ALPHA_TOKENS, commonTokenResult.getAlphabeticTokens());
+        metadata.set(NUM_UNIQUE_ALPHA_TOKENS, commonTokenResult.getUniqueAlphabeticTokens());
+        if (commonTokenResult.getAlphabeticTokens() > 0) {
+            metadata.set(OUT_OF_VOCABULARY, commonTokenResult.getOOV());
+        } else {
+            metadata.set(OUT_OF_VOCABULARY, -1.0f);
+        }
+
+        //languages
+        List<Language> probabilities = (List<Language>) results.get(LanguageIDWrapper.class);
+        if (probabilities.size() > 0) {
+            metadata.set(LANGUAGE, probabilities.get(0).getLanguage());
+            metadata.set(LANGUAGE_CONFIDENCE, probabilities.get(0).getConfidence());
+        }
+    }
+
+}
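
TikaEvalMetadataFilter adds tika-eval:* token, language and out-of-vocabulary statistics to
any metadata object whose X-TIKA:content is non-blank, so language and token profiling can
be attached to ordinary parse output. Below is a minimal sketch of running it over a
previously serialized metadata list, assuming tika-eval is on the classpath; the JSON path
is illustrative only.

    // Sketch only, not part of this patch: filters each metadata object from a
    // serialized /rmeta result and reads back the tika-eval:* fields it adds.
    import java.io.Reader;
    import java.nio.file.Files;
    import java.nio.file.Paths;
    import static java.nio.charset.StandardCharsets.UTF_8;

    import org.apache.tika.eval.metadata.TikaEvalMetadataFilter;
    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.metadata.filter.MetadataFilter;
    import org.apache.tika.metadata.serialization.JsonMetadataList;

    public class TikaEvalFilterSketch {
        public static void main(String[] args) throws Exception {
            MetadataFilter filter = new TikaEvalMetadataFilter();
            try (Reader reader = Files.newBufferedReader(Paths.get("extracts/doc.json"), UTF_8)) {
                for (Metadata m : JsonMetadataList.fromJson(reader)) {
                    filter.filter(m);  // adds tika-eval:numTokens, tika-eval:lang, tika-eval:oov, ...
                    System.out.println(m.get(TikaEvalMetadataFilter.LANGUAGE)
                            + "\t" + m.get(TikaEvalMetadataFilter.NUM_TOKENS));
                }
            }
        }
    }
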
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilterTest.java b/tika-eval/src/test/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilterTest.java
new file mode 100644
index 0000000..1b3d006
--- /dev/null
+++ b/tika-eval/src/test/java/org/apache/tika/eval/metadata/TikaEvalMetadataFilterTest.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.metadata;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+
+public class TikaEvalMetadataFilterTest {
+
+    @Test
+    public void testBasic() throws Exception {
+        Metadata metadata = new Metadata();
+        String content = "the quick brown fox, Zothro 1234 1235, jumped over the lazy dog";
+        metadata.set(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT, content);
+        TikaEvalMetadataFilter filter = new TikaEvalMetadataFilter();
+        filter.filter(metadata);
+        assertEquals("eng", metadata.get(TikaEvalMetadataFilter.LANGUAGE));
+        assertEquals(12, (int)metadata.getInt(TikaEvalMetadataFilter.NUM_TOKENS));
+        assertEquals(11, (int)metadata.getInt(TikaEvalMetadataFilter.NUM_UNIQUE_TOKENS));
+        assertEquals(10, (int)metadata.getInt(TikaEvalMetadataFilter.NUM_ALPHA_TOKENS));
+        assertEquals(9, (int)metadata.getInt(TikaEvalMetadataFilter.NUM_UNIQUE_ALPHA_TOKENS));
+
+
+        assertEquals(0.0999,
+                Double.parseDouble(metadata.get(TikaEvalMetadataFilter.OUT_OF_VOCABULARY)),
+                0.1);
+        assertEquals("eng", metadata.get(TikaEvalMetadataFilter.LANGUAGE));
+
+        assertEquals(0.0196,
+                Double.parseDouble(metadata.get(TikaEvalMetadataFilter.LANGUAGE_CONFIDENCE)),
+                0.1);
+
+    }
+}
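
A note on the assertions above: the wide deltas (0.1) on the out-of-vocabulary and
language-confidence checks appear intended to keep the test stable if the common-tokens
lists or the language model change; the exact expected values are illustrative rather than
contractual.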