You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/04/08 17:34:48 UTC

[tika] 01/02: TIKA-3717 -- add metadata filters for optimaize and opennlp lang detectors

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit fa6c4baac4502f0bbda7f92d71a69493d9805399
Author: tallison <ta...@apache.org>
AuthorDate: Fri Apr 8 12:37:29 2022 -0400

    TIKA-3717 -- add metadata filters for optimaize and opennlp lang detectors
---
 CHANGES.txt                                        |   2 +
 .../java/org/apache/tika/metadata/Metadata.java    |   4 -
 .../apache/tika/metadata/TikaCoreProperties.java   |   9 ++
 .../metadatafilter/OpenNLPMetadataFilter.java      |  49 ++++++++++
 .../optimaize/OptimaizeLangDetector.java           |  13 ++-
 .../metadatafilter/OptimaizeMetadataFilter.java    |  49 ++++++++++
 tika-server/tika-server-standard/pom.xml           |  22 +++--
 .../server/standard/OpenNLPMetadataFilterTest.java | 107 +++++++++++++++++++++
 .../standard/OptimaizeMetadataFilterTest.java      | 106 ++++++++++++++++++++
 .../tika-config-langdetect-opennlp-filter.xml      |  32 ++++++
 .../tika-config-langdetect-optimaize-filter.xml    |  32 ++++++
 11 files changed, 408 insertions(+), 17 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 1a4b552f7..7472cbed1 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -27,6 +27,8 @@ Release 2.4.0 - ???
    * Add a Parsed-By-Full-Set metadata item to record all parsers that processed
      a file (TIKA-3716).
 
+   * Add metadata filters for Optimaize and OpenNLP language detectors (TIKA-3717).
+
    * Upgrade deeplearning4j to 1.0.0-M2 (TIKA-3458 and PR#527).
 
    * Various dependency upgrades, including POI, dl4j, gson, jackson,
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
index 033c7aaf4..98de6d282 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
@@ -536,10 +536,6 @@ public class Metadata
      * @since Apache Tika 0.8
      */
     public void set(Property property, double value) {
-        if (property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) {
-            throw new PropertyTypeException(Property.PropertyType.SIMPLE,
-                    property.getPrimaryProperty().getPropertyType());
-        }
         if (property.getPrimaryProperty().getValueType() != Property.ValueType.REAL &&
                 property.getPrimaryProperty().getValueType() != Property.ValueType.RATIONAL) {
             throw new PropertyTypeException(Property.ValueType.REAL,
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index bf6221e35..ba138c54f 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -110,6 +110,15 @@ public interface TikaCoreProperties {
      */
     Property TIKA_PARSED_BY_FULL_SET = Property.internalTextBag(TIKA_META_PREFIX + "Parsed-By-Full-Set");
 
+    Property TIKA_DETECTED_LANGUAGE = Property.externalTextBag(TIKA_META_PREFIX +
+            "detected_language"); // language string reported by a language-detection filter
+
+    Property TIKA_DETECTED_LANGUAGE_CONFIDENCE = Property.externalTextBag(TIKA_META_PREFIX +
+            "detected_language_confidence"); // name() of the result's confidence level
+
+    Property TIKA_DETECTED_LANGUAGE_CONFIDENCE_RAW = Property.externalRealSeq(TIKA_META_PREFIX +
+            "detected_language_confidence_raw"); // raw detector score, stored as a real value
+
     String RESOURCE_NAME_KEY = "resourceName";
     String PROTECTED = "protected";
     String EMBEDDED_RELATIONSHIP_ID = "embeddedRelationshipId";
diff --git a/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/metadatafilter/OpenNLPMetadataFilter.java b/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/metadatafilter/OpenNLPMetadataFilter.java
new file mode 100644
index 000000000..e0f88023e
--- /dev/null
+++ b/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/metadatafilter/OpenNLPMetadataFilter.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.langdetect.opennlp.metadatafilter;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.langdetect.opennlp.OpenNLPDetector;
+import org.apache.tika.language.detect.LanguageResult;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.filter.MetadataFilter;
+
+/**
+ * Metadata filter that runs the OpenNLP language detector over the extracted content
+ * and records the outcome in the metadata.
+ * <p>
+ * The text is read from {@link TikaCoreProperties#TIKA_CONTENT}. When that value is absent
+ * the metadata is left untouched. Otherwise the detected language, the name of its
+ * confidence level and the raw score are written to
+ * {@link TikaCoreProperties#TIKA_DETECTED_LANGUAGE},
+ * {@link TikaCoreProperties#TIKA_DETECTED_LANGUAGE_CONFIDENCE} and
+ * {@link TikaCoreProperties#TIKA_DETECTED_LANGUAGE_CONFIDENCE_RAW}.
+ */
+public class OpenNLPMetadataFilter extends MetadataFilter {
+
+    /** Value handed to the detector's {@code setMaxLength}; 10000 unless configured. */
+    private int maxCharsForDetection = 10000;
+
+    /**
+     * Overrides the length limit that is forwarded to the detector.
+     *
+     * @param maxCharsForDetection value passed to the detector's {@code setMaxLength}
+     */
+    @Field
+    public void setMaxCharsForDetection(int maxCharsForDetection) {
+        this.maxCharsForDetection = maxCharsForDetection;
+    }
+
+    /**
+     * Detects the language of the content held in {@code metadata} and stores the result
+     * back into the same object.
+     *
+     * @param metadata metadata to read the content from and to annotate
+     * @throws TikaException declared by the filter contract
+     */
+    @Override
+    public void filter(Metadata metadata) throws TikaException {
+        String content = metadata.get(TikaCoreProperties.TIKA_CONTENT);
+        if (content == null) {
+            // Nothing to analyse: return before creating and configuring a detector.
+            return;
+        }
+        OpenNLPDetector detector = new OpenNLPDetector();
+        detector.setMaxLength(maxCharsForDetection);
+        LanguageResult r = detector.detect(content);
+        metadata.set(TikaCoreProperties.TIKA_DETECTED_LANGUAGE, r.getLanguage());
+        metadata.set(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE, r.getConfidence().name());
+        metadata.set(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE_RAW, r.getRawScore());
+    }
+}
diff --git a/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/OptimaizeLangDetector.java b/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/OptimaizeLangDetector.java
index 901597b26..a4ff3174b 100644
--- a/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/OptimaizeLangDetector.java
+++ b/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/OptimaizeLangDetector.java
@@ -49,8 +49,8 @@ public class OptimaizeLangDetector extends LanguageDetector {
     private static final List<LanguageProfile> DEFAULT_LANGUAGE_PROFILES;
     private static final ImmutableSet<String> DEFAULT_LANGUAGES;
     private static final com.optimaize.langdetect.LanguageDetector DEFAULT_DETECTOR;
-    private static final int MAX_CHARS_FOR_DETECTION = 20000;
-    private static final int MAX_CHARS_FOR_SHORT_DETECTION = 200;
+    public static final int DEFAULT_MAX_CHARS_FOR_DETECTION = 20000;
+    public static final int DEFAULT_MAX_CHARS_FOR_SHORT_DETECTION = 200;
 
     static {
         try {
@@ -73,11 +73,14 @@ public class OptimaizeLangDetector extends LanguageDetector {
     private CharArrayWriter writer;
     private Set<String> languages;
     private Map<String, Float> languageProbabilities;
+    private int maxCharsForDetection = DEFAULT_MAX_CHARS_FOR_DETECTION;
 
     public OptimaizeLangDetector() {
-        super();
+        this(DEFAULT_MAX_CHARS_FOR_DETECTION);
+    }
 
-        writer = new CharArrayWriter(MAX_CHARS_FOR_DETECTION);
+    /**
+     * Creates a detector with a caller-supplied base character limit for regular
+     * (non-short) text.
+     *
+     * @param maxCharsForDetection base limit that {@code getTextLimit()} uses for
+     *                             non-short text; also used as the initial size of the
+     *                             internal character buffer
+     */
+    public OptimaizeLangDetector(int maxCharsForDetection) {
+        // getTextLimit() reads the field, not this argument; without the assignment the
+        // field would keep DEFAULT_MAX_CHARS_FOR_DETECTION and the argument would be ignored.
+        this.maxCharsForDetection = maxCharsForDetection;
+        writer = new CharArrayWriter(maxCharsForDetection);
     }
 
     private static String makeLanguageName(LdLocale locale) {
@@ -220,7 +223,7 @@ public class OptimaizeLangDetector extends LanguageDetector {
     }
 
     private int getTextLimit() {
-        int limit = (shortText ? MAX_CHARS_FOR_SHORT_DETECTION : MAX_CHARS_FOR_DETECTION);
+        int limit = (shortText ? DEFAULT_MAX_CHARS_FOR_SHORT_DETECTION : maxCharsForDetection);
 
         // We want more text if we're processing documents that have a mixture of languages.
         // FUTURE - figure out right amount to bump up the limit.
diff --git a/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/metadatafilter/OptimaizeMetadataFilter.java b/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/metadatafilter/OptimaizeMetadataFilter.java
new file mode 100644
index 000000000..a0e3dd6c7
--- /dev/null
+++ b/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/metadatafilter/OptimaizeMetadataFilter.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.langdetect.optimaize.metadatafilter;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.langdetect.optimaize.OptimaizeLangDetector;
+import org.apache.tika.language.detect.LanguageResult;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.filter.MetadataFilter;
+
+/**
+ * Metadata filter that runs the Optimaize language detector over the extracted content
+ * and records the outcome in the metadata.
+ * <p>
+ * The text is read from {@link TikaCoreProperties#TIKA_CONTENT}. When that value is absent
+ * the metadata is left untouched. Otherwise the detected language, the name of its
+ * confidence level and the raw score are written to
+ * {@link TikaCoreProperties#TIKA_DETECTED_LANGUAGE},
+ * {@link TikaCoreProperties#TIKA_DETECTED_LANGUAGE_CONFIDENCE} and
+ * {@link TikaCoreProperties#TIKA_DETECTED_LANGUAGE_CONFIDENCE_RAW}.
+ */
+public class OptimaizeMetadataFilter extends MetadataFilter {
+
+    /** Limit passed to the detector's constructor; starts at the detector's default. */
+    private int maxCharsForDetection = OptimaizeLangDetector.DEFAULT_MAX_CHARS_FOR_DETECTION;
+
+    /**
+     * Overrides the character limit that is passed to each detector instance.
+     *
+     * @param maxCharsForDetection value passed to the {@link OptimaizeLangDetector} constructor
+     */
+    @Field
+    public void setMaxCharsForDetection(int maxCharsForDetection) {
+        this.maxCharsForDetection = maxCharsForDetection;
+    }
+
+    /**
+     * Detects the language of the content held in {@code metadata} and stores the result
+     * back into the same object.
+     *
+     * @param metadata metadata to read the content from and to annotate
+     * @throws TikaException declared by the filter contract
+     */
+    @Override
+    public void filter(Metadata metadata) throws TikaException {
+        String content = metadata.get(TikaCoreProperties.TIKA_CONTENT);
+        if (content == null) {
+            // Nothing to analyse: return before creating a detector and loading models.
+            return;
+        }
+        OptimaizeLangDetector detector = new OptimaizeLangDetector(maxCharsForDetection);
+        detector.loadModels();
+        LanguageResult r = detector.detect(content);
+        metadata.set(TikaCoreProperties.TIKA_DETECTED_LANGUAGE, r.getLanguage());
+        metadata.set(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE, r.getConfidence().name());
+        metadata.set(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE_RAW, r.getRawScore());
+    }
+}
diff --git a/tika-server/tika-server-standard/pom.xml b/tika-server/tika-server-standard/pom.xml
index 0f245109b..cd41bd083 100644
--- a/tika-server/tika-server-standard/pom.xml
+++ b/tika-server/tika-server-standard/pom.xml
@@ -59,6 +59,16 @@
       <artifactId>tika-emitter-fs</artifactId>
       <version>${project.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.logging.log4j</groupId>
+      <artifactId>log4j-core</artifactId>
+      <version>${log4j2.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.logging.log4j</groupId>
+      <artifactId>log4j-slf4j-impl</artifactId>
+      <version>${log4j2.version}</version>
+    </dependency>
     <!-- test jars -->
     <dependency>
       <groupId>${project.groupId}</groupId>
@@ -75,14 +85,10 @@
       <scope>test</scope>
     </dependency>
     <dependency>
-      <groupId>org.apache.logging.log4j</groupId>
-      <artifactId>log4j-core</artifactId>
-      <version>${log4j2.version}</version>
-    </dependency>
-    <dependency>
-      <groupId>org.apache.logging.log4j</groupId>
-      <artifactId>log4j-slf4j-impl</artifactId>
-      <version>${log4j2.version}</version>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-langdetect-opennlp</artifactId>
+      <version>${project.version}</version>
+      <scope>test</scope>
     </dependency>
   </dependencies>
   <build>
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/OpenNLPMetadataFilterTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/OpenNLPMetadataFilterTest.java
new file mode 100644
index 000000000..ca81e5df4
--- /dev/null
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/OpenNLPMetadataFilterTest.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server.standard;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonMetadata;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.server.core.CXFTestBase;
+import org.apache.tika.server.core.resource.RecursiveMetadataResource;
+import org.apache.tika.server.core.resource.TikaResource;
+import org.apache.tika.server.core.writer.JSONMessageBodyWriter;
+import org.apache.tika.server.core.writer.MetadataListMessageBodyWriter;
+
+/**
+ * Server-level test of the OpenNLP language-detection metadata filter.
+ * <p>
+ * The server is started with {@code /config/tika-config-langdetect-opennlp-filter.xml} and
+ * {@code test_recursive_embedded.docx} is PUT to the {@code /rmeta} and {@code /tika}
+ * endpoints; the detected-language metadata in the JSON responses is then checked.
+ */
+public class OpenNLPMetadataFilterTest extends CXFTestBase {
+
+    private static final String TIKA_PATH = "/tika";
+    private static final String META_PATH = "/rmeta";
+    private static final String TEST_RECURSIVE_DOC = "test-documents/test_recursive_embedded.docx";
+
+    /** Registers the two resources the tests call as singletons. */
+    @Override
+    protected void setUpResources(JAXRSServerFactoryBean sf) {
+        sf.setResourceClasses(RecursiveMetadataResource.class, TikaResource.class);
+        sf.setResourceProvider(RecursiveMetadataResource.class,
+                new SingletonResourceProvider(new RecursiveMetadataResource()));
+        sf.setResourceProvider(TikaResource.class,
+                new SingletonResourceProvider(new TikaResource()));
+
+    }
+
+    /** Installs the JSON writers for metadata lists and single metadata objects. */
+    @Override
+    protected void setUpProviders(JAXRSServerFactoryBean sf) {
+        List<Object> providers = new ArrayList<>();
+        providers.add(new MetadataListMessageBodyWriter());
+        providers.add(new JSONMessageBodyWriter());
+        sf.setProviders(providers);
+    }
+
+    /** Supplies the tika-config that enables the OpenNLP metadata filter. */
+    @Override
+    protected InputStream getTikaConfigInputStream() {
+        return getClass().getResourceAsStream(
+                "/config/tika-config-langdetect-opennlp-filter.xml");
+    }
+
+    /** Checks the detected language on one embedded document returned by /rmeta. */
+    @Test
+    public void testMeta() throws Exception {
+        Response response = WebClient.create(endPoint + META_PATH).accept("application/json")
+                .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+
+        List<Metadata> metadataList;
+        // Close the response stream once the JSON has been consumed.
+        try (Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8)) {
+            metadataList = JsonMetadataList.fromJson(reader);
+        }
+
+        assertEquals(12, metadataList.size());
+        assertEquals("Microsoft Office Word",
+                metadataList.get(0).get(OfficeOpenXMLExtended.APPLICATION));
+        assertContains("plundered our seas", metadataList.get(6).get("X-TIKA:content"));
+
+        assertEquals("a38e6c7b38541af87148dee9634cb811",
+                metadataList.get(10).get("X-TIKA:digest:MD5"));
+
+        assertEquals("eng", metadataList.get(6).get(TikaCoreProperties.TIKA_DETECTED_LANGUAGE));
+        assertEquals("LOW",
+                metadataList.get(6).get(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE));
+    }
+
+    /** Checks the detected language on the single metadata object returned by /tika. */
+    @Test
+    public void testTika() throws Exception {
+        Response response = WebClient.create(endPoint + TIKA_PATH).accept("application/json")
+                .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+
+        Metadata metadata;
+        // Close the response stream once the JSON has been consumed.
+        try (Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8)) {
+            metadata = JsonMetadata.fromJson(reader);
+        }
+        assertEquals("eng", metadata.get(TikaCoreProperties.TIKA_DETECTED_LANGUAGE));
+        assertEquals("LOW", metadata.get(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE));
+    }
+}
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/OptimaizeMetadataFilterTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/OptimaizeMetadataFilterTest.java
new file mode 100644
index 000000000..6dba4a425
--- /dev/null
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/OptimaizeMetadataFilterTest.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server.standard;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonMetadata;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.server.core.CXFTestBase;
+import org.apache.tika.server.core.resource.RecursiveMetadataResource;
+import org.apache.tika.server.core.resource.TikaResource;
+import org.apache.tika.server.core.writer.JSONMessageBodyWriter;
+import org.apache.tika.server.core.writer.MetadataListMessageBodyWriter;
+
+/**
+ * Server-level test of the Optimaize language-detection metadata filter.
+ * <p>
+ * The server is started with {@code /config/tika-config-langdetect-optimaize-filter.xml} and
+ * {@code test_recursive_embedded.docx} is PUT to the {@code /rmeta} and {@code /tika}
+ * endpoints; the detected-language metadata in the JSON responses is then checked.
+ */
+public class OptimaizeMetadataFilterTest extends CXFTestBase {
+
+    private static final String TIKA_PATH = "/tika";
+    private static final String META_PATH = "/rmeta";
+    private static final String TEST_RECURSIVE_DOC = "test-documents/test_recursive_embedded.docx";
+
+    /** Registers the two resources the tests call as singletons. */
+    @Override
+    protected void setUpResources(JAXRSServerFactoryBean sf) {
+        sf.setResourceClasses(RecursiveMetadataResource.class, TikaResource.class);
+        sf.setResourceProvider(RecursiveMetadataResource.class,
+                new SingletonResourceProvider(new RecursiveMetadataResource()));
+        sf.setResourceProvider(TikaResource.class,
+                new SingletonResourceProvider(new TikaResource()));
+
+    }
+
+    /** Installs the JSON writers for metadata lists and single metadata objects. */
+    @Override
+    protected void setUpProviders(JAXRSServerFactoryBean sf) {
+        List<Object> providers = new ArrayList<>();
+        providers.add(new MetadataListMessageBodyWriter());
+        providers.add(new JSONMessageBodyWriter());
+        sf.setProviders(providers);
+    }
+
+    /** Supplies the tika-config that enables the Optimaize metadata filter. */
+    @Override
+    protected InputStream getTikaConfigInputStream() {
+        return getClass().getResourceAsStream("/config/tika-config-langdetect-optimaize-filter.xml");
+    }
+
+    /** Checks the detected language on one embedded document returned by /rmeta. */
+    @Test
+    public void testMeta() throws Exception {
+        Response response = WebClient.create(endPoint + META_PATH).accept("application/json")
+                .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+
+        List<Metadata> metadataList;
+        // Close the response stream once the JSON has been consumed.
+        try (Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8)) {
+            metadataList = JsonMetadataList.fromJson(reader);
+        }
+
+        assertEquals(12, metadataList.size());
+        assertEquals("Microsoft Office Word",
+                metadataList.get(0).get(OfficeOpenXMLExtended.APPLICATION));
+        assertContains("plundered our seas", metadataList.get(6).get("X-TIKA:content"));
+
+        assertEquals("a38e6c7b38541af87148dee9634cb811",
+                metadataList.get(10).get("X-TIKA:digest:MD5"));
+
+        assertEquals("en", metadataList.get(6).get(TikaCoreProperties.TIKA_DETECTED_LANGUAGE));
+        assertEquals("HIGH",
+                metadataList.get(6).get(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE));
+    }
+
+    /** Checks the detected language on the single metadata object returned by /tika. */
+    @Test
+    public void testTika() throws Exception {
+        Response response = WebClient.create(endPoint + TIKA_PATH).accept("application/json")
+                .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));
+
+        Metadata metadata;
+        // Close the response stream once the JSON has been consumed.
+        try (Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8)) {
+            metadata = JsonMetadata.fromJson(reader);
+        }
+        assertEquals("en", metadata.get(TikaCoreProperties.TIKA_DETECTED_LANGUAGE));
+        assertEquals("HIGH", metadata.get(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE));
+    }
+}
diff --git a/tika-server/tika-server-standard/src/test/resources/config/tika-config-langdetect-opennlp-filter.xml b/tika-server/tika-server-standard/src/test/resources/config/tika-config-langdetect-opennlp-filter.xml
new file mode 100644
index 000000000..8448e42bf
--- /dev/null
+++ b/tika-server/tika-server-standard/src/test/resources/config/tika-config-langdetect-opennlp-filter.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers> <!-- NOTE(review): the PDFParser override below looks unrelated to language
+         detection; confirm it is needed for this test config. -->
+        <parser class="org.apache.tika.parser.DefaultParser">
+            <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+        </parser>
+        <parser class="org.apache.tika.parser.pdf.PDFParser">
+            <params>
+                <param name="sortByPosition" type="bool">true</param>
+            </params>
+        </parser>
+    </parsers>
+    <metadataFilters> <!-- Registers the OpenNLP language-detection metadata filter. No params
+         are given, so the filter keeps its own default maxCharsForDetection. -->
+        <metadataFilter class="org.apache.tika.langdetect.opennlp.metadatafilter.OpenNLPMetadataFilter"/>
+    </metadataFilters>
+</properties>
\ No newline at end of file
diff --git a/tika-server/tika-server-standard/src/test/resources/config/tika-config-langdetect-optimaize-filter.xml b/tika-server/tika-server-standard/src/test/resources/config/tika-config-langdetect-optimaize-filter.xml
new file mode 100644
index 000000000..82c623b24
--- /dev/null
+++ b/tika-server/tika-server-standard/src/test/resources/config/tika-config-langdetect-optimaize-filter.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers> <!-- NOTE(review): the PDFParser override below looks unrelated to language
+         detection; confirm it is needed for this test config. -->
+        <parser class="org.apache.tika.parser.DefaultParser">
+            <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+        </parser>
+        <parser class="org.apache.tika.parser.pdf.PDFParser">
+            <params>
+                <param name="sortByPosition" type="bool">true</param>
+            </params>
+        </parser>
+    </parsers>
+    <metadataFilters> <!-- Registers the Optimaize language-detection metadata filter. No
+         params are given, so the filter keeps its own default maxCharsForDetection. -->
+        <metadataFilter class="org.apache.tika.langdetect.optimaize.metadatafilter.OptimaizeMetadataFilter"/>
+    </metadataFilters>
+</properties>
\ No newline at end of file