You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/04/08 17:34:48 UTC
[tika] 01/02: TIKA-3717 -- add metadata filters for optimaize and opennlp lang detectors
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
commit fa6c4baac4502f0bbda7f92d71a69493d9805399
Author: tallison <ta...@apache.org>
AuthorDate: Fri Apr 8 12:37:29 2022 -0400
TIKA-3717 -- add metadata filters for optimaize and opennlp lang detectors
---
CHANGES.txt | 2 +
.../java/org/apache/tika/metadata/Metadata.java | 4 -
.../apache/tika/metadata/TikaCoreProperties.java | 9 ++
.../metadatafilter/OpenNLPMetadataFilter.java | 49 ++++++++++
.../optimaize/OptimaizeLangDetector.java | 13 ++-
.../metadatafilter/OptimaizeMetadataFilter.java | 49 ++++++++++
tika-server/tika-server-standard/pom.xml | 22 +++--
.../server/standard/OpenNLPMetadataFilterTest.java | 107 +++++++++++++++++++++
.../standard/OptimaizeMetadataFilterTest.java | 106 ++++++++++++++++++++
.../tika-config-langdetect-opennlp-filter.xml | 32 ++++++
.../tika-config-langdetect-optimaize-filter.xml | 32 ++++++
11 files changed, 408 insertions(+), 17 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 1a4b552f7..7472cbed1 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -27,6 +27,8 @@ Release 2.4.0 - ???
* Add a Parsed-By-Full-Set metadata item to record all parsers that processed
a file (TIKA-3716).
+ * Add metadata filters for Optimaize and OpenNLP language detectors (TIKA-3717).
+
* Upgrade deeplearning4j to 1.0.0-M2 (TIKA-3458 and PR#527).
* Various dependency upgrades, including POI, dl4j, gson, jackson,
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
index 033c7aaf4..98de6d282 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
@@ -536,10 +536,6 @@ public class Metadata
* @since Apache Tika 0.8
*/
public void set(Property property, double value) {
- if (property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) {
- throw new PropertyTypeException(Property.PropertyType.SIMPLE,
- property.getPrimaryProperty().getPropertyType());
- }
if (property.getPrimaryProperty().getValueType() != Property.ValueType.REAL &&
property.getPrimaryProperty().getValueType() != Property.ValueType.RATIONAL) {
throw new PropertyTypeException(Property.ValueType.REAL,
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index bf6221e35..ba138c54f 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -110,6 +110,15 @@ public interface TikaCoreProperties {
*/
Property TIKA_PARSED_BY_FULL_SET = Property.internalTextBag(TIKA_META_PREFIX + "Parsed-By-Full-Set");
+ Property TIKA_DETECTED_LANGUAGE = Property.externalTextBag(TIKA_META_PREFIX +
+ "detected_language");
+
+ Property TIKA_DETECTED_LANGUAGE_CONFIDENCE = Property.externalTextBag(TIKA_META_PREFIX +
+ "detected_language_confidence");
+
+ Property TIKA_DETECTED_LANGUAGE_CONFIDENCE_RAW = Property.externalRealSeq(TIKA_META_PREFIX +
+ "detected_language_confidence_raw");
+
String RESOURCE_NAME_KEY = "resourceName";
String PROTECTED = "protected";
String EMBEDDED_RELATIONSHIP_ID = "embeddedRelationshipId";
diff --git a/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/metadatafilter/OpenNLPMetadataFilter.java b/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/metadatafilter/OpenNLPMetadataFilter.java
new file mode 100644
index 000000000..e0f88023e
--- /dev/null
+++ b/tika-langdetect/tika-langdetect-opennlp/src/main/java/org/apache/tika/langdetect/opennlp/metadatafilter/OpenNLPMetadataFilter.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.langdetect.opennlp.metadatafilter;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.langdetect.opennlp.OpenNLPDetector;
+import org.apache.tika.language.detect.LanguageResult;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.filter.MetadataFilter;
+
+public class OpenNLPMetadataFilter extends MetadataFilter {
+
+ private int maxCharsForDetection = 10000;
+
+ @Field
+ public void setMaxCharsForDetection(int maxCharsForDetection) {
+ this.maxCharsForDetection = maxCharsForDetection;
+ }
+
+ @Override
+ public void filter(Metadata metadata) throws TikaException {
+ OpenNLPDetector detector = new OpenNLPDetector();
+ detector.setMaxLength(maxCharsForDetection);
+ String content = metadata.get(TikaCoreProperties.TIKA_CONTENT);
+ if (content == null) {
+ return;
+ }
+ LanguageResult r = detector.detect(content);
+ metadata.set(TikaCoreProperties.TIKA_DETECTED_LANGUAGE, r.getLanguage());
+ metadata.set(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE, r.getConfidence().name());
+ metadata.set(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE_RAW, r.getRawScore());
+ }
+}
diff --git a/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/OptimaizeLangDetector.java b/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/OptimaizeLangDetector.java
index 901597b26..a4ff3174b 100644
--- a/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/OptimaizeLangDetector.java
+++ b/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/OptimaizeLangDetector.java
@@ -49,8 +49,8 @@ public class OptimaizeLangDetector extends LanguageDetector {
private static final List<LanguageProfile> DEFAULT_LANGUAGE_PROFILES;
private static final ImmutableSet<String> DEFAULT_LANGUAGES;
private static final com.optimaize.langdetect.LanguageDetector DEFAULT_DETECTOR;
- private static final int MAX_CHARS_FOR_DETECTION = 20000;
- private static final int MAX_CHARS_FOR_SHORT_DETECTION = 200;
+ public static final int DEFAULT_MAX_CHARS_FOR_DETECTION = 20000;
+ public static final int DEFAULT_MAX_CHARS_FOR_SHORT_DETECTION = 200;
static {
try {
@@ -73,11 +73,14 @@ public class OptimaizeLangDetector extends LanguageDetector {
private CharArrayWriter writer;
private Set<String> languages;
private Map<String, Float> languageProbabilities;
+ private int maxCharsForDetection = DEFAULT_MAX_CHARS_FOR_DETECTION;
public OptimaizeLangDetector() {
- super();
+ this(DEFAULT_MAX_CHARS_FOR_DETECTION);
+ }
- writer = new CharArrayWriter(MAX_CHARS_FOR_DETECTION);
+ public OptimaizeLangDetector(int maxCharsForDetection) {
+ writer = new CharArrayWriter(maxCharsForDetection);
}
private static String makeLanguageName(LdLocale locale) {
@@ -220,7 +223,7 @@ public class OptimaizeLangDetector extends LanguageDetector {
}
private int getTextLimit() {
- int limit = (shortText ? MAX_CHARS_FOR_SHORT_DETECTION : MAX_CHARS_FOR_DETECTION);
+ int limit = (shortText ? DEFAULT_MAX_CHARS_FOR_SHORT_DETECTION : maxCharsForDetection);
// We want more text if we're processing documents that have a mixture of languages.
// FUTURE - figure out right amount to bump up the limit.
diff --git a/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/metadatafilter/OptimaizeMetadataFilter.java b/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/metadatafilter/OptimaizeMetadataFilter.java
new file mode 100644
index 000000000..a0e3dd6c7
--- /dev/null
+++ b/tika-langdetect/tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/metadatafilter/OptimaizeMetadataFilter.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.langdetect.optimaize.metadatafilter;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.langdetect.optimaize.OptimaizeLangDetector;
+import org.apache.tika.language.detect.LanguageResult;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.filter.MetadataFilter;
+
+public class OptimaizeMetadataFilter extends MetadataFilter {
+
+ private int maxCharsForDetection = OptimaizeLangDetector.DEFAULT_MAX_CHARS_FOR_DETECTION;
+
+ @Field
+ public void setMaxCharsForDetection(int maxCharsForDetection) {
+ this.maxCharsForDetection = maxCharsForDetection;
+ }
+
+ @Override
+ public void filter(Metadata metadata) throws TikaException {
+ OptimaizeLangDetector detector = new OptimaizeLangDetector(maxCharsForDetection);
+ detector.loadModels();
+ String content = metadata.get(TikaCoreProperties.TIKA_CONTENT);
+ if (content == null) {
+ return;
+ }
+ LanguageResult r = detector.detect(content);
+ metadata.set(TikaCoreProperties.TIKA_DETECTED_LANGUAGE, r.getLanguage());
+ metadata.set(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE, r.getConfidence().name());
+ metadata.set(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE_RAW, r.getRawScore());
+ }
+}
diff --git a/tika-server/tika-server-standard/pom.xml b/tika-server/tika-server-standard/pom.xml
index 0f245109b..cd41bd083 100644
--- a/tika-server/tika-server-standard/pom.xml
+++ b/tika-server/tika-server-standard/pom.xml
@@ -59,6 +59,16 @@
<artifactId>tika-emitter-fs</artifactId>
<version>${project.version}</version>
</dependency>
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-core</artifactId>
+ <version>${log4j2.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-slf4j-impl</artifactId>
+ <version>${log4j2.version}</version>
+ </dependency>
<!-- test jars -->
<dependency>
<groupId>${project.groupId}</groupId>
@@ -75,14 +85,10 @@
<scope>test</scope>
</dependency>
<dependency>
- <groupId>org.apache.logging.log4j</groupId>
- <artifactId>log4j-core</artifactId>
- <version>${log4j2.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.logging.log4j</groupId>
- <artifactId>log4j-slf4j-impl</artifactId>
- <version>${log4j2.version}</version>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-langdetect-opennlp</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
</dependency>
</dependencies>
<build>
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/OpenNLPMetadataFilterTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/OpenNLPMetadataFilterTest.java
new file mode 100644
index 000000000..ca81e5df4
--- /dev/null
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/OpenNLPMetadataFilterTest.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server.standard;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonMetadata;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.server.core.CXFTestBase;
+import org.apache.tika.server.core.resource.RecursiveMetadataResource;
+import org.apache.tika.server.core.resource.TikaResource;
+import org.apache.tika.server.core.writer.JSONMessageBodyWriter;
+import org.apache.tika.server.core.writer.MetadataListMessageBodyWriter;
+
/**
 * Integration test for {@code OpenNLPMetadataFilter}: spins up an embedded
 * CXF server configured with the opennlp language-detection metadata filter
 * (see tika-config-langdetect-opennlp-filter.xml) and verifies that both the
 * /rmeta and /tika endpoints emit detected-language metadata.
 */
public class OpenNLPMetadataFilterTest extends CXFTestBase {

    private static final String TIKA_PATH = "/tika";
    private static final String META_PATH = "/rmeta";
    private static final String TEST_RECURSIVE_DOC = "test-documents/test_recursive_embedded.docx";

    @Override
    protected void setUpResources(JAXRSServerFactoryBean sf) {
        // Register both endpoints so one server instance serves /rmeta and /tika.
        sf.setResourceClasses(RecursiveMetadataResource.class, TikaResource.class);
        sf.setResourceProvider(RecursiveMetadataResource.class,
                new SingletonResourceProvider(new RecursiveMetadataResource()));
        sf.setResourceProvider(TikaResource.class,
                new SingletonResourceProvider(new TikaResource()));

    }

    @Override
    protected void setUpProviders(JAXRSServerFactoryBean sf) {
        // JSON writers needed to serialize the Metadata / List<Metadata> responses.
        List<Object> providers = new ArrayList<>();
        providers.add(new MetadataListMessageBodyWriter());
        providers.add(new JSONMessageBodyWriter());
        sf.setProviders(providers);
    }

    @Override
    protected InputStream getTikaConfigInputStream() {
        // Config that wires OpenNLPMetadataFilter into the metadata filter chain.
        return getClass().getResourceAsStream(
                "/config/tika-config-langdetect-opennlp-filter.xml");
    }

    @Test
    public void testMeta() throws Exception {
        Response response = WebClient.create(endPoint + META_PATH).accept("application/json")
                .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));

        Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
        List<Metadata> metadataList = JsonMetadataList.fromJson(reader);

        assertEquals(12, metadataList.size());
        assertEquals("Microsoft Office Word",
                metadataList.get(0).get(OfficeOpenXMLExtended.APPLICATION));
        assertContains("plundered our seas", metadataList.get(6).get("X-TIKA:content"));

        assertEquals("a38e6c7b38541af87148dee9634cb811",
                metadataList.get(10).get("X-TIKA:digest:MD5"));

        // Embedded doc #6 contains English text; OpenNLP uses ISO 639-3 codes ("eng").
        assertEquals("eng", metadataList.get(6).get(TikaCoreProperties.TIKA_DETECTED_LANGUAGE));
        assertEquals("LOW",
                metadataList.get(6).get(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE));
    }

    @Test
    public void testTika() throws Exception {
        Response response = WebClient.create(endPoint + TIKA_PATH).accept("application/json")
                .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));

        Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
        Metadata metadata = JsonMetadata.fromJson(reader);
        // The /tika endpoint returns a single Metadata object for the container doc.
        assertEquals("eng", metadata.get(TikaCoreProperties.TIKA_DETECTED_LANGUAGE));
        assertEquals("LOW", metadata.get(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE));
    }
}
diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/OptimaizeMetadataFilterTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/OptimaizeMetadataFilterTest.java
new file mode 100644
index 000000000..6dba4a425
--- /dev/null
+++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/OptimaizeMetadataFilterTest.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.server.standard;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+import javax.ws.rs.core.Response;
+
+import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
+import org.apache.cxf.jaxrs.client.WebClient;
+import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.OfficeOpenXMLExtended;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.metadata.serialization.JsonMetadata;
+import org.apache.tika.metadata.serialization.JsonMetadataList;
+import org.apache.tika.server.core.CXFTestBase;
+import org.apache.tika.server.core.resource.RecursiveMetadataResource;
+import org.apache.tika.server.core.resource.TikaResource;
+import org.apache.tika.server.core.writer.JSONMessageBodyWriter;
+import org.apache.tika.server.core.writer.MetadataListMessageBodyWriter;
+
/**
 * Integration test for {@code OptimaizeMetadataFilter}: spins up an embedded
 * CXF server configured with the optimaize language-detection metadata filter
 * (see tika-config-langdetect-optimaize-filter.xml) and verifies that both the
 * /rmeta and /tika endpoints emit detected-language metadata.
 */
public class OptimaizeMetadataFilterTest extends CXFTestBase {

    private static final String TIKA_PATH = "/tika";
    private static final String META_PATH = "/rmeta";
    private static final String TEST_RECURSIVE_DOC = "test-documents/test_recursive_embedded.docx";

    @Override
    protected void setUpResources(JAXRSServerFactoryBean sf) {
        // Register both endpoints so one server instance serves /rmeta and /tika.
        sf.setResourceClasses(RecursiveMetadataResource.class, TikaResource.class);
        sf.setResourceProvider(RecursiveMetadataResource.class,
                new SingletonResourceProvider(new RecursiveMetadataResource()));
        sf.setResourceProvider(TikaResource.class,
                new SingletonResourceProvider(new TikaResource()));

    }

    @Override
    protected void setUpProviders(JAXRSServerFactoryBean sf) {
        // JSON writers needed to serialize the Metadata / List<Metadata> responses.
        List<Object> providers = new ArrayList<>();
        providers.add(new MetadataListMessageBodyWriter());
        providers.add(new JSONMessageBodyWriter());
        sf.setProviders(providers);
    }

    @Override
    protected InputStream getTikaConfigInputStream() {
        // Config that wires OptimaizeMetadataFilter into the metadata filter chain.
        return getClass().getResourceAsStream("/config/tika-config-langdetect-optimaize-filter.xml");
    }

    @Test
    public void testMeta() throws Exception {
        Response response = WebClient.create(endPoint + META_PATH).accept("application/json")
                .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));

        Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
        List<Metadata> metadataList = JsonMetadataList.fromJson(reader);

        assertEquals(12, metadataList.size());
        assertEquals("Microsoft Office Word",
                metadataList.get(0).get(OfficeOpenXMLExtended.APPLICATION));
        assertContains("plundered our seas", metadataList.get(6).get("X-TIKA:content"));

        assertEquals("a38e6c7b38541af87148dee9634cb811",
                metadataList.get(10).get("X-TIKA:digest:MD5"));

        // Embedded doc #6 contains English text; Optimaize uses ISO 639-1 codes ("en").
        assertEquals("en", metadataList.get(6).get(TikaCoreProperties.TIKA_DETECTED_LANGUAGE));
        assertEquals("HIGH",
                metadataList.get(6).get(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE));
    }

    @Test
    public void testTika() throws Exception {
        Response response = WebClient.create(endPoint + TIKA_PATH).accept("application/json")
                .put(ClassLoader.getSystemResourceAsStream(TEST_RECURSIVE_DOC));

        Reader reader = new InputStreamReader((InputStream) response.getEntity(), UTF_8);
        Metadata metadata = JsonMetadata.fromJson(reader);
        // The /tika endpoint returns a single Metadata object for the container doc.
        assertEquals("en", metadata.get(TikaCoreProperties.TIKA_DETECTED_LANGUAGE));
        assertEquals("HIGH", metadata.get(TikaCoreProperties.TIKA_DETECTED_LANGUAGE_CONFIDENCE));
    }
}
diff --git a/tika-server/tika-server-standard/src/test/resources/config/tika-config-langdetect-opennlp-filter.xml b/tika-server/tika-server-standard/src/test/resources/config/tika-config-langdetect-opennlp-filter.xml
new file mode 100644
index 000000000..8448e42bf
--- /dev/null
+++ b/tika-server/tika-server-standard/src/test/resources/config/tika-config-langdetect-opennlp-filter.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<properties>
  <parsers>
    <!-- Use the default parser set, but exclude PDFParser so it can be
         re-registered below with custom params. -->
    <parser class="org.apache.tika.parser.DefaultParser">
      <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
    </parser>
    <parser class="org.apache.tika.parser.pdf.PDFParser">
      <params>
        <param name="sortByPosition" type="bool">true</param>
      </params>
    </parser>
  </parsers>
  <!-- Run OpenNLP language detection on extracted content after parsing. -->
  <metadataFilters>
    <metadataFilter class="org.apache.tika.langdetect.opennlp.metadatafilter.OpenNLPMetadataFilter"/>
  </metadataFilters>
</properties>
\ No newline at end of file
diff --git a/tika-server/tika-server-standard/src/test/resources/config/tika-config-langdetect-optimaize-filter.xml b/tika-server/tika-server-standard/src/test/resources/config/tika-config-langdetect-optimaize-filter.xml
new file mode 100644
index 000000000..82c623b24
--- /dev/null
+++ b/tika-server/tika-server-standard/src/test/resources/config/tika-config-langdetect-optimaize-filter.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
<properties>
  <parsers>
    <!-- Use the default parser set, but exclude PDFParser so it can be
         re-registered below with custom params. -->
    <parser class="org.apache.tika.parser.DefaultParser">
      <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
    </parser>
    <parser class="org.apache.tika.parser.pdf.PDFParser">
      <params>
        <param name="sortByPosition" type="bool">true</param>
      </params>
    </parser>
  </parsers>
  <!-- Run Optimaize language detection on extracted content after parsing. -->
  <metadataFilters>
    <metadataFilter class="org.apache.tika.langdetect.optimaize.metadatafilter.OptimaizeMetadataFilter"/>
  </metadataFilters>
</properties>
\ No newline at end of file