You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/03/17 15:34:41 UTC
[tika] branch TIKA-3695 updated: TIKA-3695 -- add fields example and test
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch TIKA-3695
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-3695 by this push:
new fd5dec5 TIKA-3695 -- add fields example and test
fd5dec5 is described below
commit fd5dec59fe458db78a7c3acd789f92ce85ad72cc
Author: tallison <ta...@apache.org>
AuthorDate: Thu Mar 17 11:31:08 2022 -0400
TIKA-3695 -- add fields example and test
---
.../tika/metadata/StandardWriteFilterFactory.java | 3 +-
.../org/apache/tika/config/TikaConfigTest.java | 35 +++++++++++++++++++-
.../org/apache/tika/config/TIKA-3695-fields.xml | 37 ++++++++++++++++++++++
3 files changed, 73 insertions(+), 2 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilterFactory.java b/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilterFactory.java
index 43c623c..3c9e6a1 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilterFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilterFactory.java
@@ -17,6 +17,7 @@
package org.apache.tika.metadata;
import java.util.Collections;
+import java.util.List;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
@@ -31,7 +32,7 @@ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory {
return new StandardWriteFilter(maxEstimatedBytes, includeFields, includeEmpty);
}
- public void setIncludeFields(Set<String> includeFields) {
+ public void setIncludeFields(List<String> includeFields) {
Set<String> keys = ConcurrentHashMap.newKeySet(includeFields.size());
keys.addAll(includeFields);
this.includeFields = Collections.unmodifiableSet(keys);
diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
index 2642c55..da643f8 100644
--- a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
+++ b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
@@ -412,7 +412,7 @@ public class TikaConfigTest extends AbstractTikaConfigTest {
AutoDetectParser parser = new AutoDetectParser(tikaConfig);
String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" +
"<mock>";
- for (int i = 0; i < 10; i++) {
+ for (int i = 0; i < 20; i++) {
mock += "<metadata action=\"add\" name=\"dc:creator\">01234567890123456789</metadata>";
}
mock += "<write element=\"p\" times=\"30\"> hello </write>\n";
@@ -430,4 +430,37 @@ public class TikaConfigTest extends AbstractTikaConfigTest {
assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30);
assertEquals("true", metadata.get(TikaCoreProperties.METADATA_LIMIT_REACHED));
}
+
+ @Test
+ public void testMetadataFactoryFieldsConfig() throws Exception {
+ TikaConfig tikaConfig =
+ new TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3695-fields.xml"));
+ AutoDetectParserConfig config = tikaConfig.getAutoDetectParserConfig();
+ MetadataWriteFilterFactory factory = config.getMetadataWriteFilterFactory();
+ assertEquals(241, ((StandardWriteFilterFactory) factory).getMaxEstimatedBytes());
+ AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+ String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" +
+ "<mock>";
+ mock += "<metadata action=\"add\" name=\"dc:subject\">this is not a title</metadata>";
+ mock += "<metadata action=\"add\" name=\"dc:title\">this is a title</metadata>";
+ for (int i = 0; i < 20; i++) {
+ mock += "<metadata action=\"add\" name=\"dc:creator\">01234567890123456789</metadata>";
+ }
+ mock += "<write element=\"p\" times=\"30\"> hello </write>\n";
+ mock += "</mock>";
+ Metadata metadata = new Metadata();
+ List<Metadata> metadataList =
+ getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)),
+ parser, metadata, new ParseContext(), true);
+ assertEquals(1, metadataList.size());
+ metadata = metadataList.get(0);
+
+ String[] creators = metadata.getValues("dc:creator");
+ assertNull(metadata.get("dc:subject"));
+ //this gets more than the other test because this is filtering out X-TIKA:Parsed-By
+ assertEquals(12, creators.length);
+ assertEquals("012345", creators[11]);
+ assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30);
+ assertEquals("true", metadata.get(TikaCoreProperties.METADATA_LIMIT_REACHED));
+ }
}
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-fields.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-fields.xml
new file mode 100644
index 0000000..26e6fae
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-fields.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ </parsers>
+ <autoDetectParserConfig>
+ <params>
+ <spoolToDisk>12345</spoolToDisk>
+ <outputThreshold>6789</outputThreshold>
+ </params>
+ <metadataWriteFilterFactory class="org.apache.tika.metadata.StandardWriteFilterFactory">
+ <params>
+ <maxEstimatedBytes>241</maxEstimatedBytes>
+ <includeFields>
+ <field>dc:creator</field>
+ <field>dc:title</field>
+ </includeFields>
+ </params>
+ </metadataWriteFilterFactory>
+ </autoDetectParserConfig>
+</properties>