You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/03/17 15:34:41 UTC

[tika] branch TIKA-3695 updated: TIKA-3695 -- add fields example and test

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3695
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/TIKA-3695 by this push:
     new fd5dec5  TIKA-3695 -- add fields example and test
fd5dec5 is described below

commit fd5dec59fe458db78a7c3acd789f92ce85ad72cc
Author: tallison <ta...@apache.org>
AuthorDate: Thu Mar 17 11:31:08 2022 -0400

    TIKA-3695 -- add fields example and test
---
 .../tika/metadata/StandardWriteFilterFactory.java  |  3 +-
 .../org/apache/tika/config/TikaConfigTest.java     | 35 +++++++++++++++++++-
 .../org/apache/tika/config/TIKA-3695-fields.xml    | 37 ++++++++++++++++++++++
 3 files changed, 73 insertions(+), 2 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilterFactory.java b/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilterFactory.java
index 43c623c..3c9e6a1 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilterFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilterFactory.java
@@ -17,6 +17,7 @@
 package org.apache.tika.metadata;
 
 import java.util.Collections;
+import java.util.List;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 
@@ -31,7 +32,7 @@ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory {
         return new StandardWriteFilter(maxEstimatedBytes, includeFields, includeEmpty);
     }
 
-    public void setIncludeFields(Set<String> includeFields) {
+    public void setIncludeFields(List<String> includeFields) {
         Set<String> keys = ConcurrentHashMap.newKeySet(includeFields.size());
         keys.addAll(includeFields);
         this.includeFields = Collections.unmodifiableSet(keys);
diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
index 2642c55..da643f8 100644
--- a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
+++ b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
@@ -412,7 +412,7 @@ public class TikaConfigTest extends AbstractTikaConfigTest {
         AutoDetectParser parser = new AutoDetectParser(tikaConfig);
         String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" +
                 "<mock>";
-        for (int i = 0; i < 10; i++) {
+        for (int i = 0; i < 20; i++) {
             mock += "<metadata action=\"add\" name=\"dc:creator\">01234567890123456789</metadata>";
         }
         mock += "<write element=\"p\" times=\"30\"> hello </write>\n";
@@ -430,4 +430,37 @@ public class TikaConfigTest extends AbstractTikaConfigTest {
         assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30);
         assertEquals("true", metadata.get(TikaCoreProperties.METADATA_LIMIT_REACHED));
     }
+
+    @Test
+    public void testMetadataFactoryFieldsConfig() throws Exception {
+        TikaConfig tikaConfig =
+                new TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3695-fields.xml"));
+        AutoDetectParserConfig config = tikaConfig.getAutoDetectParserConfig();
+        MetadataWriteFilterFactory factory = config.getMetadataWriteFilterFactory();
+        assertEquals(241, ((StandardWriteFilterFactory) factory).getMaxEstimatedBytes());
+        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+        String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" +
+                "<mock>";
+        mock += "<metadata action=\"add\" name=\"dc:subject\">this is not a title</metadata>";
+        mock += "<metadata action=\"add\" name=\"dc:title\">this is a title</metadata>";
+        for (int i = 0; i < 20; i++) {
+            mock += "<metadata action=\"add\" name=\"dc:creator\">01234567890123456789</metadata>";
+        }
+        mock += "<write element=\"p\" times=\"30\"> hello </write>\n";
+        mock += "</mock>";
+        Metadata metadata = new Metadata();
+        List<Metadata> metadataList =
+                getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)),
+                        parser, metadata, new ParseContext(), true);
+        assertEquals(1, metadataList.size());
+        metadata = metadataList.get(0);
+
+        String[] creators = metadata.getValues("dc:creator");
+        assertNull(metadata.get("dc:subject"));
+        //this gets more than the other test because this is filtering out "X-TIKA:Parsed-By"
+        assertEquals(12, creators.length);
+        assertEquals("012345", creators[11]);
+        assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30);
+        assertEquals("true", metadata.get(TikaCoreProperties.METADATA_LIMIT_REACHED));
+    }
 }
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-fields.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-fields.xml
new file mode 100644
index 0000000..26e6fae
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-fields.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser"/>
+  </parsers>
+  <autoDetectParserConfig>
+    <params>
+      <spoolToDisk>12345</spoolToDisk>
+      <outputThreshold>6789</outputThreshold>
+    </params>
+    <metadataWriteFilterFactory class="org.apache.tika.metadata.StandardWriteFilterFactory">
+      <params>
+        <maxEstimatedBytes>241</maxEstimatedBytes>
+        <includeFields>
+          <field>dc:creator</field>
+          <field>dc:title</field>
+        </includeFields>
+      </params>
+    </metadataWriteFilterFactory>
+  </autoDetectParserConfig>
+</properties>