Posted to commits@tika.apache.org by ta...@apache.org on 2022/03/17 20:54:22 UTC

[tika] branch TIKA-3695 updated: TIKA-3695 -- implement filterExisting in StandardWriteFilter. Create standalone unit test.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3695
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/TIKA-3695 by this push:
     new 279ad33  TIKA-3695 -- implement filterExisting in StandardWriteFilter.  Create standalone unit test.
279ad33 is described below

commit 279ad33908728fa584ff31e529053b617e2e9837
Author: tallison <ta...@apache.org>
AuthorDate: Thu Mar 17 16:54:08 2022 -0400

    TIKA-3695 -- implement filterExisting in StandardWriteFilter.  Create standalone unit test.
---
 CHANGES.txt                                        |   3 +
 .../java/org/apache/tika/metadata/Metadata.java    |   4 +-
 .../apache/tika/metadata/StandardWriteFilter.java  |  40 +++++++-
 .../org/apache/tika/parser/AutoDetectParser.java   |   2 +-
 .../org/apache/tika/config/TikaConfigTest.java     |  69 --------------
 .../tika/metadata/MetadataWriteFilterTest.java     | 106 +++++++++++++++++++++
 6 files changed, 152 insertions(+), 72 deletions(-)
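
For readers following TIKA-3695: the practical effect of filterExisting is that the
write filter now applies to values that were already in the Metadata object before the
filter was set, not just to values added afterwards. A minimal caller-side sketch,
assuming a tika-config that configures a StandardWriteFilterFactory with an
include-fields list, as in the TIKA-3695-fields.xml test resource used by the new unit
test below:

    TikaConfig tikaConfig =
            new TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3695-fields.xml"));
    MetadataWriteFilter filter = tikaConfig.getAutoDetectParserConfig()
            .getMetadataWriteFilterFactory().newInstance();

    Metadata metadata = new Metadata();
    metadata.add("not-allowed", "added before the filter is set");
    metadata.add(Metadata.CONTENT_TYPE, "text/plain");

    // setMetadataWriteFilter(...) now calls filterExisting(...) on the backing map:
    // "not-allowed" is pruned because it is not in the configured include list,
    // while Content-Type survives because it is in ALWAYS_INCLUDE_FIELDS.
    metadata.setMetadataWriteFilter(filter);

    // Subsequent writes go through the filter as before.
    metadata.add("not-allowed", "rejected at write time");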

diff --git a/CHANGES.txt b/CHANGES.txt
index a78be3e..eea54b3 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -6,6 +6,9 @@ Release 2.4.0 - ???
      https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-ml/tika-dl/pom.xml
      for the dependencies that must be provided at run-time (TIKA-3676).
 
+   * Add MetadataWriteFilter capability to improve memory profile in
+     Metadata objects (TIKA-3695).
+
    * Add detection for Frictionless Data packages and WACZ (TIKA-3696).
 
    * Upgrade deeplearning4j to 1.0.0-M2 (TIKA-3458 and PR#527).
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
index 6d1bc0b..da71504 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
@@ -60,6 +60,7 @@ public class Metadata
      */
     private Map<String, String[]> metadata = null;
 
+    //TODO: transient?
     private MetadataWriteFilter writeFilter = ACCEPT_ALL;
     /**
      * Constructs a new, empty metadata.
@@ -144,8 +145,9 @@ public class Metadata
      * will not function properly.
      *
      * @param writeFilter
+     * @since 2.4.0
      */
-    public void setWriteFilter(MetadataWriteFilter writeFilter) {
+    public void setMetadataWriteFilter(MetadataWriteFilter writeFilter) {
         this.writeFilter = writeFilter;
         this.writeFilter.filterExisting(metadata);
     }
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilter.java
index 8bc47b6..4a26ebb 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilter.java
@@ -22,7 +22,9 @@ import java.nio.CharBuffer;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CodingErrorAction;
 import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
@@ -53,9 +55,16 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
     static {
         ALWAYS_INCLUDE_FIELDS.add(Metadata.CONTENT_LENGTH);
         ALWAYS_INCLUDE_FIELDS.add(Metadata.CONTENT_TYPE);
+        ALWAYS_INCLUDE_FIELDS.add(Metadata.CONTENT_ENCODING);
+        ALWAYS_INCLUDE_FIELDS.add(TikaCoreProperties.CONTENT_TYPE_USER_OVERRIDE.getName());
         ALWAYS_INCLUDE_FIELDS.add(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE.getName());
         ALWAYS_INCLUDE_FIELDS.add(TikaCoreProperties.CONTENT_TYPE_HINT.getName());
         ALWAYS_INCLUDE_FIELDS.add(TikaCoreProperties.TIKA_CONTENT.getName());
+        ALWAYS_INCLUDE_FIELDS.add(TikaCoreProperties.RESOURCE_NAME_KEY);
+        ALWAYS_INCLUDE_FIELDS.add(AccessPermissions.EXTRACT_CONTENT.getName());
+        ALWAYS_INCLUDE_FIELDS.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY.getName());
+        ALWAYS_INCLUDE_FIELDS.add(Metadata.CONTENT_DISPOSITION);
+        //Metadata.CONTENT_LOCATION? used by the html parser
     }
 
     private final boolean includeEmpty;
@@ -86,7 +95,36 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
 
     @Override
     public void filterExisting(Map<String, String[]> data) {
+        //this is somewhat costly, but it ensures that
+        //metadata that was placed in the metadata object before this
+        //filter was applied is removed.
+        //It should only be called once, and probably not on that
+        //many fields.
+        Set<String> toRemove = new HashSet<>();
+        for (String n : data.keySet()) {
+            if (! includeField(n)) {
+                toRemove.add(n);
+            }
+        }
+
+        for (String n : toRemove) {
+            data.remove(n);
+        }
 
+        for (String n : data.keySet()) {
+            String[] vals = data.get(n);
+            List<String> filteredVals = new ArrayList<>();
+            for (int i = 0; i < vals.length; i++) {
+                String v = vals[i];
+                if (include(n, v)) {
+                    String filtered = filter(n, v, data);
+                    if (filtered != null) {
+                        filteredVals.add(filtered);
+                    }
+                }
+            }
+            data.put(n, filteredVals.toArray(new String[0]));
+        }
     }
 
     @Override
@@ -122,7 +160,7 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
         ByteBuffer bb = ByteBuffer.wrap(bytes, 0, available);
         CharBuffer cb = CharBuffer.allocate(available);
         CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder();
-        // Ignore an incomplete character
+        // Ignore last (potentially) incomplete character
         decoder.onMalformedInput(CodingErrorAction.IGNORE);
         decoder.decode(bb, cb, true);
         decoder.flush(cb);
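
The filterExisting implementation above works in two passes over the raw name -> values
map: it first drops keys that fail includeField, then re-runs the surviving values
through the same include/filter logic used for normal writes. A short sketch of that
contract, assuming writeFilter is a configured MetadataWriteFilter whose include list
contains dc:creator but not "not-allowed" (field names borrowed from the tests, for
illustration only):

    Map<String, String[]> data = new HashMap<>();
    data.put("dc:creator", new String[]{"01234567890123456789"});
    data.put("not-allowed", new String[]{"removed in the first pass"});

    writeFilter.filterExisting(data);

    // "not-allowed" is gone; the surviving dc:creator value is re-run through
    // include(...)/filter(...), so it may come back truncated once the filter's
    // estimated byte budget (maxEstimatedBytes) is exhausted.
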
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index 7b4157c..7650b23 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -130,7 +130,7 @@ public class AutoDetectParser extends CompositeParser {
     public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                       ParseContext context) throws IOException, SAXException, TikaException {
         if (autoDetectParserConfig.getMetadataWriteFilterFactory() != null) {
-            metadata.setWriteFilter(autoDetectParserConfig.getMetadataWriteFilterFactory().newInstance());
+            metadata.setMetadataWriteFilter(autoDetectParserConfig.getMetadataWriteFilterFactory().newInstance());
         }
         TemporaryResources tmp = new TemporaryResources();
         try {
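
With this change, AutoDetectParser installs a fresh filter from the configured factory
at the start of each parse, which is also when filterExisting runs over any metadata the
caller passed in. A hedged end-to-end sketch mirroring the new unit test below (the
tikaConfig/stream variables and BodyContentHandler are illustration-only assumptions,
not part of this diff):

    AutoDetectParser parser = new AutoDetectParser(tikaConfig);
    Metadata metadata = new Metadata();
    metadata.add("not-allowed", "pruned when the per-parse filter is installed");

    parser.parse(stream, new BodyContentHandler(-1), metadata, new ParseContext());

    // If the filter hit its estimated-size limit during the parse:
    String limitReached = metadata.get(TikaCoreProperties.METADATA_LIMIT_REACHED);
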
diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
index da643f8..2f213ff 100644
--- a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
+++ b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
@@ -23,11 +23,9 @@ import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assertions.fail;
 
-import java.io.ByteArrayInputStream;
 import java.io.InputStream;
 import java.net.URI;
 import java.net.URL;
-import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.List;
@@ -40,9 +38,6 @@ import org.apache.tika.ResourceLoggingClassLoader;
 import org.apache.tika.exception.TikaConfigException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.MetadataWriteFilterFactory;
-import org.apache.tika.metadata.StandardWriteFilterFactory;
-import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MimeDetectionTest;
 import org.apache.tika.parser.AutoDetectParser;
@@ -51,7 +46,6 @@ import org.apache.tika.parser.CompositeParser;
 import org.apache.tika.parser.DefaultParser;
 import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ErrorParser;
-import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParserDecorator;
 import org.apache.tika.parser.mock.MockParser;
@@ -380,7 +374,6 @@ public class TikaConfigTest extends AbstractTikaConfigTest {
         });
     }
 
-
     @Test
     public void testTimesInitiated() throws Exception {
         //this prevents multi-threading tests, but we aren't doing that now...
@@ -401,66 +394,4 @@ public class TikaConfigTest extends AbstractTikaConfigTest {
         assertNull(config.getMaximumDepth());
         assertNull(config.getMaximumPackageEntryDepth());
     }
-
-    @Test
-    public void testMetadataFactoryConfig() throws Exception {
-        TikaConfig tikaConfig =
-                new TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3695.xml"));
-        AutoDetectParserConfig config = tikaConfig.getAutoDetectParserConfig();
-        MetadataWriteFilterFactory factory = config.getMetadataWriteFilterFactory();
-        assertEquals(241, ((StandardWriteFilterFactory) factory).getMaxEstimatedBytes());
-        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
-        String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" +
-                "<mock>";
-        for (int i = 0; i < 20; i++) {
-            mock += "<metadata action=\"add\" name=\"dc:creator\">01234567890123456789</metadata>";
-        }
-        mock += "<write element=\"p\" times=\"30\"> hello </write>\n";
-        mock += "</mock>";
-        Metadata metadata = new Metadata();
-        List<Metadata> metadataList =
-                getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)),
-                parser, metadata, new ParseContext(), true);
-        assertEquals(1, metadataList.size());
-        metadata = metadataList.get(0);
-
-        String[] creators = metadata.getValues("dc:creator");
-        assertEquals(9, creators.length);
-        assertEquals("0123456", creators[8]);
-        assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30);
-        assertEquals("true", metadata.get(TikaCoreProperties.METADATA_LIMIT_REACHED));
-    }
-
-    @Test
-    public void testMetadataFactoryFieldsConfig() throws Exception {
-        TikaConfig tikaConfig =
-                new TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3695-fields.xml"));
-        AutoDetectParserConfig config = tikaConfig.getAutoDetectParserConfig();
-        MetadataWriteFilterFactory factory = config.getMetadataWriteFilterFactory();
-        assertEquals(241, ((StandardWriteFilterFactory) factory).getMaxEstimatedBytes());
-        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
-        String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" +
-                "<mock>";
-        mock += "<metadata action=\"add\" name=\"dc:subject\">this is not a title</metadata>";
-        mock += "<metadata action=\"add\" name=\"dc:title\">this is a title</metadata>";
-        for (int i = 0; i < 20; i++) {
-            mock += "<metadata action=\"add\" name=\"dc:creator\">01234567890123456789</metadata>";
-        }
-        mock += "<write element=\"p\" times=\"30\"> hello </write>\n";
-        mock += "</mock>";
-        Metadata metadata = new Metadata();
-        List<Metadata> metadataList =
-                getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)),
-                        parser, metadata, new ParseContext(), true);
-        assertEquals(1, metadataList.size());
-        metadata = metadataList.get(0);
-
-        String[] creators = metadata.getValues("dc:creator");
-        assertNull(metadata.get("dc:subject"));
-        //this gets more than the other test because this is filtering out X-TIKA:Parsed-By"
-        assertEquals(12, creators.length);
-        assertEquals("012345", creators[11]);
-        assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30);
-        assertEquals("true", metadata.get(TikaCoreProperties.METADATA_LIMIT_REACHED));
-    }
 }
diff --git a/tika-core/src/test/java/org/apache/tika/metadata/MetadataWriteFilterTest.java b/tika-core/src/test/java/org/apache/tika/metadata/MetadataWriteFilterTest.java
new file mode 100644
index 0000000..54175ac
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/metadata/MetadataWriteFilterTest.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.config.TikaConfigTest;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.AutoDetectParserConfig;
+import org.apache.tika.parser.ParseContext;
+
+public class MetadataWriteFilterTest extends TikaTest {
+
+
+    @Test
+    public void testMetadataFactoryConfig() throws Exception {
+        TikaConfig tikaConfig =
+                new TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3695.xml"));
+        AutoDetectParserConfig config = tikaConfig.getAutoDetectParserConfig();
+        MetadataWriteFilterFactory factory = config.getMetadataWriteFilterFactory();
+        assertEquals(241, ((StandardWriteFilterFactory) factory).getMaxEstimatedBytes());
+        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+        String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" +
+                "<mock>";
+        for (int i = 0; i < 20; i++) {
+            mock += "<metadata action=\"add\" name=\"dc:creator\">01234567890123456789</metadata>";
+        }
+        mock += "<write element=\"p\" times=\"30\"> hello </write>\n";
+        mock += "</mock>";
+        Metadata metadata = new Metadata();
+        List<Metadata> metadataList =
+                getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)),
+                        parser, metadata, new ParseContext(), true);
+        assertEquals(1, metadataList.size());
+        metadata = metadataList.get(0);
+
+        String[] creators = metadata.getValues("dc:creator");
+        assertEquals(9, creators.length);
+        assertEquals("0123456", creators[8]);
+        assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30);
+        assertEquals("true", metadata.get(TikaCoreProperties.METADATA_LIMIT_REACHED));
+    }
+
+    @Test
+    public void testMetadataFactoryFieldsConfig() throws Exception {
+        TikaConfig tikaConfig =
+                new TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3695-fields.xml"));
+        AutoDetectParserConfig config = tikaConfig.getAutoDetectParserConfig();
+        MetadataWriteFilterFactory factory = config.getMetadataWriteFilterFactory();
+        assertEquals(241, ((StandardWriteFilterFactory) factory).getMaxEstimatedBytes());
+        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+        String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" +
+                "<mock>";
+        mock += "<metadata action=\"add\" name=\"dc:subject\">this is not a title</metadata>";
+        mock += "<metadata action=\"add\" name=\"dc:title\">this is a title</metadata>";
+        for (int i = 0; i < 20; i++) {
+            mock += "<metadata action=\"add\" name=\"dc:creator\">01234567890123456789</metadata>";
+        }
+        mock += "<write element=\"p\" times=\"30\"> hello </write>\n";
+        mock += "</mock>";
+        Metadata metadata = new Metadata();
+        metadata.add("dc:creator", "abcdefghijabcdefghij");
+        metadata.add("not-allowed", "not-allowed");
+        List<Metadata> metadataList =
+                getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)),
+                        parser, metadata, new ParseContext(), true);
+        assertEquals(1, metadataList.size());
+        metadata = metadataList.get(0);
+        //test that this was removed during the filter existing stage
+        assertNull(metadata.get("not-allowed"));
+        //test that this was not allowed because it isn't in the "include" list
+        assertNull(metadata.get("dc:subject"));
+
+        String[] creators = metadata.getValues("dc:creator");
+        assertEquals("abcdefghijabcdefghij", creators[0]);
+
+        //this gets more than the other test because this is filtering out X-TIKA:Parsed-By", etc.
+        assertEquals(12, creators.length);
+        assertEquals("012345", creators[11]);
+        assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30);
+        assertEquals("true", metadata.get(TikaCoreProperties.METADATA_LIMIT_REACHED));
+    }
+}