You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/03/17 21:38:05 UTC

[tika] branch main updated: TIKA-3695 (#534)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 000abdc  TIKA-3695 (#534)
000abdc is described below

commit 000abdcf70112df1a2a9a433e308c1fe5db1d45e
Author: Tim Allison <ta...@apache.org>
AuthorDate: Thu Mar 17 17:37:57 2022 -0400

    TIKA-3695 (#534)
    
    * TIKA-3695 -- initial commit. WIP do not merge.
    
    * TIKA-3695 -- add fields example and test
    
    * TIKA-3695 -- implement filterExisting in StandardWriteFilter.  Create standalone unit test.
---
 CHANGES.txt                                        |   3 +
 .../java/org/apache/tika/metadata/Metadata.java    |  94 +++++++++-
 .../apache/tika/metadata/MetadataWriteFilter.java  |  42 +++++
 .../tika/metadata/MetadataWriteFilterFactory.java  |  21 +++
 .../apache/tika/metadata/StandardWriteFilter.java  | 199 +++++++++++++++++++++
 .../tika/metadata/StandardWriteFilterFactory.java  |  58 ++++++
 .../apache/tika/metadata/TikaCoreProperties.java   |   4 +
 .../org/apache/tika/parser/AutoDetectParser.java   |   3 +
 .../apache/tika/parser/AutoDetectParserConfig.java |  11 ++
 .../org/apache/tika/config/TikaConfigTest.java     |   1 -
 .../tika/metadata/MetadataWriteFilterTest.java     | 106 +++++++++++
 .../org/apache/tika/config/TIKA-3695-fields.xml    |  37 ++++
 .../resources/org/apache/tika/config/TIKA-3695.xml |  33 ++++
 13 files changed, 607 insertions(+), 5 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index a78be3e..eea54b3 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -6,6 +6,9 @@ Release 2.4.0 - ???
      https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-ml/tika-dl/pom.xml
      for the dependencies that must be provided at run-time (TIKA-3676).
 
+   * Add MetadataWriteFilter capability to improve memory profile in
+     Metadata objects (TIKA-3695).
+
    * Add detection for Frictionless Data packages and WACZ (TIKA-3696).
 
    * Upgrade deeplearning4j to 1.0.0-M2 (TIKA-3458 and PR#527).
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
index bfed8ef..da71504 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
@@ -44,6 +44,8 @@ public class Metadata
         implements CreativeCommons, Geographic, HttpHeaders, Message, ClimateForcast, TIFF,
         TikaMimeKeys, Serializable {
 
+    private static final MetadataWriteFilter ACCEPT_ALL = new AcceptAll();
+
     /**
      * Serial version UID
      */
@@ -58,6 +60,8 @@ public class Metadata
      */
     private Map<String, String[]> metadata = null;
 
+    //TODO: transient?
+    private MetadataWriteFilter writeFilter = ACCEPT_ALL;
     /**
      * Constructs a new, empty metadata.
      */
@@ -132,6 +136,23 @@ public class Metadata
     }
 
     /**
+     * Sets the writeFilter that is called before {@link #set(String, String)},
+     * {@link #set(String, String[])}, {@link #add(String, String)},
+     * {@link #add(String, String[])}.  The default is {@link #ACCEPT_ALL}.
+     *
+     * This is intended for expert use only.  Some parsers rely on metadata
+     * during the parse, and if the metadata they need is excluded, they
+     * will not function properly.
+     *
+     * @param writeFilter
+     * @since 2.4.0
+     */
+    public void setMetadataWriteFilter(MetadataWriteFilter writeFilter) {
+        this.writeFilter = writeFilter;
+        this.writeFilter.filterExisting(metadata);
+    }
+
+    /**
      * Returns the value (if any) of the identified metadata property.
      *
      * @param property property definition
@@ -236,11 +257,35 @@ public class Metadata
      * @param value the metadata value.
      */
     public void add(final String name, final String value) {
+        if (!writeFilter.include(name, value)) {
+            return;
+        }
         String[] values = metadata.get(name);
         if (values == null) {
             set(name, value);
         } else {
-            metadata.put(name, appendedValues(values, value));
+            String filtered = writeFilter.filter(name, value, metadata);
+            if (filtered != null) {
+                metadata.put(name, appendedValues(values, filtered));
+            }
+        }
+    }
+
+    /**
+     * Add a metadata name/value mapping. Add the specified value to the list of
+     * values associated to the specified metadata name.
+     *
+     * @param name  the metadata name.
+     * @param newValues the metadata values
+     */
+    protected void add(final String name, final String[] newValues) {
+        String[] values = metadata.get(name);
+        if (values == null) {
+            set(name, newValues);
+        } else {
+            for (String val : newValues) {
+                add(name, val);
+            }
         }
     }
 
@@ -270,7 +315,7 @@ public class Metadata
                 set(property, value);
             } else {
                 if (property.isMultiValuePermitted()) {
-                    set(property, appendedValues(values, value));
+                    add(property.getName(), value);
                 } else {
                     throw new PropertyTypeException(
                             property.getName() + " : " + property.getPropertyType());
@@ -303,8 +348,28 @@ public class Metadata
      * @param value the metadata value, or <code>null</code>
      */
     public void set(String name, String value) {
+        if (! writeFilter.include(name, value)) {
+            return;
+        }
         if (value != null) {
-            metadata.put(name, new String[]{value});
+            metadata.remove(name);
+            String filtered = writeFilter.filter(name, value, metadata);
+            if (filtered != null) {
+                metadata.put(name, new String[]{filtered});
+            }
+        } else {
+            metadata.remove(name);
+        }
+    }
+
+    protected void set(String name, String[] values) {
+        //TODO: optimize this to not copy if all
+        //values are to be included "as is"
+        if (values != null) {
+            metadata.remove(name);
+            for (String v : values) {
+                add(name, v);
+            }
         } else {
             metadata.remove(name);
         }
@@ -352,7 +417,7 @@ public class Metadata
                 }
             }
         } else {
-            metadata.put(property.getName(), values);
+            set(property.getName(), values);
         }
     }
 
@@ -599,4 +664,25 @@ public class Metadata
         return buf.toString();
     }
 
+    /**
+     * NO-OP write filter that accepts everything without modification.
+     */
+    private static class AcceptAll implements MetadataWriteFilter, Serializable {
+
+        @Override
+        public void filterExisting(Map<String, String[]> data) {
+            return;
+        }
+
+        @Override
+        public boolean include(String field, String value) {
+            return true;
+        }
+
+        @Override
+        public String filter(String field, String value, Map<String, String[]> data) {
+            return value;
+        }
+    }
+
 }
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MetadataWriteFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/MetadataWriteFilter.java
new file mode 100644
index 0000000..7918a63
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/MetadataWriteFilter.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+import java.util.Map;
+
+public interface MetadataWriteFilter {
+
+    void filterExisting(Map<String, String[]> data);
+
+    boolean include(String field, String value);
+
+    /**
+     * Based on the field and value, this filter modifies the value
+     * to something that should be set or added to the Metadata object.
+     *
+     * If the value is <code>null</code>, no value is set or added.
+     *
+     * Status updates (e.g. write limit reached) can be added directly to the
+     * underlying metadata.
+     *
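+     * @param field name of the metadata field being set or added
+     * @param value candidate value for the field
+     * @param data underlying metadata map; status updates may be written to it
+     * @return the value to set or add, or <code>null</code> to write nothing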
+     */
+    String filter(String field, String value, Map<String, String[]> data);
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MetadataWriteFilterFactory.java b/tika-core/src/main/java/org/apache/tika/metadata/MetadataWriteFilterFactory.java
new file mode 100644
index 0000000..9c9c0c1
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/MetadataWriteFilterFactory.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+public interface MetadataWriteFilterFactory {
+    MetadataWriteFilter newInstance();
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilter.java
new file mode 100644
index 0000000..4a26ebb
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilter.java
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+import java.io.Serializable;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.tika.utils.StringUtils;
+
+/**
+ * This is to be used to limit the amount of metadata that a
+ * parser can add based on the {@link #maxEstimatedSize}. The
+ * maxEstimatedSize is measured in UTF-8 bytes.
+ *
+ * This can also be used to limit the fields that are stored
+ * in the metadata object at write-time with {@link #includeFields}.
+ *
+ * <b>NOTE:</b> Fields in {@link #ALWAYS_INCLUDE_FIELDS} are
+ * always included, and their sizes are not included in the
+ * calculation of metadata size.
+ *
+ * <b>NOTE:</b> after the maxEstimatedSize has been hit, no
+ * further modifications to the metadata object will be allowed aside
+ * from adding/setting fields in the {@link #ALWAYS_INCLUDE_FIELDS}.
+ *
+ * <b>NOTE:</b> as with {@link Metadata}, this object is not thread safe.
+ */
+public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
+
+    public static final Set<String> ALWAYS_INCLUDE_FIELDS = new HashSet<>();
+
+    static {
+        ALWAYS_INCLUDE_FIELDS.add(Metadata.CONTENT_LENGTH);
+        ALWAYS_INCLUDE_FIELDS.add(Metadata.CONTENT_TYPE);
+        ALWAYS_INCLUDE_FIELDS.add(Metadata.CONTENT_ENCODING);
+        ALWAYS_INCLUDE_FIELDS.add(TikaCoreProperties.CONTENT_TYPE_USER_OVERRIDE.getName());
+        ALWAYS_INCLUDE_FIELDS.add(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE.getName());
+        ALWAYS_INCLUDE_FIELDS.add(TikaCoreProperties.CONTENT_TYPE_HINT.getName());
+        ALWAYS_INCLUDE_FIELDS.add(TikaCoreProperties.TIKA_CONTENT.getName());
+        ALWAYS_INCLUDE_FIELDS.add(TikaCoreProperties.RESOURCE_NAME_KEY);
+        ALWAYS_INCLUDE_FIELDS.add(AccessPermissions.EXTRACT_CONTENT.getName());
+        ALWAYS_INCLUDE_FIELDS.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY.getName());
+        ALWAYS_INCLUDE_FIELDS.add(Metadata.CONTENT_DISPOSITION);
+        //Metadata.CONTENT_LOCATION? used by the html parser
+    }
+
+    private final boolean includeEmpty;
+    private final int maxEstimatedSize;
+
+    private final Set<String> includeFields;
+
+    //tracks the estimated size in UTF-8 bytes. Can be > maxEstimatedSize
+    int estimatedSize = 0;
+
+    /**
+     * Creates a filter with a size budget and an optional list of fields to keep.
+     * @param maxEstimatedSize maximum estimated metadata size in UTF-8 bytes; must be &gt;= 0
+     * @param includeFields if null or empty, all fields are included; otherwise, which fields
+     *                      to add to the metadata object.
+     * @param includeEmpty if <code>true</code>, this will set or add an empty value to the
+     *                     metadata object.
+     */
+    public StandardWriteFilter(int maxEstimatedSize, Set<String> includeFields,
+                               boolean includeEmpty) {
+        if (maxEstimatedSize < 0) {
+            throw new IllegalArgumentException("max estimated size must be >= 0");
+        }
+        this.maxEstimatedSize = maxEstimatedSize;
+        this.includeFields = includeFields;
+        this.includeEmpty = includeEmpty;
+    }
+
+    /**
+     * Applies this filter to entries stored before the filter was installed.
+     * Survivors are collected in a scratch map because filter() may add the
+     * limit-reached key, which is unsafe on a map that is being iterated.
+     *
+     * @param data the backing metadata map; rewritten in place
+     */
+    @Override
+    public void filterExisting(Map<String, String[]> data) {
+        Map<String, String[]> kept = new java.util.HashMap<>();
+        for (Map.Entry<String, String[]> entry : data.entrySet()) {
+            String name = entry.getKey();
+            if (!includeField(name)) {
+                continue;
+            }
+            List<String> filteredVals = new ArrayList<>();
+            for (String v : entry.getValue()) {
+                if (!include(name, v)) {
+                    continue;
+                }
+                String filtered = filter(name, v, kept);
+                if (filtered != null) {
+                    filteredVals.add(filtered);
+                }
+            }
+            //a field with no surviving values is dropped, not mapped to an empty array
+            if (!filteredVals.isEmpty()) {
+                kept.put(name, filteredVals.toArray(new String[0]));
+            }
+        }
+        data.clear();
+        data.putAll(kept);
+    }
+
+    @Override
+    public boolean include(String field, String value) {
+        return includeField(field) && includeValue(value);
+    }
+
+    @Override
+    public String filter(String field, String value, Map<String, String[]> data) {
+        if (ALWAYS_INCLUDE_FIELDS.contains(field)) {
+            return value;
+        }
+        if (estimatedSize > maxEstimatedSize) {
+            return null;
+        }
+        long length = value.getBytes(StandardCharsets.UTF_8).length;
+        String toWrite = value;
+        if (estimatedSize + length > maxEstimatedSize) {
+            toWrite = truncate(value);
+            data.put(TikaCoreProperties.METADATA_LIMIT_REACHED.getName(), new String[]{"true"});
+        }
+        //this will by default bump the estimated size over what was actually written
+        //we are currently only using this as an indicator of whether to even try to write more.
+        //this value is not necessarily accurate.
+        estimatedSize += length;
+        return toWrite;
+    }
+
+    private String truncate(String value) {
+        //correctly handle multibyte characters
+        int available = maxEstimatedSize - estimatedSize;
+        byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
+        ByteBuffer bb = ByteBuffer.wrap(bytes, 0, available);
+        CharBuffer cb = CharBuffer.allocate(available);
+        CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder();
+        // Ignore last (potentially) incomplete character
+        decoder.onMalformedInput(CodingErrorAction.IGNORE);
+        decoder.decode(bb, cb, true);
+        decoder.flush(cb);
+        return new String(cb.array(), 0, cb.position());
+    }
+
+
+    /**
+     * Tests for null or empty. Does not check for length
+     * @param value
+     * @return
+     */
+    private boolean includeValue(String value) {
+        if (includeEmpty) {
+            return true;
+        }
+        if (StringUtils.isBlank(value)) {
+            return false;
+        }
+        return true;
+    }
+
+    /**
+     * Allow-list check. Always-included fields pass; a null or empty include set admits all.
+     */
+    private boolean includeField(String name) {
+        if (ALWAYS_INCLUDE_FIELDS.contains(name)) {
+            return true;
+        }
+        if (includeFields == null || includeFields.isEmpty()) {
+            return true;
+        }
+        return includeFields.contains(name);
+    }
+
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilterFactory.java b/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilterFactory.java
new file mode 100644
index 0000000..3c9e6a1
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilterFactory.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+
+public class StandardWriteFilterFactory implements MetadataWriteFilterFactory {
+
+    public static int DEFAULT_MAX_SIZE = 10 * 1024 * 1024;
+    private Set<String> includeFields = null;
+    private int maxEstimatedBytes = DEFAULT_MAX_SIZE;
+    private boolean includeEmpty = false;
+
+    public MetadataWriteFilter newInstance() {
+        return new StandardWriteFilter(maxEstimatedBytes, includeFields, includeEmpty);
+    }
+
+    public void setIncludeFields(List<String> includeFields) {
+        Set<String> keys = ConcurrentHashMap.newKeySet(includeFields.size());
+        keys.addAll(includeFields);
+        this.includeFields = Collections.unmodifiableSet(keys);
+    }
+
+    public void setMaxEstimatedBytes(int maxEstimatedBytes) {
+        this.maxEstimatedBytes = maxEstimatedBytes;
+    }
+
+    public void setIncludeEmpty(boolean includeEmpty) {
+        this.includeEmpty = includeEmpty;
+    }
+
+    @Override
+    public String toString() {
+        return "WriteFilteringMetadataFactory{" + "includeFields=" + includeFields +
+                ", maxEstimatedBytes=" + maxEstimatedBytes + ", includeEmpty=" + includeEmpty + '}';
+    }
+
+    public int getMaxEstimatedBytes() {
+        return maxEstimatedBytes;
+    }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index f6025c3..ea08f75 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -81,6 +81,10 @@ public interface TikaCoreProperties {
      */
     Property TIKA_META_EXCEPTION_WARNING =
             Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "warn");
+
+    Property METADATA_LIMIT_REACHED =
+            Property.internalBoolean(TIKA_META_EXCEPTION_PREFIX + "metadata_limit_reached");
+
     /**
      * Use this to store exceptions caught while trying to read the
      * stream of an embedded resource.  Do not use this if there is
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index 099536b..7650b23 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -129,6 +129,9 @@ public class AutoDetectParser extends CompositeParser {
 
     public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                       ParseContext context) throws IOException, SAXException, TikaException {
+        if (autoDetectParserConfig.getMetadataWriteFilterFactory() != null) {
+            metadata.setMetadataWriteFilter(autoDetectParserConfig.getMetadataWriteFilterFactory().newInstance());
+        }
         TemporaryResources tmp = new TemporaryResources();
         try {
             TikaInputStream tis = TikaInputStream.get(stream, tmp);
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
index 32ae7c4..1115365 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
@@ -23,6 +23,7 @@ import org.w3c.dom.Element;
 
 import org.apache.tika.config.ConfigBase;
 import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.metadata.MetadataWriteFilterFactory;
 
 /**
  * This config object can be used to tune how conservative we want to be
@@ -69,6 +70,8 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
      */
     private Integer maximumPackageEntryDepth = null;
 
+    private MetadataWriteFilterFactory metadataWriteFilterFactory = null;
+
     /**
      *  Creates a SecureContentHandlerConfig using the passed in parameters.
      *
@@ -131,5 +134,13 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
     public void setMaximumPackageEntryDepth(int maximumPackageEntryDepth) {
         this.maximumPackageEntryDepth = maximumPackageEntryDepth;
     }
+
+    public MetadataWriteFilterFactory getMetadataWriteFilterFactory() {
+        return this.metadataWriteFilterFactory;
+    }
+
+    public void setMetadataWriteFilterFactory(MetadataWriteFilterFactory metadataWriteFilterFactory) {
+        this.metadataWriteFilterFactory = metadataWriteFilterFactory;
+    }
 }
 
diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
index 5c8cbfd..2f213ff 100644
--- a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
+++ b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
@@ -374,7 +374,6 @@ public class TikaConfigTest extends AbstractTikaConfigTest {
         });
     }
 
-
     @Test
     public void testTimesInitiated() throws Exception {
         //this prevents multi-threading tests, but we aren't doing that now...
diff --git a/tika-core/src/test/java/org/apache/tika/metadata/MetadataWriteFilterTest.java b/tika-core/src/test/java/org/apache/tika/metadata/MetadataWriteFilterTest.java
new file mode 100644
index 0000000..54175ac
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/metadata/MetadataWriteFilterTest.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.config.TikaConfigTest;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.AutoDetectParserConfig;
+import org.apache.tika.parser.ParseContext;
+
+public class MetadataWriteFilterTest extends TikaTest {
+
+
+    @Test
+    public void testMetadataFactoryConfig() throws Exception {
+        TikaConfig tikaConfig =
+                new TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3695.xml"));
+        AutoDetectParserConfig config = tikaConfig.getAutoDetectParserConfig();
+        MetadataWriteFilterFactory factory = config.getMetadataWriteFilterFactory();
+        assertEquals(241, ((StandardWriteFilterFactory) factory).getMaxEstimatedBytes());
+        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+        String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" +
+                "<mock>";
+        for (int i = 0; i < 20; i++) {
+            mock += "<metadata action=\"add\" name=\"dc:creator\">01234567890123456789</metadata>";
+        }
+        mock += "<write element=\"p\" times=\"30\"> hello </write>\n";
+        mock += "</mock>";
+        Metadata metadata = new Metadata();
+        List<Metadata> metadataList =
+                getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)),
+                        parser, metadata, new ParseContext(), true);
+        assertEquals(1, metadataList.size());
+        metadata = metadataList.get(0);
+
+        String[] creators = metadata.getValues("dc:creator");
+        assertEquals(9, creators.length);
+        assertEquals("0123456", creators[8]);
+        assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30);
+        assertEquals("true", metadata.get(TikaCoreProperties.METADATA_LIMIT_REACHED));
+    }
+
+    @Test
+    public void testMetadataFactoryFieldsConfig() throws Exception {
+        TikaConfig tikaConfig =
+                new TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3695-fields.xml"));
+        AutoDetectParserConfig config = tikaConfig.getAutoDetectParserConfig();
+        MetadataWriteFilterFactory factory = config.getMetadataWriteFilterFactory();
+        assertEquals(241, ((StandardWriteFilterFactory) factory).getMaxEstimatedBytes());
+        AutoDetectParser parser = new AutoDetectParser(tikaConfig);
+        String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" +
+                "<mock>";
+        mock += "<metadata action=\"add\" name=\"dc:subject\">this is not a title</metadata>";
+        mock += "<metadata action=\"add\" name=\"dc:title\">this is a title</metadata>";
+        for (int i = 0; i < 20; i++) {
+            mock += "<metadata action=\"add\" name=\"dc:creator\">01234567890123456789</metadata>";
+        }
+        mock += "<write element=\"p\" times=\"30\"> hello </write>\n";
+        mock += "</mock>";
+        Metadata metadata = new Metadata();
+        metadata.add("dc:creator", "abcdefghijabcdefghij");
+        metadata.add("not-allowed", "not-allowed");
+        List<Metadata> metadataList =
+                getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)),
+                        parser, metadata, new ParseContext(), true);
+        assertEquals(1, metadataList.size());
+        metadata = metadataList.get(0);
+        //test that this was removed during the filter existing stage
+        assertNull(metadata.get("not-allowed"));
+        //test that this was not allowed because it isn't in the "include" list
+        assertNull(metadata.get("dc:subject"));
+
+        String[] creators = metadata.getValues("dc:creator");
+        assertEquals("abcdefghijabcdefghij", creators[0]);
+
+        //this gets more than the other test because this is filtering out X-TIKA:Parsed-By, etc.
+        assertEquals(12, creators.length);
+        assertEquals("012345", creators[11]);
+        assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30);
+        assertEquals("true", metadata.get(TikaCoreProperties.METADATA_LIMIT_REACHED));
+    }
+}
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-fields.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-fields.xml
new file mode 100644
index 0000000..26e6fae
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-fields.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser"/>
+  </parsers>
+  <autoDetectParserConfig>
+    <params>
+      <spoolToDisk>12345</spoolToDisk>
+      <outputThreshold>6789</outputThreshold>
+    </params>
+    <metadataWriteFilterFactory class="org.apache.tika.metadata.StandardWriteFilterFactory">
+      <params>
+        <maxEstimatedBytes>241</maxEstimatedBytes>
+        <includeFields>
+          <field>dc:creator</field>
+          <field>dc:title</field>
+        </includeFields>
+      </params>
+    </metadataWriteFilterFactory>
+  </autoDetectParserConfig>
+</properties>
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695.xml
new file mode 100644
index 0000000..86ebe05
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+  <parsers>
+    <parser class="org.apache.tika.parser.DefaultParser"/>
+  </parsers>
+  <autoDetectParserConfig>
+    <params>
+      <spoolToDisk>12345</spoolToDisk>
+      <outputThreshold>6789</outputThreshold>
+    </params>
+    <metadataWriteFilterFactory class="org.apache.tika.metadata.StandardWriteFilterFactory">
+      <params>
+        <maxEstimatedBytes>241</maxEstimatedBytes>
+      </params>
+    </metadataWriteFilterFactory>
+  </autoDetectParserConfig>
+</properties>