You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/03/23 19:02:10 UTC

[tika] branch main updated: TIKA-3695 -- add a minmax limit for the always add and always set fields. Add a max values per field setting.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 9952019  TIKA-3695 -- add a minmax limit for the always add and always set fields.  Add a max values per field setting.
9952019 is described below

commit 9952019a5777e4b5db8ac8b6b21dc02df4c4094e
Author: tallison <ta...@apache.org>
AuthorDate: Wed Mar 23 15:01:53 2022 -0400

    TIKA-3695 -- add a minmax limit for the always add and always set fields.  Add a max values per field setting.
---
 .../metadata/writefilter/StandardWriteFilter.java  | 83 ++++++++++++++--------
 .../writefilter/StandardWriteFilterFactory.java    | 32 +++++++--
 ...ilterTest.java => StandardWriteFilterTest.java} | 74 +++++++++++++++----
 .../org/apache/tika/config/TIKA-3695-fields.xml    |  1 +
 .../resources/org/apache/tika/config/TIKA-3695.xml |  2 +-
 5 files changed, 144 insertions(+), 48 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java
index dd32356..f0e9f1f 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java
@@ -34,19 +34,21 @@ import org.apache.tika.utils.StringUtils;
 
 /**
  * This is to be used to limit the amount of metadata that a
- * parser can add based on the {@link #maxTotalEstimatedSize}. The
- * maxEstimatedSize is measured in UTF-16 bytes.
+ * parser can add based on the {@link #maxTotalEstimatedSize},
+ * {@link #maxFieldSize}, {@link #maxValuesPerField}, and
+ * {@link #maxKeySize}.  This can also be used to limit which
+ * fields are stored in the metadata object at write-time
+ * with {@link #includeFields}.
  *
- * The size is estimated as a rough order of magnitude of what is
+ * All sizes are measured in UTF-16 bytes. The size is estimated
+ * as a rough order of magnitude of what is
  * required to store the string in memory in Java.  We recognize
  * that Java uses more bytes to store length, offset etc. for strings. But
- * the extra overhead varies by java version and implementation,
+ * the extra overhead varies by Java version and implementation,
  * and we just need a basic estimate.  We also recognize actual
  * memory usage is affected by interning strings, etc.
- * Please forgive us or consider writing your own write filter. :)
+ * Please forgive us ... or consider writing your own write filter. :)
  *
- * This can also be used to limit which fields are stored
- * in the metadata object at write-time with {@link #includeFields}.
  *
  * <b>NOTE:</b> Fields in {@link #ALWAYS_SET_FIELDS} are
  * always set no matter the current state of {@link #maxTotalEstimatedSize}.
@@ -58,6 +60,12 @@ import org.apache.tika.utils.StringUtils;
  * Except for {@link TikaCoreProperties#TIKA_CONTENT}, each addition is truncated at
  * {@link #maxFieldSize}, and their sizes contribute to the {@link #maxTotalEstimatedSize}.
  *
+ * This class uses {@link #minimumMaxFieldSizeInAlwaysFields} to protect the
+ * {@link #ALWAYS_ADD_FIELDS} and {@link #ALWAYS_SET_FIELDS}. If we didn't
+ * have this and a user sets the {@link #maxFieldSize} to, say, 10 bytes,
+ * the internal parser behavior would be broken because parsers rely on
+ * {@link Metadata#CONTENT_TYPE} to determine which parser to call.
+ *
  * <b>NOTE:</b> as with {@link Metadata}, this object is not thread safe.
  */
 public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
@@ -91,11 +99,19 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
     private static final String TIKA_CONTENT_KEY = TikaCoreProperties.TIKA_CONTENT.getName();
     private static final String[] TRUE = new String[]{"true"};
 
+    //allow at least this many bytes in the "always" fields.
+    //As of 2022-03, the longest mime is 146.  Doubling that gives
+    //us some leeway.  If a mime is truncated, bad things will happen.
+    private final int minimumMaxFieldSizeInAlwaysFields = 300;
+
+
     private final boolean includeEmpty;
     private final int maxTotalEstimatedSize;
+    private final int maxValuesPerField;
     private final int maxFieldSize;
     private final int maxKeySize;
 
+
     private final Set<String> includeFields;
 
     private Map<String, Integer> fieldSizes = new HashMap<>();
@@ -112,16 +128,15 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
      * @param includeEmpty if <code>true</code>, this will set or add an empty value to the
      *                     metadata object.
      */
-    public StandardWriteFilter(int maxKeySize, int maxFieldSize, int maxEstimatedSize,
+    protected StandardWriteFilter(int maxKeySize, int maxFieldSize, int maxEstimatedSize,
+                               int maxValuesPerField,
                                Set<String> includeFields,
                                boolean includeEmpty) {
 
         this.maxKeySize = maxKeySize;
         this.maxFieldSize = maxFieldSize;
-        if (maxEstimatedSize < 0) {
-            throw new IllegalArgumentException("max estimated size must be > 0");
-        }
         this.maxTotalEstimatedSize = maxEstimatedSize;
+        this.maxValuesPerField = maxValuesPerField;
         this.includeFields = includeFields;
         this.includeEmpty = includeEmpty;
     }
@@ -170,14 +185,17 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
             data.put(field, new String[]{ value });
             return;
         }
-        int addedSize = estimateSize(value);
+        int sizeToAdd = estimateSize(value);
+        //if the maxFieldSize is < minimumMaxFieldSizeInAlwaysFields, use the minmax
+        //we do not want to truncate a mime!
+        int alwaysMaxFieldLength = Math.max(minimumMaxFieldSizeInAlwaysFields, maxFieldSize);
         String toSet = value;
-        if (addedSize > maxFieldSize) {
-            toSet = truncate(value, maxFieldSize, data);
-            addedSize = estimateSize(toSet);
+        if (sizeToAdd > alwaysMaxFieldLength) {
+            toSet = truncate(value, alwaysMaxFieldLength, data);
+            sizeToAdd = estimateSize(toSet);
         }
         int totalAdded = data.containsKey(field) ? 0 : estimateSize(field);
-        totalAdded += addedSize;
+        totalAdded += sizeToAdd;
         if (data.containsKey(field)) {
             String[] vals = data.get(field);
             //this should only ever be single valued!!!
@@ -198,24 +216,22 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
             setAlwaysInclude(field, value, data);
             return;
         }
-        int addedSize = estimateSize(value);
-        String toSet = value;
-        if (addedSize > maxFieldSize) {
-            toSet = truncate(value, maxFieldSize, data);
-            addedSize = estimateSize(toSet);
+        //TODO: should we limit the number of field values?
+
+        int toAddSize = estimateSize(value);
+        //if the maxFieldSize is < minimumMaxFieldSizeInAlwaysFields, use the minmax
+        //we do not want to truncate a mime!
+        int alwaysMaxFieldLength = Math.max(minimumMaxFieldSizeInAlwaysFields, maxFieldSize);
+        String toAddValue = value;
+        if (toAddSize > alwaysMaxFieldLength) {
+            toAddValue = truncate(value, alwaysMaxFieldLength, data);
+            toAddSize = estimateSize(toAddValue);
         }
         int totalAdded = data.containsKey(field) ? 0 : estimateSize(field);
-        totalAdded += addedSize;
-        if (data.containsKey(field)) {
-            String[] vals = data.get(field);
-            //this should only ever be single valued!!!
-            if (vals.length > 0) {
-                totalAdded -= estimateSize(vals[0]);
-            }
-        }
+        totalAdded += toAddSize;
         estimatedSize += totalAdded;
 
-        data.put(field, appendValue(data.get(field), toSet));
+        data.put(field, appendValue(data.get(field), toAddValue));
     }
 
 
@@ -256,6 +272,13 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
             return;
         }
 
+        String[] vals = data.get(filterKey.string);
+
+        if (vals != null && vals.length >= maxValuesPerField) {
+            setTruncated(data);
+            return;
+        }
+
         Integer fieldSizeInteger = fieldSizes.get(filterKey.string);
         int fieldSize = fieldSizeInteger == null ? 0 : fieldSizeInteger;
         int maxAllowed = maxAllowedToAdd(filterKey);
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
index b53ba1c..b7d60b5 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
@@ -31,16 +31,31 @@ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory {
     public static int DEFAULT_MAX_KEY_SIZE = 1024;
     public static int DEFAULT_MAX_FIELD_SIZE = 100 * 1024;
     public static int DEFAULT_TOTAL_ESTIMATED_BYTES = 10 * 1024 * 1024;
+    public static int DEFAULT_MAX_VALUES_PER_FIELD = 10;
 
     private Set<String> includeFields = null;
     private int maxKeySize = DEFAULT_MAX_KEY_SIZE;
     private int maxFieldSize = DEFAULT_MAX_FIELD_SIZE;
     private int maxTotalEstimatedBytes = DEFAULT_TOTAL_ESTIMATED_BYTES;
+    private int maxValuesPerField = DEFAULT_MAX_VALUES_PER_FIELD;
     private boolean includeEmpty = false;
 
     public MetadataWriteFilter newInstance() {
+
+        if (maxFieldSize < 0) {
+            throw new IllegalArgumentException("maxFieldSize must be > 0");
+        }
+
+        if (maxValuesPerField < 1) {
+            throw new IllegalArgumentException("maxValuesPerField must be > 0");
+        }
+
+        if (maxTotalEstimatedBytes < 0) {
+            throw new IllegalArgumentException("max estimated size must be > 0");
+        }
+
         return new StandardWriteFilter(maxKeySize, maxFieldSize,
-                maxTotalEstimatedBytes, includeFields, includeEmpty);
+                maxTotalEstimatedBytes, maxValuesPerField, includeFields, includeEmpty);
     }
 
     public void setIncludeFields(List<String> includeFields) {
@@ -65,8 +80,8 @@ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory {
         this.includeEmpty = includeEmpty;
     }
 
-    public int getMaxTotalEstimatedBytes() {
-        return maxTotalEstimatedBytes;
+    public void setMaxValuesPerField(int maxValuesPerField) {
+        this.maxValuesPerField = maxValuesPerField;
     }
 
     public Set<String> getIncludeFields() {
@@ -81,6 +96,14 @@ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory {
         return maxFieldSize;
     }
 
+    public int getMaxTotalEstimatedBytes() {
+        return maxTotalEstimatedBytes;
+    }
+
+    public int getMaxValuesPerField() {
+        return maxValuesPerField;
+    }
+
     public boolean isIncludeEmpty() {
         return includeEmpty;
     }
@@ -89,6 +112,7 @@ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory {
     public String toString() {
         return "StandardWriteFilterFactory{" + "includeFields=" + includeFields + ", maxKeySize=" +
                 maxKeySize + ", maxFieldSize=" + maxFieldSize + ", maxTotalEstimatedBytes=" +
-                maxTotalEstimatedBytes + ", includeEmpty=" + includeEmpty + '}';
+                maxTotalEstimatedBytes + ", maxValuesPerField=" + maxValuesPerField +
+                ", includeEmpty=" + includeEmpty + '}';
     }
 }
diff --git a/tika-core/src/test/java/org/apache/tika/metadata/writefilter/MetadataWriteFilterTest.java b/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java
similarity index 77%
rename from tika-core/src/test/java/org/apache/tika/metadata/writefilter/MetadataWriteFilterTest.java
rename to tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java
index 395fe7e..7b7e871 100644
--- a/tika-core/src/test/java/org/apache/tika/metadata/writefilter/MetadataWriteFilterTest.java
+++ b/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java
@@ -32,13 +32,14 @@ import org.apache.tika.config.TikaConfigTest;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.writefilter.MetadataWriteFilterFactory;
-import org.apache.tika.metadata.writefilter.StandardWriteFilterFactory;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.mime.MimeTypes;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.AutoDetectParserConfig;
 import org.apache.tika.parser.ParseContext;
 
-public class MetadataWriteFilterTest extends TikaTest {
+public class StandardWriteFilterTest extends TikaTest {
 
 
     @Test
@@ -47,7 +48,7 @@ public class MetadataWriteFilterTest extends TikaTest {
                 new TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3695.xml"));
         AutoDetectParserConfig config = tikaConfig.getAutoDetectParserConfig();
         MetadataWriteFilterFactory factory = config.getMetadataWriteFilterFactory();
-        assertEquals(241, ((StandardWriteFilterFactory) factory).getMaxTotalEstimatedBytes());
+        assertEquals(350, ((StandardWriteFilterFactory) factory).getMaxTotalEstimatedBytes());
         AutoDetectParser parser = new AutoDetectParser(tikaConfig);
         String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" +
                 "<mock>";
@@ -64,10 +65,10 @@ public class MetadataWriteFilterTest extends TikaTest {
         metadata = metadataList.get(0);
 
         String[] creators = metadata.getValues("dc:creator");
-        assertEquals(2, creators.length);
-        assertEquals("0123", creators[1]);
+        assertEquals(3, creators.length);
+        assertEquals("01", creators[2]);
         assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30);
-        assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA));
+        assertTruncated(metadata);
     }
 
     @Test
@@ -109,12 +110,13 @@ public class MetadataWriteFilterTest extends TikaTest {
         assertEquals(3, creators.length);
         assertEquals("012345678901234", creators[2]);
         assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30);
-        assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA));
+        assertTruncated(metadata);
     }
 
     @Test
     public void testKeySizeFilter() throws Exception {
-        Metadata metadata = filter(10, 1000, 10000, null, true);
+        Metadata metadata = filter(10, 1000, 10000, 100,
+                null, true);
         //test that must add keys are not truncated
         metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser1");
         metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser2");
@@ -124,21 +126,25 @@ public class MetadataWriteFilterTest extends TikaTest {
         metadata.add(OfficeOpenXMLExtended.DOC_SECURITY_STRING, "some doc-security-string");
         //truncated to 10 bytes in UTF-16 = 5 characters
         assertEquals("some doc-security-string", metadata.getValues("exten")[0]);
+        assertTruncated(metadata);
 
         metadata.set(OfficeOpenXMLExtended.APP_VERSION, "some other string");
         assertEquals("some other string", metadata.getValues("exten")[0]);
+        assertTruncated(metadata);
     }
 
     @Test
     public void testAfterMaxHit() throws Exception {
         String k = "dc:creator";//20 bytes
         //key is > maxTotalBytes, so the value isn't even added
-        Metadata metadata = filter(100, 10000, 10, null, false);
+        Metadata metadata = filter(100, 10000, 10,
+                100, null, false);
         metadata.set(k, "ab");
         assertEquals(1, metadata.names().length);
         assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA));
 
-        metadata = filter(100, 10000, 50, null, false);
+        metadata = filter(100, 10000, 50, 100,
+                null, false);
         for (int i = 0; i < 10; i++) {
             metadata.set(k, "abcde");
         }
@@ -166,17 +172,59 @@ public class MetadataWriteFilterTest extends TikaTest {
         assertEquals(2, metadata.names().length);
         assertEquals(1, metadata.getValues(k).length);
         assertEquals("abcdefghijklmno", metadata.getValues(k)[0]);
-        assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA));
+        assertTruncated(metadata);
     }
 
+    @Test
+    public void testMinSizeForAlwaysInclude() throws Exception {
+        //test that mimes don't get truncated
+        Metadata metadata = filter(100, 10, 10000, 100, null, true);
+
+        String mime = getLongestMime().toString();
+        metadata.set(Metadata.CONTENT_TYPE, mime);
+        assertEquals(mime, metadata.get(Metadata.CONTENT_TYPE));
+
+        //test that other fields are truncated
+        metadata.set("dc:title", "abcdefghij");
+        assertEquals("abcde", metadata.get("dc:title"));
+        assertTruncated(metadata);
+    }
+
+    @Test
+    public void testMaxFieldValues() throws Exception {
+        Metadata metadata = filter(100, 10000, 10000, 3, null, true);
+        for (int i = 0; i < 10; i++) {
+            metadata.add(TikaCoreProperties.SUBJECT, "ab");
+        }
+        assertEquals(3, metadata.getValues(TikaCoreProperties.SUBJECT).length);
+    }
+
+    private void assertTruncated(Metadata metadata) {
+        assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA));
+    }
     private Metadata filter(int maxKeySize, int maxFieldSize, int maxTotalBytes,
+                            int maxValuesPerField,
                             Set<String> includeFields, boolean includeEmpty) {
         MetadataWriteFilter filter = new StandardWriteFilter(maxKeySize, maxFieldSize,
-                maxTotalBytes, includeFields, includeEmpty);
+                maxTotalBytes, maxValuesPerField, includeFields, includeEmpty);
         Metadata metadata = new Metadata();
         metadata.setMetadataWriteFilter(filter);
         return metadata;
     }
 
+    public MediaType getLongestMime() throws Exception {
+        MimeTypes types = TikaConfig.getDefaultConfig().getMimeRepository();
+        MediaTypeRegistry registry = types.getMediaTypeRegistry();
+        int maxLength = -1;
+        MediaType longest = null;
+        for (MediaType mt : registry.getTypes()) {
+            int len = mt.toString().length() * 2;
+            if (len > maxLength) {
+                maxLength = len;
+                longest = mt;
+            }
+        }
+        return longest;
+    }
 
 }
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-fields.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-fields.xml
index 2c73466..7842e3b 100644
--- a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-fields.xml
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-fields.xml
@@ -29,6 +29,7 @@
         <maxKeySize>999</maxKeySize>
         <maxFieldSize>10001</maxFieldSize>
         <maxTotalEstimatedBytes>241</maxTotalEstimatedBytes>
+        <maxValuesPerField>100</maxValuesPerField>
         <includeFields>
           <field>dc:creator</field>
           <field>dc:title</field>
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695.xml
index 5d435d9..506913f 100644
--- a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695.xml
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695.xml
@@ -26,7 +26,7 @@
     </params>
     <metadataWriteFilterFactory class="org.apache.tika.metadata.writefilter.StandardWriteFilterFactory">
       <params>
-        <maxTotalEstimatedBytes>241</maxTotalEstimatedBytes>
+        <maxTotalEstimatedBytes>350</maxTotalEstimatedBytes>
       </params>
     </metadataWriteFilterFactory>
   </autoDetectParserConfig>