You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/03/23 19:02:10 UTC
[tika] branch main updated: TIKA-3695 -- add a minmax limit for the always add and always set fields. Add a max values per field setting.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 9952019 TIKA-3695 -- add a minmax limit for the always add and always set fields. Add a max values per field setting.
9952019 is described below
commit 9952019a5777e4b5db8ac8b6b21dc02df4c4094e
Author: tallison <ta...@apache.org>
AuthorDate: Wed Mar 23 15:01:53 2022 -0400
TIKA-3695 -- add a minmax limit for the always add and always set fields. Add a max values per field setting.
---
.../metadata/writefilter/StandardWriteFilter.java | 83 ++++++++++++++--------
.../writefilter/StandardWriteFilterFactory.java | 32 +++++++--
...ilterTest.java => StandardWriteFilterTest.java} | 74 +++++++++++++++----
.../org/apache/tika/config/TIKA-3695-fields.xml | 1 +
.../resources/org/apache/tika/config/TIKA-3695.xml | 2 +-
5 files changed, 144 insertions(+), 48 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java
index dd32356..f0e9f1f 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java
@@ -34,19 +34,21 @@ import org.apache.tika.utils.StringUtils;
/**
* This is to be used to limit the amount of metadata that a
- * parser can add based on the {@link #maxTotalEstimatedSize}. The
- * maxEstimatedSize is measured in UTF-16 bytes.
+ * parser can add based on the {@link #maxTotalEstimatedSize},
+ * {@link #maxFieldSize}, {@link #maxValuesPerField}, and
+ * {@link #maxKeySize}. This can also be used to limit which
+ * fields are stored in the metadata object at write-time
+ * with {@link #includeFields}.
*
- * The size is estimated as a rough order of magnitude of what is
+ * All sizes are measured in UTF-16 bytes. The size is estimated
+ * as a rough order of magnitude of what is
* required to store the string in memory in Java. We recognize
* that Java uses more bytes to store length, offset etc. for strings. But
- * the extra overhead varies by java version and implementation,
+ * the extra overhead varies by Java version and implementation,
* and we just need a basic estimate. We also recognize actual
* memory usage is affected by interning strings, etc.
- * Please forgive us or consider writing your own write filter. :)
+ * Please forgive us ... or consider writing your own write filter. :)
*
- * This can also be used to limit which fields are stored
- * in the metadata object at write-time with {@link #includeFields}.
*
* <b>NOTE:</b> Fields in {@link #ALWAYS_SET_FIELDS} are
* always set no matter the current state of {@link #maxTotalEstimatedSize}.
@@ -58,6 +60,12 @@ import org.apache.tika.utils.StringUtils;
* Except for {@link TikaCoreProperties#TIKA_CONTENT}, each addition is truncated at
* {@link #maxFieldSize}, and their sizes contribute to the {@link #maxTotalEstimatedSize}.
*
+ * This class uses {@link #minimumMaxFieldSizeInAlwaysFields} to protect the
+ * {@link #ALWAYS_ADD_FIELDS} and {@link #ALWAYS_SET_FIELDS}. If we didn't
+ * have this and a user sets the {@link #maxFieldSize} to, say, 10 bytes,
+ * the internal parser behavior would be broken because parsers rely on
+ * {@link Metadata#CONTENT_TYPE} to determine which parser to call.
+ *
* <b>NOTE:</b> as with {@link Metadata}, this object is not thread safe.
*/
public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
@@ -91,11 +99,19 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
private static final String TIKA_CONTENT_KEY = TikaCoreProperties.TIKA_CONTENT.getName();
private static final String[] TRUE = new String[]{"true"};
+ //allow at least this many bytes in the "always" fields.
+ //As of 2022-03, the longest mime is 146. Doubling that gives
+ //us some leeway. If a mime is truncated, bad things will happen.
+ private final int minimumMaxFieldSizeInAlwaysFields = 300;
+
+
private final boolean includeEmpty;
private final int maxTotalEstimatedSize;
+ private final int maxValuesPerField;
private final int maxFieldSize;
private final int maxKeySize;
+
private final Set<String> includeFields;
private Map<String, Integer> fieldSizes = new HashMap<>();
@@ -112,16 +128,15 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
* @param includeEmpty if <code>true</code>, this will set or add an empty value to the
* metadata object.
*/
- public StandardWriteFilter(int maxKeySize, int maxFieldSize, int maxEstimatedSize,
+ protected StandardWriteFilter(int maxKeySize, int maxFieldSize, int maxEstimatedSize,
+ int maxValuesPerField,
Set<String> includeFields,
boolean includeEmpty) {
this.maxKeySize = maxKeySize;
this.maxFieldSize = maxFieldSize;
- if (maxEstimatedSize < 0) {
- throw new IllegalArgumentException("max estimated size must be > 0");
- }
this.maxTotalEstimatedSize = maxEstimatedSize;
+ this.maxValuesPerField = maxValuesPerField;
this.includeFields = includeFields;
this.includeEmpty = includeEmpty;
}
@@ -170,14 +185,17 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
data.put(field, new String[]{ value });
return;
}
- int addedSize = estimateSize(value);
+ int sizeToAdd = estimateSize(value);
+ //if the maxFieldSize is < minimumMaxFieldSizeInAlwaysFields, use the minmax
+ //we do not want to truncate a mime!
+ int alwaysMaxFieldLength = Math.max(minimumMaxFieldSizeInAlwaysFields, maxFieldSize);
String toSet = value;
- if (addedSize > maxFieldSize) {
- toSet = truncate(value, maxFieldSize, data);
- addedSize = estimateSize(toSet);
+ if (sizeToAdd > alwaysMaxFieldLength) {
+ toSet = truncate(value, alwaysMaxFieldLength, data);
+ sizeToAdd = estimateSize(toSet);
}
int totalAdded = data.containsKey(field) ? 0 : estimateSize(field);
- totalAdded += addedSize;
+ totalAdded += sizeToAdd;
if (data.containsKey(field)) {
String[] vals = data.get(field);
//this should only ever be single valued!!!
@@ -198,24 +216,22 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
setAlwaysInclude(field, value, data);
return;
}
- int addedSize = estimateSize(value);
- String toSet = value;
- if (addedSize > maxFieldSize) {
- toSet = truncate(value, maxFieldSize, data);
- addedSize = estimateSize(toSet);
+ //TODO: should we limit the number of field values?
+
+ int toAddSize = estimateSize(value);
+ //if the maxFieldSize is < minimumMaxFieldSizeInAlwaysFields, use the minmax
+ //we do not want to truncate a mime!
+ int alwaysMaxFieldLength = Math.max(minimumMaxFieldSizeInAlwaysFields, maxFieldSize);
+ String toAddValue = value;
+ if (toAddSize > alwaysMaxFieldLength) {
+ toAddValue = truncate(value, alwaysMaxFieldLength, data);
+ toAddSize = estimateSize(toAddValue);
}
int totalAdded = data.containsKey(field) ? 0 : estimateSize(field);
- totalAdded += addedSize;
- if (data.containsKey(field)) {
- String[] vals = data.get(field);
- //this should only ever be single valued!!!
- if (vals.length > 0) {
- totalAdded -= estimateSize(vals[0]);
- }
- }
+ totalAdded += toAddSize;
estimatedSize += totalAdded;
- data.put(field, appendValue(data.get(field), toSet));
+ data.put(field, appendValue(data.get(field), toAddValue));
}
@@ -256,6 +272,13 @@ public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
return;
}
+ String[] vals = data.get(filterKey.string);
+
+ if (vals != null && vals.length >= maxValuesPerField) {
+ setTruncated(data);
+ return;
+ }
+
Integer fieldSizeInteger = fieldSizes.get(filterKey.string);
int fieldSize = fieldSizeInteger == null ? 0 : fieldSizeInteger;
int maxAllowed = maxAllowedToAdd(filterKey);
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
index b53ba1c..b7d60b5 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
@@ -31,16 +31,31 @@ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory {
public static int DEFAULT_MAX_KEY_SIZE = 1024;
public static int DEFAULT_MAX_FIELD_SIZE = 100 * 1024;
public static int DEFAULT_TOTAL_ESTIMATED_BYTES = 10 * 1024 * 1024;
+ public static int DEFAULT_MAX_VALUES_PER_FIELD = 10;
private Set<String> includeFields = null;
private int maxKeySize = DEFAULT_MAX_KEY_SIZE;
private int maxFieldSize = DEFAULT_MAX_FIELD_SIZE;
private int maxTotalEstimatedBytes = DEFAULT_TOTAL_ESTIMATED_BYTES;
+ private int maxValuesPerField = DEFAULT_MAX_VALUES_PER_FIELD;
private boolean includeEmpty = false;
public MetadataWriteFilter newInstance() {
+
+ if (maxFieldSize < 0) {
+ throw new IllegalArgumentException("maxFieldSize must be > 0");
+ }
+
+ if (maxValuesPerField < 1) {
+ throw new IllegalArgumentException("maxValuesPerField must be > 0");
+ }
+
+ if (maxTotalEstimatedBytes < 0) {
+ throw new IllegalArgumentException("max estimated size must be > 0");
+ }
+
return new StandardWriteFilter(maxKeySize, maxFieldSize,
- maxTotalEstimatedBytes, includeFields, includeEmpty);
+ maxTotalEstimatedBytes, maxValuesPerField, includeFields, includeEmpty);
}
public void setIncludeFields(List<String> includeFields) {
@@ -65,8 +80,8 @@ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory {
this.includeEmpty = includeEmpty;
}
- public int getMaxTotalEstimatedBytes() {
- return maxTotalEstimatedBytes;
+ public void setMaxValuesPerField(int maxValuesPerField) {
+ this.maxValuesPerField = maxValuesPerField;
}
public Set<String> getIncludeFields() {
@@ -81,6 +96,14 @@ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory {
return maxFieldSize;
}
+ public int getMaxTotalEstimatedBytes() {
+ return maxTotalEstimatedBytes;
+ }
+
+ public int getMaxValuesPerField() {
+ return maxValuesPerField;
+ }
+
public boolean isIncludeEmpty() {
return includeEmpty;
}
@@ -89,6 +112,7 @@ public class StandardWriteFilterFactory implements MetadataWriteFilterFactory {
public String toString() {
return "StandardWriteFilterFactory{" + "includeFields=" + includeFields + ", maxKeySize=" +
maxKeySize + ", maxFieldSize=" + maxFieldSize + ", maxTotalEstimatedBytes=" +
- maxTotalEstimatedBytes + ", includeEmpty=" + includeEmpty + '}';
+ maxTotalEstimatedBytes + ", maxValuesPerField=" + maxValuesPerField +
+ ", includeEmpty=" + includeEmpty + '}';
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/metadata/writefilter/MetadataWriteFilterTest.java b/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java
similarity index 77%
rename from tika-core/src/test/java/org/apache/tika/metadata/writefilter/MetadataWriteFilterTest.java
rename to tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java
index 395fe7e..7b7e871 100644
--- a/tika-core/src/test/java/org/apache/tika/metadata/writefilter/MetadataWriteFilterTest.java
+++ b/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java
@@ -32,13 +32,14 @@ import org.apache.tika.config.TikaConfigTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.metadata.writefilter.MetadataWriteFilterFactory;
-import org.apache.tika.metadata.writefilter.StandardWriteFilterFactory;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.AutoDetectParserConfig;
import org.apache.tika.parser.ParseContext;
-public class MetadataWriteFilterTest extends TikaTest {
+public class StandardWriteFilterTest extends TikaTest {
@Test
@@ -47,7 +48,7 @@ public class MetadataWriteFilterTest extends TikaTest {
new TikaConfig(TikaConfigTest.class.getResourceAsStream("TIKA-3695.xml"));
AutoDetectParserConfig config = tikaConfig.getAutoDetectParserConfig();
MetadataWriteFilterFactory factory = config.getMetadataWriteFilterFactory();
- assertEquals(241, ((StandardWriteFilterFactory) factory).getMaxTotalEstimatedBytes());
+ assertEquals(350, ((StandardWriteFilterFactory) factory).getMaxTotalEstimatedBytes());
AutoDetectParser parser = new AutoDetectParser(tikaConfig);
String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" +
"<mock>";
@@ -64,10 +65,10 @@ public class MetadataWriteFilterTest extends TikaTest {
metadata = metadataList.get(0);
String[] creators = metadata.getValues("dc:creator");
- assertEquals(2, creators.length);
- assertEquals("0123", creators[1]);
+ assertEquals(3, creators.length);
+ assertEquals("01", creators[2]);
assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30);
- assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA));
+ assertTruncated(metadata);
}
@Test
@@ -109,12 +110,13 @@ public class MetadataWriteFilterTest extends TikaTest {
assertEquals(3, creators.length);
assertEquals("012345678901234", creators[2]);
assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30);
- assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA));
+ assertTruncated(metadata);
}
@Test
public void testKeySizeFilter() throws Exception {
- Metadata metadata = filter(10, 1000, 10000, null, true);
+ Metadata metadata = filter(10, 1000, 10000, 100,
+ null, true);
//test that must add keys are not truncated
metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser1");
metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser2");
@@ -124,21 +126,25 @@ public class MetadataWriteFilterTest extends TikaTest {
metadata.add(OfficeOpenXMLExtended.DOC_SECURITY_STRING, "some doc-security-string");
//truncated to 10 bytes in UTF-16 = 5 characters
assertEquals("some doc-security-string", metadata.getValues("exten")[0]);
+ assertTruncated(metadata);
metadata.set(OfficeOpenXMLExtended.APP_VERSION, "some other string");
assertEquals("some other string", metadata.getValues("exten")[0]);
+ assertTruncated(metadata);
}
@Test
public void testAfterMaxHit() throws Exception {
String k = "dc:creator";//20 bytes
//key is > maxTotalBytes, so the value isn't even added
- Metadata metadata = filter(100, 10000, 10, null, false);
+ Metadata metadata = filter(100, 10000, 10,
+ 100, null, false);
metadata.set(k, "ab");
assertEquals(1, metadata.names().length);
assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA));
- metadata = filter(100, 10000, 50, null, false);
+ metadata = filter(100, 10000, 50, 100,
+ null, false);
for (int i = 0; i < 10; i++) {
metadata.set(k, "abcde");
}
@@ -166,17 +172,59 @@ public class MetadataWriteFilterTest extends TikaTest {
assertEquals(2, metadata.names().length);
assertEquals(1, metadata.getValues(k).length);
assertEquals("abcdefghijklmno", metadata.getValues(k)[0]);
- assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA));
+ assertTruncated(metadata);
}
+ @Test
+ public void testMinSizeForAlwaysInclude() throws Exception {
+ //test that mimes don't get truncated
+ Metadata metadata = filter(100, 10, 10000, 100, null, true);
+
+ String mime = getLongestMime().toString();
+ metadata.set(Metadata.CONTENT_TYPE, mime);
+ assertEquals(mime, metadata.get(Metadata.CONTENT_TYPE));
+
+ //test that other fields are truncated
+ metadata.set("dc:title", "abcdefghij");
+ assertEquals("abcde", metadata.get("dc:title"));
+ assertTruncated(metadata);
+ }
+
+ @Test
+ public void testMaxFieldValues() throws Exception {
+ Metadata metadata = filter(100, 10000, 10000, 3, null, true);
+ for (int i = 0; i < 10; i++) {
+ metadata.add(TikaCoreProperties.SUBJECT, "ab");
+ }
+ assertEquals(3, metadata.getValues(TikaCoreProperties.SUBJECT).length);
+ }
+
+ private void assertTruncated(Metadata metadata) {
+ assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA));
+ }
private Metadata filter(int maxKeySize, int maxFieldSize, int maxTotalBytes,
+ int maxValuesPerField,
Set<String> includeFields, boolean includeEmpty) {
MetadataWriteFilter filter = new StandardWriteFilter(maxKeySize, maxFieldSize,
- maxTotalBytes, includeFields, includeEmpty);
+ maxTotalBytes, maxValuesPerField, includeFields, includeEmpty);
Metadata metadata = new Metadata();
metadata.setMetadataWriteFilter(filter);
return metadata;
}
+ public MediaType getLongestMime() throws Exception {
+ MimeTypes types = TikaConfig.getDefaultConfig().getMimeRepository();
+ MediaTypeRegistry registry = types.getMediaTypeRegistry();
+ int maxLength = -1;
+ MediaType longest = null;
+ for (MediaType mt : registry.getTypes()) {
+ int len = mt.toString().length() * 2;
+ if (len > maxLength) {
+ maxLength = len;
+ longest = mt;
+ }
+ }
+ return longest;
+ }
}
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-fields.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-fields.xml
index 2c73466..7842e3b 100644
--- a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-fields.xml
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-fields.xml
@@ -29,6 +29,7 @@
<maxKeySize>999</maxKeySize>
<maxFieldSize>10001</maxFieldSize>
<maxTotalEstimatedBytes>241</maxTotalEstimatedBytes>
+ <maxValuesPerField>100</maxValuesPerField>
<includeFields>
<field>dc:creator</field>
<field>dc:title</field>
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695.xml
index 5d435d9..506913f 100644
--- a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695.xml
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695.xml
@@ -26,7 +26,7 @@
</params>
<metadataWriteFilterFactory class="org.apache.tika.metadata.writefilter.StandardWriteFilterFactory">
<params>
- <maxTotalEstimatedBytes>241</maxTotalEstimatedBytes>
+ <maxTotalEstimatedBytes>350</maxTotalEstimatedBytes>
</params>
</metadataWriteFilterFactory>
</autoDetectParserConfig>