You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/03/17 21:38:05 UTC
[tika] branch main updated: TIKA-3695 (#534)
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 000abdc TIKA-3695 (#534)
000abdc is described below
commit 000abdcf70112df1a2a9a433e308c1fe5db1d45e
Author: Tim Allison <ta...@apache.org>
AuthorDate: Thu Mar 17 17:37:57 2022 -0400
TIKA-3695 (#534)
* TIKA-3695 -- initial commit. WIP do not merge.
* TIKA-3695 -- add fields example and test
* TIKA-3695 -- implement filterExisting in StandardWriteFilter. Create standalone unit test.
---
CHANGES.txt | 3 +
.../java/org/apache/tika/metadata/Metadata.java | 94 +++++++++-
.../apache/tika/metadata/MetadataWriteFilter.java | 42 +++++
.../tika/metadata/MetadataWriteFilterFactory.java | 21 +++
.../apache/tika/metadata/StandardWriteFilter.java | 199 +++++++++++++++++++++
.../tika/metadata/StandardWriteFilterFactory.java | 58 ++++++
.../apache/tika/metadata/TikaCoreProperties.java | 4 +
.../org/apache/tika/parser/AutoDetectParser.java | 3 +
.../apache/tika/parser/AutoDetectParserConfig.java | 11 ++
.../org/apache/tika/config/TikaConfigTest.java | 1 -
.../tika/metadata/MetadataWriteFilterTest.java | 106 +++++++++++
.../org/apache/tika/config/TIKA-3695-fields.xml | 37 ++++
.../resources/org/apache/tika/config/TIKA-3695.xml | 33 ++++
13 files changed, 607 insertions(+), 5 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index a78be3e..eea54b3 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -6,6 +6,9 @@ Release 2.4.0 - ???
https://github.com/apache/tika/blob/main/tika-parsers/tika-parsers-ml/tika-dl/pom.xml
for the dependencies that must be provided at run-time (TIKA-3676).
+ * Add MetadataWriteFilter capability to improve memory profile in
+ Metadata objects (TIKA-3695).
+
* Add detection for Frictionless Data packages and WACZ (TIKA-3696).
* Upgrade deeplearning4j to 1.0.0-M2 (TIKA-3458 and PR#527).
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
index bfed8ef..da71504 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
@@ -44,6 +44,8 @@ public class Metadata
implements CreativeCommons, Geographic, HttpHeaders, Message, ClimateForcast, TIFF,
TikaMimeKeys, Serializable {
+ private static final MetadataWriteFilter ACCEPT_ALL = new AcceptAll();
+
/**
* Serial version UID
*/
@@ -58,6 +60,8 @@ public class Metadata
*/
private Map<String, String[]> metadata = null;
+ //TODO: transient?
+ private MetadataWriteFilter writeFilter = ACCEPT_ALL;
/**
* Constructs a new, empty metadata.
*/
@@ -132,6 +136,23 @@ public class Metadata
}
/**
+     * Installs the write filter that is consulted by {@link #set(String, String)},
+     * {@link #set(String, String[])}, {@link #add(String, String)} and
+     * {@link #add(String, String[])}. Until this is called, a filter that
+     * accepts every value unchanged is used.
+     * <p>
+     * Values that are already stored in this object are handed to the new
+     * filter's {@link MetadataWriteFilter#filterExisting(Map)} right away.
+     * <p>
+     * This is intended for expert use only. Some parsers rely on metadata
+     * during the parse, and if the metadata they need is excluded, they
+     * will not function properly.
+     *
+     * @param writeFilter the filter to install; must not be <code>null</code>
+     * @throws NullPointerException if <code>writeFilter</code> is <code>null</code>;
+     *                              the previously installed filter stays in place
+     * @since 2.4.0
+     */
+    public void setMetadataWriteFilter(MetadataWriteFilter writeFilter) {
+        if (writeFilter == null) {
+            //reject before touching any state: assigning null first would
+            //make every later set/add on this object fail
+            throw new NullPointerException("writeFilter must not be null");
+        }
+        this.writeFilter = writeFilter;
+        this.writeFilter.filterExisting(metadata);
+    }
+
+ /**
* Returns the value (if any) of the identified metadata property.
*
* @param property property definition
@@ -236,11 +257,35 @@ public class Metadata
* @param value the metadata value.
*/
public void add(final String name, final String value) {
+ if (!writeFilter.include(name, value)) {
+ return;
+ }
String[] values = metadata.get(name);
if (values == null) {
set(name, value);
} else {
- metadata.put(name, appendedValues(values, value));
+ String filtered = writeFilter.filter(name, value, metadata);
+ if (filtered != null) {
+ metadata.put(name, appendedValues(values, filtered));
+ }
+ }
+ }
+
+ /**
+ * Add a metadata name/value mapping. Add the specified value to the list of
+ * values associated to the specified metadata name.
+ *
+ * @param name the metadata name.
+ * @param newValues the metadata values
+ */
+ protected void add(final String name, final String[] newValues) {
+ String[] values = metadata.get(name);
+ if (values == null) {
+ set(name, newValues);
+ } else {
+ for (String val : newValues) {
+ add(name, val);
+ }
}
}
@@ -270,7 +315,7 @@ public class Metadata
set(property, value);
} else {
if (property.isMultiValuePermitted()) {
- set(property, appendedValues(values, value));
+ add(property.getName(), value);
} else {
throw new PropertyTypeException(
property.getName() + " : " + property.getPropertyType());
@@ -303,8 +348,28 @@ public class Metadata
* @param value the metadata value, or <code>null</code>
*/
public void set(String name, String value) {
+ if (! writeFilter.include(name, value)) {
+ return;
+ }
if (value != null) {
- metadata.put(name, new String[]{value});
+ metadata.remove(name);
+ String filtered = writeFilter.filter(name, value, metadata);
+ if (filtered != null) {
+ metadata.put(name, new String[]{filtered});
+ }
+ } else {
+ metadata.remove(name);
+ }
+ }
+
+ protected void set(String name, String[] values) {
+ //TODO: optimize this to not copy if all
+ //values are to be included "as is"
+ if (values != null) {
+ metadata.remove(name);
+ for (String v : values) {
+ add(name, v);
+ }
} else {
metadata.remove(name);
}
@@ -352,7 +417,7 @@ public class Metadata
}
}
} else {
- metadata.put(property.getName(), values);
+ set(property.getName(), values);
}
}
@@ -599,4 +664,25 @@ public class Metadata
return buf.toString();
}
+    /**
+     * Pass-through write filter: keeps existing data as it is, includes
+     * every field/value pair and hands each value back unmodified.
+     */
+    private static class AcceptAll implements MetadataWriteFilter, Serializable {
+
+        @Override
+        public void filterExisting(Map<String, String[]> data) {
+            //nothing to do -- everything that is already stored stays
+        }
+
+        @Override
+        public boolean include(String field, String value) {
+            return true;
+        }
+
+        @Override
+        public String filter(String field, String value, Map<String, String[]> data) {
+            return value;
+        }
+    }
+
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MetadataWriteFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/MetadataWriteFilter.java
new file mode 100644
index 0000000..7918a63
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/MetadataWriteFilter.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+import java.util.Map;
+
+/**
+ * Hook that {@link Metadata} consults when values are written, so that
+ * a value can be skipped or rewritten before it is stored.
+ */
+public interface MetadataWriteFilter {
+
+ /**
+ * Applies this filter to values that were stored before the filter was
+ * installed. {@link Metadata#setMetadataWriteFilter(MetadataWriteFilter)}
+ * calls this with the map that backs the metadata object.
+ *
+ * @param data metadata names mapped to their values; changes are made
+ * directly on this map
+ */
+ void filterExisting(Map<String, String[]> data);
+
+ /**
+ * Decides whether a field/value pair should be written at all.
+ * {@link Metadata#set(String, String)} and {@link Metadata#add(String, String)}
+ * call this first and return without changing anything when the
+ * answer is <code>false</code>.
+ *
+ * @param field the metadata name
+ * @param value the value about to be written
+ * @return <code>true</code> if the write should proceed
+ */
+ boolean include(String field, String value);
+
+ /**
+ * Based on the field and value, this filter modifies the value
+ * to something that should be set or added to the Metadata object.
+ *
+ * If the returned value is <code>null</code>, no value is set or added.
+ *
+ * Status updates (e.g. write limit reached) can be added directly to the
+ * underlying metadata.
+ *
+ * @param field the metadata name
+ * @param value the value about to be written
+ * @param data the map that backs the metadata object
+ * @return the value to store, or <code>null</code> to store nothing
+ */
+ String filter(String field, String value, Map<String, String[]> data);
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MetadataWriteFilterFactory.java b/tika-core/src/main/java/org/apache/tika/metadata/MetadataWriteFilterFactory.java
new file mode 100644
index 0000000..9c9c0c1
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/MetadataWriteFilterFactory.java
@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+/**
+ * Supplies {@link MetadataWriteFilter} instances. When one is configured on
+ * AutoDetectParserConfig, AutoDetectParser asks it for a filter at the start
+ * of a parse and installs that filter on the Metadata object being parsed into.
+ */
+public interface MetadataWriteFilterFactory {
+ /**
+ * @return the filter to install on a metadata object
+ */
+ MetadataWriteFilter newInstance();
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilter.java
new file mode 100644
index 0000000..4a26ebb
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilter.java
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+import java.io.Serializable;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.tika.utils.StringUtils;
+
+/**
+ * This is to be used to limit the amount of metadata that a
+ * parser can add based on the {@link #maxEstimatedSize}. The
+ * maxEstimatedSize is measured in UTF-8 bytes.
+ *
+ * This can also be used to limit the fields that are stored
+ * in the metadata object at write-time with {@link #includeFields}.
+ *
+ * <b>NOTE:</b> Fields in {@link #ALWAYS_INCLUDE_FIELDS} are never
+ * always included, and their sizes are not included in the
+ * calculation of metadata size.
+ *
+ * <b>NOTE:</b> after the maxEstimatedSize has been hit, no
+ * further modifications to the metadata object will be allowed aside
+ * from adding/setting fields in the {@link #ALWAYS_INCLUDE_FIELDS}.
+ *
+ * <b>NOTE:</b> as with {@link Metadata}, this object is not thread safe.
+ */
+public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
+
+ public static final Set<String> ALWAYS_INCLUDE_FIELDS = new HashSet<>();
+
+ static {
+ ALWAYS_INCLUDE_FIELDS.add(Metadata.CONTENT_LENGTH);
+ ALWAYS_INCLUDE_FIELDS.add(Metadata.CONTENT_TYPE);
+ ALWAYS_INCLUDE_FIELDS.add(Metadata.CONTENT_ENCODING);
+ ALWAYS_INCLUDE_FIELDS.add(TikaCoreProperties.CONTENT_TYPE_USER_OVERRIDE.getName());
+ ALWAYS_INCLUDE_FIELDS.add(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE.getName());
+ ALWAYS_INCLUDE_FIELDS.add(TikaCoreProperties.CONTENT_TYPE_HINT.getName());
+ ALWAYS_INCLUDE_FIELDS.add(TikaCoreProperties.TIKA_CONTENT.getName());
+ ALWAYS_INCLUDE_FIELDS.add(TikaCoreProperties.RESOURCE_NAME_KEY);
+ ALWAYS_INCLUDE_FIELDS.add(AccessPermissions.EXTRACT_CONTENT.getName());
+ ALWAYS_INCLUDE_FIELDS.add(AccessPermissions.EXTRACT_FOR_ACCESSIBILITY.getName());
+ ALWAYS_INCLUDE_FIELDS.add(Metadata.CONTENT_DISPOSITION);
+ //Metadata.CONTENT_LOCATION? used by the html parser
+ }
+
+ private final boolean includeEmpty;
+ private final int maxEstimatedSize;
+
+ private final Set<String> includeFields;
+
+ //tracks the estimated size in utf8 bytes. Can be > maxEstimated size
+ int estimatedSize = 0;
+
+ /**
+ *
+ * @param maxEstimatedSize
+ * @param includeFields if null or empty, all fields are included; otherwise, which fields
+ * to add to the metadata object.
+ * @param includeEmpty if <code>true</code>, this will set or add an empty value to the
+ * metadata object.
+ */
+ public StandardWriteFilter(int maxEstimatedSize, Set<String> includeFields,
+ boolean includeEmpty) {
+ if (maxEstimatedSize < 0) {
+ throw new IllegalArgumentException("max estimated size must be > 0");
+ }
+ this.maxEstimatedSize = maxEstimatedSize;
+ this.includeFields = includeFields;
+ this.includeEmpty = includeEmpty;
+ }
+
+ @Override
+ public void filterExisting(Map<String, String[]> data) {
+ //this is somewhat costly, but it ensures that
+ //metadata that was placed in the metadata object before this
+ //filter was applied is removed.
+ //It should only be called once, and probably not on that
+ //many fields.
+ Set<String> toRemove = new HashSet<>();
+ for (String n : data.keySet()) {
+ if (! includeField(n)) {
+ toRemove.add(n);
+ }
+ }
+
+ for (String n : toRemove) {
+ data.remove(n);
+ }
+
+ for (String n : data.keySet()) {
+ String[] vals = data.get(n);
+ List<String> filteredVals = new ArrayList<>();
+ for (int i = 0; i < vals.length; i++) {
+ String v = vals[i];
+ if (include(n, v)) {
+ String filtered = filter(n, v, data);
+ if (filtered != null) {
+ filteredVals.add(filtered);
+ }
+ }
+ }
+ data.put(n, filteredVals.toArray(new String[0]));
+ }
+ }
+
+ @Override
+ public boolean include(String field, String value) {
+ return includeField(field) && includeValue(value);
+ }
+
+ @Override
+ public String filter(String field, String value, Map<String, String[]> data) {
+ if (ALWAYS_INCLUDE_FIELDS.contains(field)) {
+ return value;
+ }
+ if (estimatedSize > maxEstimatedSize) {
+ return null;
+ }
+ long length = value.getBytes(StandardCharsets.UTF_8).length;
+ String toWrite = value;
+ if (estimatedSize + length > maxEstimatedSize) {
+ toWrite = truncate(value);
+ data.put(TikaCoreProperties.METADATA_LIMIT_REACHED.getName(), new String[]{"true"});
+ }
+ //this will by default bump the estimated size over what was actually written
+ //we are currently only using this as an indicator of whether to even try to write more.
+ //this value is not necessarily accurate.
+ estimatedSize += length;
+ return toWrite;
+ }
+
+ private String truncate(String value) {
+ //correctly handle multibyte characters
+ int available = maxEstimatedSize - estimatedSize;
+ byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
+ ByteBuffer bb = ByteBuffer.wrap(bytes, 0, available);
+ CharBuffer cb = CharBuffer.allocate(available);
+ CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder();
+ // Ignore last (potentially) incomplete character
+ decoder.onMalformedInput(CodingErrorAction.IGNORE);
+ decoder.decode(bb, cb, true);
+ decoder.flush(cb);
+ return new String(cb.array(), 0, cb.position());
+ }
+
+
+ /**
+ * Tests for null or empty. Does not check for length
+ * @param value
+ * @return
+ */
+ private boolean includeValue(String value) {
+ if (includeEmpty) {
+ return true;
+ }
+ if (StringUtils.isBlank(value)) {
+ return false;
+ }
+ return true;
+ }
+
+ private boolean includeField(String name) {
+ if (ALWAYS_INCLUDE_FIELDS.contains(name)) {
+ return true;
+ }
+ if (includeFields == null) {
+ return true;
+ }
+ if (includeFields.contains(name)) {
+ return true;
+ }
+ return false;
+ }
+
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilterFactory.java b/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilterFactory.java
new file mode 100644
index 0000000..3c9e6a1
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/metadata/StandardWriteFilterFactory.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+
+/**
+ * Configurable factory for {@link StandardWriteFilter}. Each call to
+ * {@link #newInstance()} creates a new filter from the current settings.
+ */
+public class StandardWriteFilterFactory implements MetadataWriteFilterFactory {
+
+    //NOTE(review): not final; left as is so that existing code assigning it keeps compiling
+    public static int DEFAULT_MAX_SIZE = 10 * 1024 * 1024;
+    private Set<String> includeFields = null;
+    private int maxEstimatedBytes = DEFAULT_MAX_SIZE;
+    private boolean includeEmpty = false;
+
+    /**
+     * @return a new {@link StandardWriteFilter} built from the current settings
+     */
+    @Override
+    public MetadataWriteFilter newInstance() {
+        return new StandardWriteFilter(maxEstimatedBytes, includeFields, includeEmpty);
+    }
+
+    /**
+     * @param includeFields names of the fields to keep; <code>null</code> removes
+     *                      the restriction so that all fields are kept
+     */
+    public void setIncludeFields(List<String> includeFields) {
+        if (includeFields == null) {
+            //null means "no restriction", same as the initial state
+            this.includeFields = null;
+            return;
+        }
+        Set<String> keys = ConcurrentHashMap.newKeySet(includeFields.size());
+        keys.addAll(includeFields);
+        this.includeFields = Collections.unmodifiableSet(keys);
+    }
+
+    /**
+     * @param maxEstimatedBytes size budget in UTF-8 bytes; must not be negative
+     * @throws IllegalArgumentException if <code>maxEstimatedBytes</code> is negative
+     */
+    public void setMaxEstimatedBytes(int maxEstimatedBytes) {
+        if (maxEstimatedBytes < 0) {
+            //StandardWriteFilter rejects negative sizes; report it here, where
+            //the bad value is supplied, instead of later in newInstance()
+            throw new IllegalArgumentException(
+                    "maxEstimatedBytes must be >= 0: " + maxEstimatedBytes);
+        }
+        this.maxEstimatedBytes = maxEstimatedBytes;
+    }
+
+    /**
+     * @param includeEmpty whether the created filters should let empty values through
+     */
+    public void setIncludeEmpty(boolean includeEmpty) {
+        this.includeEmpty = includeEmpty;
+    }
+
+    @Override
+    public String toString() {
+        return "StandardWriteFilterFactory{" + "includeFields=" + includeFields +
+                ", maxEstimatedBytes=" + maxEstimatedBytes + ", includeEmpty=" + includeEmpty + '}';
+    }
+
+    /**
+     * @return the size budget, in UTF-8 bytes, that is passed to created filters
+     */
+    public int getMaxEstimatedBytes() {
+        return maxEstimatedBytes;
+    }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index f6025c3..ea08f75 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -81,6 +81,10 @@ public interface TikaCoreProperties {
*/
Property TIKA_META_EXCEPTION_WARNING =
Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "warn");
+
+ Property METADATA_LIMIT_REACHED =
+ Property.internalBoolean(TIKA_META_EXCEPTION_PREFIX + "metadata_limit_reached");
+
/**
* Use this to store exceptions caught while trying to read the
* stream of an embedded resource. Do not use this if there is
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index 099536b..7650b23 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -129,6 +129,9 @@ public class AutoDetectParser extends CompositeParser {
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
ParseContext context) throws IOException, SAXException, TikaException {
+ if (autoDetectParserConfig.getMetadataWriteFilterFactory() != null) {
+ metadata.setMetadataWriteFilter(autoDetectParserConfig.getMetadataWriteFilterFactory().newInstance());
+ }
TemporaryResources tmp = new TemporaryResources();
try {
TikaInputStream tis = TikaInputStream.get(stream, tmp);
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
index 32ae7c4..1115365 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
@@ -23,6 +23,7 @@ import org.w3c.dom.Element;
import org.apache.tika.config.ConfigBase;
import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.metadata.MetadataWriteFilterFactory;
/**
* This config object can be used to tune how conservative we want to be
@@ -69,6 +70,8 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
*/
private Integer maximumPackageEntryDepth = null;
+ private MetadataWriteFilterFactory metadataWriteFilterFactory = null;
+
/**
* Creates a SecureContentHandlerConfig using the passed in parameters.
*
@@ -131,5 +134,13 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
public void setMaximumPackageEntryDepth(int maximumPackageEntryDepth) {
this.maximumPackageEntryDepth = maximumPackageEntryDepth;
}
+
+    /**
+     * @return the configured factory for metadata write filters, or
+     * <code>null</code> if none has been set
+     */
+    public MetadataWriteFilterFactory getMetadataWriteFilterFactory() {
+        return metadataWriteFilterFactory;
+    }
+
+ /**
+ * Sets the factory that AutoDetectParser uses to create a write filter
+ * for the Metadata object passed to a parse.
+ *
+ * @param metadataWriteFilterFactory the factory to use; with <code>null</code>
+ * no write filter is installed by the parser
+ */
+ public void setMetadataWriteFilterFactory(MetadataWriteFilterFactory metadataWriteFilterFactory) {
+ this.metadataWriteFilterFactory = metadataWriteFilterFactory;
+ }
}
diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
index 5c8cbfd..2f213ff 100644
--- a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
+++ b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
@@ -374,7 +374,6 @@ public class TikaConfigTest extends AbstractTikaConfigTest {
});
}
-
@Test
public void testTimesInitiated() throws Exception {
//this prevents multi-threading tests, but we aren't doing that now...
diff --git a/tika-core/src/test/java/org/apache/tika/metadata/MetadataWriteFilterTest.java b/tika-core/src/test/java/org/apache/tika/metadata/MetadataWriteFilterTest.java
new file mode 100644
index 0000000..54175ac
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/metadata/MetadataWriteFilterTest.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNull;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.config.TikaConfigTest;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.AutoDetectParserConfig;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Checks that a {@link StandardWriteFilterFactory} loaded from a tika-config file
+ * limits the size of the metadata and, optionally, the fields that are kept.
+ */
+public class MetadataWriteFilterTest extends TikaTest {
+
+    private static final String CREATOR_ELEMENT =
+            "<metadata action=\"add\" name=\"dc:creator\">01234567890123456789</metadata>";
+
+    @Test
+    public void testMetadataFactoryConfig() throws Exception {
+        AutoDetectParser parser = loadParser("TIKA-3695.xml");
+        Metadata metadata = parseToSingleMetadata(parser, buildMock(""), new Metadata());
+
+        String[] creators = metadata.getValues("dc:creator");
+        assertEquals(9, creators.length);
+        assertEquals("0123456", creators[8]);
+        assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30);
+        assertEquals("true", metadata.get(TikaCoreProperties.METADATA_LIMIT_REACHED));
+    }
+
+    @Test
+    public void testMetadataFactoryFieldsConfig() throws Exception {
+        AutoDetectParser parser = loadParser("TIKA-3695-fields.xml");
+        String leading =
+                "<metadata action=\"add\" name=\"dc:subject\">this is not a title</metadata>" +
+                        "<metadata action=\"add\" name=\"dc:title\">this is a title</metadata>";
+
+        //values that are present before the parser installs the filter
+        Metadata preset = new Metadata();
+        preset.add("dc:creator", "abcdefghijabcdefghij");
+        preset.add("not-allowed", "not-allowed");
+
+        Metadata metadata = parseToSingleMetadata(parser, buildMock(leading), preset);
+        //test that this was removed during the filter existing stage
+        assertNull(metadata.get("not-allowed"));
+        //test that this was not allowed because it isn't in the "include" list
+        assertNull(metadata.get("dc:subject"));
+
+        String[] creators = metadata.getValues("dc:creator");
+        assertEquals("abcdefghijabcdefghij", creators[0]);
+
+        //more creators fit here than in the other test because fields such as
+        //X-TIKA:Parsed-By are filtered out and do not use up the budget
+        assertEquals(12, creators.length);
+        assertEquals("012345", creators[11]);
+        assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30);
+        assertEquals("true", metadata.get(TikaCoreProperties.METADATA_LIMIT_REACHED));
+    }
+
+    /**
+     * Loads the given config, verifies the configured byte limit and
+     * returns a parser that uses the config.
+     */
+    private AutoDetectParser loadParser(String configName) throws Exception {
+        TikaConfig tikaConfig = new TikaConfig(TikaConfigTest.class.getResourceAsStream(configName));
+        AutoDetectParserConfig config = tikaConfig.getAutoDetectParserConfig();
+        MetadataWriteFilterFactory factory = config.getMetadataWriteFilterFactory();
+        assertEquals(241, ((StandardWriteFilterFactory) factory).getMaxEstimatedBytes());
+        return new AutoDetectParser(tikaConfig);
+    }
+
+    /**
+     * Builds the mock document: the given leading elements, twenty dc:creator
+     * values of twenty bytes each and thirty " hello " paragraphs.
+     */
+    private String buildMock(String leadingElements) {
+        StringBuilder xml = new StringBuilder();
+        xml.append("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>").append("<mock>");
+        xml.append(leadingElements);
+        for (int i = 0; i < 20; i++) {
+            xml.append(CREATOR_ELEMENT);
+        }
+        xml.append("<write element=\"p\" times=\"30\"> hello </write>\n");
+        xml.append("</mock>");
+        return xml.toString();
+    }
+
+    /**
+     * Parses the mock document into the given metadata object and returns
+     * the single metadata object that the parse produced.
+     */
+    private Metadata parseToSingleMetadata(AutoDetectParser parser, String mock,
+                                           Metadata metadata) throws Exception {
+        List<Metadata> metadataList =
+                getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)),
+                        parser, metadata, new ParseContext(), true);
+        assertEquals(1, metadataList.size());
+        return metadataList.get(0);
+    }
+}
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-fields.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-fields.xml
new file mode 100644
index 0000000..26e6fae
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695-fields.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ </parsers>
+ <autoDetectParserConfig>
+ <params>
+ <spoolToDisk>12345</spoolToDisk>
+ <outputThreshold>6789</outputThreshold>
+ </params>
+ <metadataWriteFilterFactory class="org.apache.tika.metadata.StandardWriteFilterFactory">
+ <params>
+ <maxEstimatedBytes>241</maxEstimatedBytes>
+ <includeFields>
+ <field>dc:creator</field>
+ <field>dc:title</field>
+ </includeFields>
+ </params>
+ </metadataWriteFilterFactory>
+ </autoDetectParserConfig>
+</properties>
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695.xml
new file mode 100644
index 0000000..86ebe05
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-3695.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser"/>
+ </parsers>
+ <autoDetectParserConfig>
+ <params>
+ <spoolToDisk>12345</spoolToDisk>
+ <outputThreshold>6789</outputThreshold>
+ </params>
+ <metadataWriteFilterFactory class="org.apache.tika.metadata.StandardWriteFilterFactory">
+ <params>
+ <maxEstimatedBytes>241</maxEstimatedBytes>
+ </params>
+ </metadataWriteFilterFactory>
+ </autoDetectParserConfig>
+</properties>